diff --git a/.gitattributes b/.gitattributes index 14fd1daeccb758c4fa9beb076130b4364e760d1b..3506abb69cbfd2afbb4fac14af8926e75c846185 100644 --- a/.gitattributes +++ b/.gitattributes @@ -152,3 +152,5 @@ tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/_ .venv/lib/python3.11/site-packages/nvidia/cufft/lib/libcufft.so.11 filter=lfs diff=lfs merge=lfs -text .venv/lib/python3.11/site-packages/torchgen/__pycache__/model.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text .venv/lib/python3.11/site-packages/mistral_common/data/tekken_240911.json filter=lfs diff=lfs merge=lfs -text +.venv/lib/python3.11/site-packages/ray/data/__pycache__/dataset.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text +.venv/lib/python3.11/site-packages/ray/data/__pycache__/read_api.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text diff --git a/.venv/lib/python3.11/site-packages/ray/data/__pycache__/dataset.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/__pycache__/dataset.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..870a0747579312d2f3312ac1b311026efe32c129 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/__pycache__/dataset.cpython-311.pyc @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0b1a74e1674205ec83807b353da73daa79d781531cd64ecbd818fd5438ec680 +size 255996 diff --git a/.venv/lib/python3.11/site-packages/ray/data/__pycache__/read_api.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/__pycache__/read_api.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..910f857895c13a479c5b69215eebf2b30290bcf9 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/__pycache__/read_api.cpython-311.pyc @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a217bcdb2fd53d64e0014e4fd153627ade902228eadc09fe7df65ee93c07bc05 +size 160644 diff --git 
a/.venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/__init__.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..29c77f6219805d9ca6f73284a3e2a0f9dfeed269 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/__init__.py @@ -0,0 +1,3 @@ +from ray.data._internal.block_batching.block_batching import batch_blocks + +__all__ = ["batch_blocks"] diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5542241a2c1020046e1542188f0c393b406b69cb Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/__pycache__/block_batching.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/__pycache__/block_batching.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0b136d6e1093a7340af878ab930c65c9df0be8c6 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/__pycache__/block_batching.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/__pycache__/interfaces.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/__pycache__/interfaces.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..327b27fda5d9394bb4502b7e36eb4b0b97539d53 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/__pycache__/interfaces.cpython-311.pyc differ diff --git 
a/.venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/__pycache__/iter_batches.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/__pycache__/iter_batches.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dc9f929b87e420e6a37ce321cca7a4c1d1b8b2c7 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/__pycache__/iter_batches.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/__pycache__/util.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/__pycache__/util.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1f52e0e35c42ff0db2225ed997008f3de0105f4b Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/__pycache__/util.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/iter_batches.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/iter_batches.py new file mode 100644 index 0000000000000000000000000000000000000000..39bd5f4ad2dabf9f9f0c81649854e790ab0ae81a --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/iter_batches.py @@ -0,0 +1,322 @@ +import collections +from contextlib import nullcontext +from typing import Any, Callable, Dict, Iterator, Optional + +import ray +from ray.data._internal.block_batching.interfaces import Batch, BlockPrefetcher +from ray.data._internal.block_batching.util import ( + ActorBlockPrefetcher, + WaitBlockPrefetcher, + blocks_to_batches, + collate, + extract_data_from_batch, + finalize_batches, + format_batches, + resolve_block_refs, +) +from ray.data._internal.execution.interfaces.ref_bundle import RefBundle +from ray.data._internal.memory_tracing import trace_deallocation +from ray.data._internal.stats import DatasetStats +from 
ray.data._internal.util import make_async_gen +from ray.data.block import Block, DataBatch +from ray.data.context import DataContext +from ray.types import ObjectRef + + +def iter_batches( + ref_bundles: Iterator[RefBundle], + *, + stats: Optional[DatasetStats] = None, + clear_block_after_read: bool = False, + batch_size: Optional[int] = None, + batch_format: Optional[str] = "default", + drop_last: bool = False, + collate_fn: Optional[Callable[[DataBatch], Any]] = None, + finalize_fn: Optional[Callable[[Any], Any]] = None, + shuffle_buffer_min_size: Optional[int] = None, + shuffle_seed: Optional[int] = None, + ensure_copy: bool = False, + prefetch_batches: int = 1, +) -> Iterator[DataBatch]: + """Create formatted batches of data from an iterator of block object references and + corresponding metadata. + + This takes a block iterator and creates batch_size batches, slicing, + unioning, shuffling, prefetching, and formatting blocks as needed. + + The algorithm uses both pipeline parallelism and data parallelism: + + If prefetch_batches=2, these are all the batches in flight: + + [User thread] trains on Batch 0 + - [Fetch thread] Batch 1 finalization + move to output queue + - [Worker thread 1] Batch 2 formatting + collating + - [Worker thread 2] Batch 3 formatting + collating + - [Raylet] Batches 4 + 5 fetched to local object store memory + + At any point in time there are prefetch_batches+1 batches in local heap memory. + And the next set of prefetch_batches in local object store memory. + + The actual steps are as follows: + + In a single async thread, do the following: + 1. Trigger Ray local prefetching of `prefetch_batches` worth of block object + references. + 2. Resolve (i.e. call `ray.get()`) on the block references. + 3. Perform the necessary batch slicing to construct full batches, possibly + shuffling if necessary. + 4. Then, in a threadpool consisting of `prefetch_batches` threads: + a. Format the batches to the provided batch format. + b. 
Apply the collate function. + 5. Finalize each of the collated batches + 6. Fetch outputs from the threadpool, maintaining order of the batches. + + Args: + ref_bundles: An iterator over RefBundles. + stats: DatasetStats object to record timing and other statistics. + clear_block_after_read: Whether to clear the block from object store + manually (i.e. without waiting for Python's automatic GC) after it + is read. Doing so will reclaim memory faster and hence reduce the + memory footprint. However, the caller has to ensure the safety, i.e. + the block will never be accessed again. + batch_size: Record batch size, or None to let the system pick. + batch_format: The format in which to return each batch. + Specify "default" to use the current block format (promoting + Arrow to pandas automatically), "pandas" to + select ``pandas.DataFrame`` or "pyarrow" to select + ``pyarrow.Table``, or None to use entire blocks + as batches. Default is "default". + drop_last: Whether to drop the last batch if it's incomplete. + collate_fn: A function to apply to each data batch before returning it. + finalize_fn: A function to apply to each data batch after it has been collated. + This function is not run in a threadpool so it can be used for + memory-intensive operations such as GPU preloading. + shuffle_buffer_min_size: If non-None, the data will be randomly shuffled using a + local in-memory shuffle buffer, and this value will serve as the minimum + number of rows that must be in the local in-memory shuffle buffer in order + to yield a batch. + shuffle_seed: The seed to use for the local random shuffle. + ensure_copy: Whether batches are always copied from the underlying base + blocks (not zero-copy views). + prefetch_batches: The number of batches to fetch ahead of the current batch to + process. If set to greater than 0, a separate thread will be used to fetch + the specified amount of formatted batches from blocks. 
This improves + performance for non-CPU bound UDFs, allowing batch fetching compute and + formatting to be overlapped with the UDF. Defaults to 1. + + Returns: + An iterator over record batches. + """ + context = DataContext.get_current() + + if ( + prefetch_batches > 0 + and context.actor_prefetcher_enabled + and not ray.util.client.ray.is_connected() + ): + prefetcher = ActorBlockPrefetcher() + else: + prefetcher = WaitBlockPrefetcher() + + eager_free = clear_block_after_read and DataContext.get_current().eager_free + + def _async_iter_batches( + ref_bundles: Iterator[RefBundle], + ) -> Iterator[DataBatch]: + # Step 1: Prefetch logical batches locally. + block_iter = prefetch_batches_locally( + ref_bundles=ref_bundles, + prefetcher=prefetcher, + num_batches_to_prefetch=prefetch_batches, + batch_size=batch_size, + eager_free=eager_free, + ) + + # Step 2: Resolve the blocks. + block_iter = resolve_block_refs(block_ref_iter=block_iter, stats=stats) + + # Step 3: Batch and shuffle the resolved blocks. + batch_iter = blocks_to_batches( + block_iter=block_iter, + stats=stats, + batch_size=batch_size, + drop_last=drop_last, + shuffle_buffer_min_size=shuffle_buffer_min_size, + shuffle_seed=shuffle_seed, + ensure_copy=ensure_copy, + ) + + # Step 4: Use a threadpool for formatting and collation. + batch_iter = _format_in_threadpool( + batch_iter, + stats=stats, + batch_format=batch_format, + collate_fn=collate_fn, + num_threadpool_workers=prefetch_batches, + ) + + # Step 5: Finalize each batch. + if finalize_fn is not None: + batch_iter = finalize_batches( + batch_iter, finalize_fn=finalize_fn, stats=stats + ) + + # Step 6: Restore original order. + batch_iter: Iterator[Batch] = restore_original_order(batch_iter) + + yield from extract_data_from_batch(batch_iter) + + # Run everything in a separate thread to not block the main thread when waiting + # for streaming results. 
+ async_batch_iter = make_async_gen( + ref_bundles, fn=_async_iter_batches, num_workers=1 + ) + + while True: + with stats.iter_total_blocked_s.timer() if stats else nullcontext(): + try: + next_batch = next(async_batch_iter) + except StopIteration: + break + with stats.iter_user_s.timer() if stats else nullcontext(): + yield next_batch + + +def _format_in_threadpool( + batch_iter: Iterator[Batch], + stats: DatasetStats, + batch_format: Optional[str], + collate_fn: Optional[Callable[[DataBatch], Any]], + num_threadpool_workers: int, +) -> Iterator[Batch]: + """Executes the batching, formatting, and collation logic in a threadpool. + + Args: + logical_batch_iterator: An iterator over logical batches. + stats: DatasetStats object to record timing and other statistics. + batch_format: The format in which to return each batch. + Specify "default" to use the current block format (promoting + Arrow to pandas automatically), "pandas" to + select ``pandas.DataFrame`` or "pyarrow" to select + ``pyarrow.Table``, or None to use entire blocks + as batches. + collate_fn: A function to apply to each data batch before returning it. + num_threadpool_workers: The number of threads to use in the threadpool. + """ + + def threadpool_computations_format_collate( + batch_iter: Iterator[Batch], + ) -> Iterator[Batch]: + # Step 4a: Format the batches. + formatted_batch_iter = format_batches( + batch_iter, batch_format=batch_format, stats=stats + ) + + # Step 4b: Apply the collate function if applicable. 
+ if collate_fn is not None: + formatted_batch_iter = collate( + formatted_batch_iter, collate_fn=collate_fn, stats=stats + ) + yield from formatted_batch_iter + + if num_threadpool_workers > 0: + collated_iter = make_async_gen( + base_iterator=batch_iter, + fn=threadpool_computations_format_collate, + num_workers=num_threadpool_workers, + ) + else: + collated_iter = threadpool_computations_format_collate(batch_iter) + return collated_iter + + +def prefetch_batches_locally( + ref_bundles: Iterator[RefBundle], + prefetcher: BlockPrefetcher, + num_batches_to_prefetch: int, + batch_size: Optional[int], + eager_free: bool = False, +) -> Iterator[ObjectRef[Block]]: + """Given an iterator of batched RefBundles, returns an iterator over the + corresponding block references while prefetching `num_batches_to_prefetch` + batches in advance. + + Args: + ref_bundles: An iterator over batched RefBundles. + prefetcher: The prefetcher to use. + num_batches_to_prefetch: The number of batches to prefetch ahead of the + current batch during the scan. + batch_size: User specified batch size, or None to let the system pick. + eager_free: Whether to eagerly free the object reference from the object store. + """ + + sliding_window = collections.deque() + current_window_size = 0 + + if num_batches_to_prefetch <= 0: + for ref_bundle in ref_bundles: + for block_ref in ref_bundle.block_refs: + yield block_ref + return + + if batch_size is not None: + num_rows_to_prefetch = num_batches_to_prefetch * batch_size + else: + num_rows_to_prefetch = None + + # Create and fetch the initial window. + # Stop adding if the number of rows in this window is greater than requested + # batch size, or if the batch size is None and the number of blocks in this window + # is greater than requested batches to prefetch. 
+ while (batch_size is not None and current_window_size < num_rows_to_prefetch) or ( + batch_size is None and len(sliding_window) < num_batches_to_prefetch + ): + try: + next_ref_bundle = next(ref_bundles) + sliding_window.extend(next_ref_bundle.blocks) + current_window_size += next_ref_bundle.num_rows() + except StopIteration: + break + + prefetcher.prefetch_blocks([block_ref for block_ref, _ in list(sliding_window)]) + + while sliding_window: + block_ref, metadata = sliding_window.popleft() + current_window_size -= metadata.num_rows + if batch_size is None or current_window_size < num_rows_to_prefetch: + try: + next_ref_bundle = next(ref_bundles) + for block_ref_and_md in next_ref_bundle.blocks: + sliding_window.append(block_ref_and_md) + current_window_size += block_ref_and_md[1].num_rows + prefetcher.prefetch_blocks( + [block_ref for block_ref, _ in list(sliding_window)] + ) + except StopIteration: + pass + yield block_ref + trace_deallocation(block_ref, loc="iter_batches", free=eager_free) + prefetcher.stop() + + +def restore_original_order(batch_iter: Iterator[Batch]) -> Iterator[Batch]: + """Restores the original order of the provided `batch_iter` + + This function will yield items from `base_iterator` in the correct order based on + each batch's batch_idx. All indexes are expected to be unique. + + `batch_iter` is expected to not have any missing indexes. All indexes from 0 to len + (base_iterator) must be present. 
+ """ + next_index_required = 0 + buffer: Dict[int, Batch] = {} + for batch in batch_iter: + assert batch.batch_idx not in buffer + buffer[batch.batch_idx] = batch + while next_index_required in buffer: + yield buffer.pop(next_index_required) + next_index_required += 1 + + while next_index_required in buffer: + yield buffer.pop(next_index_required) + next_index_required += 1 diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/util.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/util.py new file mode 100644 index 0000000000000000000000000000000000000000..4cea60abca8011ade373d0ed5ba54e060aa57b7e --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/util.py @@ -0,0 +1,293 @@ +import logging +import threading +from contextlib import nullcontext +from typing import Any, Callable, Iterator, List, Optional, Tuple + +import ray +from ray.actor import ActorHandle +from ray.data._internal.batcher import Batcher, ShufflingBatcher +from ray.data._internal.block_batching.interfaces import ( + Batch, + BlockPrefetcher, + CollatedBatch, +) +from ray.data._internal.stats import DatasetStats +from ray.data.block import Block, BlockAccessor, DataBatch +from ray.types import ObjectRef +from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy + +logger = logging.getLogger(__name__) + + +def _calculate_ref_hits(refs: List[ObjectRef[Any]]) -> Tuple[int, int, int]: + """Given a list of object references, returns how many are already on the local + node, how many require fetching from another node, and how many have unknown + locations. 
If `DataContext.get_current().enable_get_object_locations_for_metrics` is + False, this will return `(-1, -1, -1)` as getting object locations is disabled.""" + current_node_id = ray.get_runtime_context().get_node_id() + + ctx = ray.data.context.DataContext.get_current() + if ctx.enable_get_object_locations_for_metrics: + locs = ray.experimental.get_object_locations(refs) + nodes: List[List[str]] = [loc["node_ids"] for loc in locs.values()] + hits = sum(current_node_id in node_ids for node_ids in nodes) + unknowns = sum(1 for node_ids in nodes if not node_ids) + misses = len(nodes) - hits - unknowns + return hits, misses, unknowns + + return -1, -1, -1 + + +def resolve_block_refs( + block_ref_iter: Iterator[ObjectRef[Block]], + stats: Optional[DatasetStats] = None, +) -> Iterator[Block]: + """Resolves the block references for each logical batch. + + Args: + block_ref_iter: An iterator over block object references. + stats: An optional stats object to recording block hits and misses. + """ + hits = 0 + misses = 0 + unknowns = 0 + + for block_ref in block_ref_iter: + current_hit, current_miss, current_unknown = _calculate_ref_hits([block_ref]) + hits += current_hit + misses += current_miss + unknowns += current_unknown + + # TODO(amogkam): Optimized further by batching multiple references in a single + # `ray.get()` call. + with stats.iter_get_s.timer() if stats else nullcontext(): + block = ray.get(block_ref) + yield block + + if stats: + stats.iter_blocks_local = hits + stats.iter_blocks_remote = misses + stats.iter_unknown_location = unknowns + + +def blocks_to_batches( + block_iter: Iterator[Block], + stats: Optional[DatasetStats] = None, + batch_size: Optional[int] = None, + drop_last: bool = False, + shuffle_buffer_min_size: Optional[int] = None, + shuffle_seed: Optional[int] = None, + ensure_copy: bool = False, +) -> Iterator[Batch]: + """Given an iterator over blocks, returns an iterator over blocks + of the appropriate bacth size. 
+ + If the shuffling configurations are specified, then the + output blocks contain shuffled data. + + Args: + block_iter: An iterator over blocks. + stats: Dataset stats object used to store block batching time. + batch_size: Record batch size, or None to let the system pick. + drop_last: Whether to drop the last batch if it's incomplete. + shuffle_buffer_min_size: If non-None, the data will be randomly shuffled + using a local in-memory shuffle buffer, and this value will serve as the + minimum number of rows that must be in the local in-memory shuffle buffer in + order to yield a batch. + shuffle_seed: The seed to use for the local random shuffle. + ensure_copy: Whether batches are always copied from the underlying base + blocks (not zero-copy views). + + Returns: + An iterator over blocks of the given size that are potentially shuffled. + """ + if shuffle_buffer_min_size is not None: + batcher = ShufflingBatcher( + batch_size=batch_size, + shuffle_buffer_min_size=shuffle_buffer_min_size, + shuffle_seed=shuffle_seed, + ) + else: + batcher = Batcher(batch_size=batch_size, ensure_copy=ensure_copy) + + def get_iter_next_batch_s_timer(): + return stats.iter_next_batch_s.timer() if stats else nullcontext() + + global_counter = 0 + + for block in block_iter: + batcher.add(block) + while batcher.has_batch(): + with get_iter_next_batch_s_timer(): + batch = batcher.next_batch() + yield Batch(global_counter, batch) + global_counter += 1 + + # Signal to the batcher that there are no more blocks to add. + batcher.done_adding() + + # Get any leftover batches in ShufflingBatcher. + while batcher.has_batch(): + with get_iter_next_batch_s_timer(): + batch = batcher.next_batch() + yield Batch(global_counter, batch) + global_counter += 1 + + # Get any remaining data. 
+ if not drop_last and batcher.has_any(): + with get_iter_next_batch_s_timer(): + batch = batcher.next_batch() + yield Batch(global_counter, batch) + global_counter += 1 + + +def format_batches( + block_iter: Iterator[Batch], + batch_format: Optional[str], + stats: Optional[DatasetStats] = None, +) -> Iterator[Batch]: + """Given an iterator of blocks, returns an iterator of formatted batches. + + Args: + block_iter: An iterator over blocks. + batch_format: The batch format to use. + stats: An optional stats object to record formatting times. + + Returns: + An iterator over batch index and the formatted batch. + """ + for batch in block_iter: + with stats.iter_format_batch_s.timer() if stats else nullcontext(): + formatted_batch = BlockAccessor.for_block(batch.data).to_batch_format( + batch_format + ) + yield Batch(batch.batch_idx, formatted_batch) + + +def collate( + batch_iter: Iterator[Batch], + collate_fn: Optional[Callable[[DataBatch], Any]], + stats: Optional[DatasetStats] = None, +) -> Iterator[CollatedBatch]: + """Returns an iterator with the provided collate_fn applied to items of the batch + iterator. + + Args: + batch_iter: An iterator over formatted batches. + collate_fn: A function to apply to each batch. + stats: An optional stats object to record formatting times. + """ + for batch in batch_iter: + with stats.iter_collate_batch_s.timer() if stats else nullcontext(): + collated_batch = collate_fn(batch.data) + yield CollatedBatch(batch.batch_idx, collated_batch) + + +def finalize_batches( + batch_iter: Iterator[CollatedBatch], + finalize_fn: Callable[[Any], Any], + stats: Optional[DatasetStats] = None, +) -> Iterator[CollatedBatch]: + """Returns an iterator with the provided finalize_fn applied to items of the batch + iterator. + + This is the same as `collate` except the input batches can be of type Any. + + Args: + batch_iter: An iterator over processed batches. + finalize_fn: A function to apply to each batch. 
+ stats: An optional stats object to record formatting times. + + Returns: + An iterator over batch index and the finalized batch. + """ + for batch in batch_iter: + with stats.iter_finalize_batch_s.timer() if stats else nullcontext(): + finalized_batch = finalize_fn(batch.data) + yield CollatedBatch(batch.batch_idx, finalized_batch) + + +def extract_data_from_batch(batch_iter: Iterator[Batch]) -> Iterator[Any]: + for batch in batch_iter: + yield batch.data + + +PREFETCHER_ACTOR_NAMESPACE = "ray.dataset" + + +class WaitBlockPrefetcher(BlockPrefetcher): + """Block prefetcher using ray.wait.""" + + def __init__(self): + self._blocks = [] + self._stopped = False + self._condition = threading.Condition() + self._thread = threading.Thread( + target=self._run, + name="Prefetcher", + daemon=True, + ) + self._thread.start() + + def _run(self): + while True: + try: + blocks_to_wait = [] + with self._condition: + if len(self._blocks) > 0: + blocks_to_wait, self._blocks = self._blocks[:], [] + else: + if self._stopped: + return + blocks_to_wait = [] + self._condition.wait() + if len(blocks_to_wait) > 0: + ray.wait(blocks_to_wait, num_returns=1, fetch_local=True) + except Exception: + logger.exception("Error in prefetcher thread.") + + def prefetch_blocks(self, blocks: List[ObjectRef[Block]]): + with self._condition: + if self._stopped: + raise RuntimeError("Prefetcher is stopped.") + self._blocks = blocks + self._condition.notify() + + def stop(self): + with self._condition: + if self._stopped: + return + self._stopped = True + self._condition.notify() + + def __del__(self): + self.stop() + + +class ActorBlockPrefetcher(BlockPrefetcher): + """Block prefetcher using a local actor.""" + + def __init__(self): + self.prefetch_actor = self._get_or_create_actor_prefetcher() + + @staticmethod + def _get_or_create_actor_prefetcher() -> "ActorHandle": + node_id = ray.get_runtime_context().get_node_id() + actor_name = f"dataset-block-prefetcher-{node_id}" + return 
_BlockPretcher.options( + scheduling_strategy=NodeAffinitySchedulingStrategy(node_id, soft=False), + name=actor_name, + namespace=PREFETCHER_ACTOR_NAMESPACE, + get_if_exists=True, + ).remote() + + def prefetch_blocks(self, blocks: List[ObjectRef[Block]]): + self.prefetch_actor.prefetch.remote(*blocks) + + +@ray.remote(num_cpus=0) +class _BlockPretcher: + """Helper actor that prefetches blocks asynchronously.""" + + def prefetch(self, *blocks) -> None: + pass diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/__init__.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..651554ab677a4fb0323e1649b0facf916a43f2f0 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/autoscaler/__init__.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/autoscaler/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c167c14fa1f33b5aeaa59c1b0b4aa447fa7de114 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/autoscaler/__init__.py @@ -0,0 +1,15 @@ +from .autoscaler import Autoscaler +from .autoscaling_actor_pool import AutoscalingActorPool +from .default_autoscaler import DefaultAutoscaler + + +def create_autoscaler(topology, resource_manager, execution_id): + return DefaultAutoscaler(topology, resource_manager, execution_id) + + +__all__ = [ + "Autoscaler", + "DefaultAutoscaler", + "create_autoscaler", + 
"AutoscalingActorPool", +] diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/autoscaler/__pycache__/autoscaling_actor_pool.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/autoscaler/__pycache__/autoscaling_actor_pool.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..456fdcb5e97bd007f40e213fe96e67f649ffc6cb Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/autoscaler/__pycache__/autoscaling_actor_pool.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/autoscaler/autoscaler.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/autoscaler/autoscaler.py new file mode 100644 index 0000000000000000000000000000000000000000..c4f54584d6c534592948a287d0461bd9294afbb4 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/autoscaler/autoscaler.py @@ -0,0 +1,44 @@ +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING + +from ray.data._internal.execution.interfaces.execution_options import ExecutionResources +from ray.util.annotations import DeveloperAPI + +if TYPE_CHECKING: + from ray.data._internal.execution.resource_manager import ResourceManager + from ray.data._internal.execution.streaming_executor_state import Topology + + +@DeveloperAPI +class Autoscaler(ABC): + """Abstract interface for Ray Data autoscaler.""" + + def __init__( + self, + topology: "Topology", + resource_manager: "ResourceManager", + execution_id: str, + ): + self._topology = topology + self._resource_manager = resource_manager + self._execution_id = execution_id + + @abstractmethod + def try_trigger_scaling(self): + """Try trigger autoscaling. + + This method will be called each time when StreamingExecutor makes + a scheduling decision. A subclass should override this method to + handle the autoscaling of both the cluster and `AutoscalingActorPool`s. 
+ """ + ... + + @abstractmethod + def on_executor_shutdown(self): + """Callback when the StreamingExecutor is shutting down.""" + ... + + @abstractmethod + def get_total_resources(self) -> ExecutionResources: + """Get the total resources that are available to this data execution.""" + ... diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/autoscaler/autoscaling_actor_pool.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/autoscaler/autoscaling_actor_pool.py new file mode 100644 index 0000000000000000000000000000000000000000..2d8e1bd40b5718c6594b9c47a0538da8e06693dc --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/autoscaler/autoscaling_actor_pool.py @@ -0,0 +1,94 @@ +from abc import ABC, abstractmethod + +from ray.data._internal.execution.interfaces.execution_options import ExecutionResources +from ray.util.annotations import DeveloperAPI + + +@DeveloperAPI +class AutoscalingActorPool(ABC): + """Abstract interface of an autoscaling actor pool. + + A `PhysicalOperator` can manage one or more `AutoscalingActorPool`s. + `Autoscaler` is responsible for deciding autoscaling of these actor + pools. + """ + + @abstractmethod + def min_size(self) -> int: + """Min size of the actor pool.""" + ... + + @abstractmethod + def max_size(self) -> int: + """Max size of the actor pool.""" + ... + + @abstractmethod + def current_size(self) -> int: + """Current size of the actor pool.""" + ... + + @abstractmethod + def num_running_actors(self) -> int: + """Number of running actors.""" + ... + + @abstractmethod + def num_active_actors(self) -> int: + """Number of actors with at least one active task.""" + ... + + @abstractmethod + def num_pending_actors(self) -> int: + """Number of actors pending creation.""" + ... + + @abstractmethod + def max_tasks_in_flight_per_actor(self) -> int: + """Max number of in-flight tasks per actor.""" + ... 
+ + @abstractmethod + def current_in_flight_tasks(self) -> int: + """Number of current in-flight tasks.""" + ... + + def num_total_task_slots(self) -> int: + """Total number of task slots.""" + return self.max_tasks_in_flight_per_actor() * self.current_size() + + def num_free_task_slots(self) -> int: + """Number of free slots to run tasks.""" + return ( + self.max_tasks_in_flight_per_actor() * self.current_size() + - self.current_in_flight_tasks() + ) + + @abstractmethod + def scale_up(self, num_actors: int) -> int: + """Request the actor pool to scale up by the given number of actors. + + The number of actually added actors may be less than the requested + number. + + Returns: + The number of actors actually added. + """ + ... + + @abstractmethod + def scale_down(self, num_actors: int) -> int: + """Request actor pool to scale down by the given number of actors. + + The number of actually removed actors may be less than the requested + number. + + Returns: + The number of actors actually removed. + """ + ... + + @abstractmethod + def per_actor_resource_usage(self) -> ExecutionResources: + """Per actor resource usage.""" + ... 
diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/autoscaler/default_autoscaler.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/autoscaler/default_autoscaler.py new file mode 100644 index 0000000000000000000000000000000000000000..778505f9fc905123f1ee11af26e1c31cc89f385e --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/autoscaler/default_autoscaler.py @@ -0,0 +1,188 @@ +import math +import time +from typing import TYPE_CHECKING, Dict + +import ray +from .autoscaler import Autoscaler +from .autoscaling_actor_pool import AutoscalingActorPool +from ray.data._internal.execution.autoscaling_requester import ( + get_or_create_autoscaling_requester_actor, +) +from ray.data._internal.execution.interfaces.execution_options import ExecutionResources + +if TYPE_CHECKING: + from ray.data._internal.execution.interfaces import PhysicalOperator + from ray.data._internal.execution.resource_manager import ResourceManager + from ray.data._internal.execution.streaming_executor_state import OpState, Topology + + +class DefaultAutoscaler(Autoscaler): + + # Default threshold of actor pool utilization to trigger scaling up. + DEFAULT_ACTOR_POOL_SCALING_UP_THRESHOLD: float = 0.8 + # Default threshold of actor pool utilization to trigger scaling down. + DEFAULT_ACTOR_POOL_SCALING_DOWN_THRESHOLD: float = 0.5 + + # Min number of seconds between two autoscaling requests. 
+ MIN_GAP_BETWEEN_AUTOSCALING_REQUESTS = 20 + + def __init__( + self, + topology: "Topology", + resource_manager: "ResourceManager", + execution_id: str, + actor_pool_scaling_up_threshold: float = DEFAULT_ACTOR_POOL_SCALING_UP_THRESHOLD, # noqa: E501 + actor_pool_scaling_down_threshold: float = DEFAULT_ACTOR_POOL_SCALING_DOWN_THRESHOLD, # noqa: E501 + ): + self._actor_pool_scaling_up_threshold = actor_pool_scaling_up_threshold + self._actor_pool_scaling_down_threshold = actor_pool_scaling_down_threshold + # Last time when a request was sent to Ray's autoscaler. + self._last_request_time = 0 + super().__init__(topology, resource_manager, execution_id) + + def try_trigger_scaling(self): + self._try_scale_up_cluster() + self._try_scale_up_or_down_actor_pool() + + def _calculate_actor_pool_util(self, actor_pool: AutoscalingActorPool): + """Calculate the utilization of the given actor pool.""" + if actor_pool.current_size() == 0: + return 0 + else: + return actor_pool.num_active_actors() / actor_pool.current_size() + + def _actor_pool_should_scale_up( + self, + actor_pool: AutoscalingActorPool, + op: "PhysicalOperator", + op_state: "OpState", + ): + # Do not scale up, if the op is completed or no more inputs are coming. + if op.completed() or (op._inputs_complete and op.internal_queue_size() == 0): + return False + if actor_pool.current_size() < actor_pool.min_size(): + # Scale up, if the actor pool is below min size. + return True + elif actor_pool.current_size() >= actor_pool.max_size(): + # Do not scale up, if the actor pool is already at max size. + return False + # Do not scale up, if the op does not have more resources. + if not op_state._scheduling_status.under_resource_limits: + return False + # Do not scale up, if the op has enough free slots for the existing inputs. + if op_state.num_queued() <= actor_pool.num_free_task_slots(): + return False + # Determine whether to scale up based on the actor pool utilization. 
+ util = self._calculate_actor_pool_util(actor_pool) + return util > self._actor_pool_scaling_up_threshold + + def _actor_pool_should_scale_down( + self, + actor_pool: AutoscalingActorPool, + op: "PhysicalOperator", + ): + # Scale down, if the op is completed or no more inputs are coming. + if op.completed() or (op._inputs_complete and op.internal_queue_size() == 0): + return True + if actor_pool.current_size() > actor_pool.max_size(): + # Scale down, if the actor pool is above max size. + return True + elif actor_pool.current_size() <= actor_pool.min_size(): + # Do not scale down, if the actor pool is already at min size. + return False + # Determine whether to scale down based on the actor pool utilization. + util = self._calculate_actor_pool_util(actor_pool) + return util < self._actor_pool_scaling_down_threshold + + def _try_scale_up_or_down_actor_pool(self): + for op, state in self._topology.items(): + actor_pools = op.get_autoscaling_actor_pools() + for actor_pool in actor_pools: + while True: + # Try to scale up or down the actor pool. + should_scale_up = self._actor_pool_should_scale_up( + actor_pool, + op, + state, + ) + should_scale_down = self._actor_pool_should_scale_down( + actor_pool, op + ) + if should_scale_up and not should_scale_down: + if actor_pool.scale_up(1) == 0: + break + elif should_scale_down and not should_scale_up: + if actor_pool.scale_down(1) == 0: + break + else: + break + + def _try_scale_up_cluster(self): + """Try to scale up the cluster to accomodate the provided in-progress workload. + + This makes a resource request to Ray's autoscaler consisting of the current, + aggregate usage of all operators in the DAG + the incremental usage of all + operators that are ready for dispatch (i.e. that have inputs queued). If the + autoscaler were to grant this resource request, it would allow us to dispatch + one task for every ready operator. 
+ + Note that this resource request does not take the global resource limits or the + liveness policy into account; it only tries to make the existing resource usage + + one more task per ready operator feasible in the cluster. + """ + # Limit the frequency of autoscaling requests. + now = time.time() + if now - self._last_request_time < self.MIN_GAP_BETWEEN_AUTOSCALING_REQUESTS: + return + + # Scale up the cluster, if no ops are allowed to run, but there are still data + # in the input queues. + no_runnable_op = all( + op_state._scheduling_status.runnable is False + for _, op_state in self._topology.items() + ) + any_has_input = any( + op_state.num_queued() > 0 for _, op_state in self._topology.items() + ) + if not (no_runnable_op and any_has_input): + return + + self._last_request_time = now + + # Get resource usage for all ops + additional resources needed to launch one + # more task for each ready op. + resource_request = [] + + def to_bundle(resource: ExecutionResources) -> Dict: + req = {} + if resource.cpu: + req["CPU"] = math.ceil(resource.cpu) + if resource.gpu: + req["GPU"] = math.ceil(resource.gpu) + return req + + for op, state in self._topology.items(): + per_task_resource = op.incremental_resource_usage() + task_bundle = to_bundle(per_task_resource) + resource_request.extend([task_bundle] * op.num_active_tasks()) + # Only include incremental resource usage for ops that are ready for + # dispatch. + if state.num_queued() > 0: + # TODO(Clark): Scale up more aggressively by adding incremental resource + # usage for more than one bundle in the queue for this op? + resource_request.append(task_bundle) + + self._send_resource_request(resource_request) + + def _send_resource_request(self, resource_request): + # Make autoscaler resource request. 
+ actor = get_or_create_autoscaling_requester_actor() + actor.request_resources.remote(resource_request, self._execution_id) + + def on_executor_shutdown(self): + # Make request for zero resources to autoscaler for this execution. + actor = get_or_create_autoscaling_requester_actor() + actor.request_resources.remote({}, self._execution_id) + + def get_total_resources(self) -> ExecutionResources: + return ExecutionResources.from_resource_dict(ray.cluster_resources()) diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/autoscaling_requester.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/autoscaling_requester.py new file mode 100644 index 0000000000000000000000000000000000000000..512c3c16f488130f32ac3993bc4b24a84e5ac4fd --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/autoscaling_requester.py @@ -0,0 +1,131 @@ +import math +import threading +import time +from typing import Dict, List + +import ray +from ray.data.context import DataContext +from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy + +# Resource requests are considered stale after this number of seconds, and +# will be purged. +RESOURCE_REQUEST_TIMEOUT = 60 +PURGE_INTERVAL = RESOURCE_REQUEST_TIMEOUT * 2 + +# When the autoscaling is driven by memory pressure and there are abundant +# CPUs to support incremental CPUs needed to launch more tasks, we'll translate +# memory pressure into an artificial request of CPUs. The amount of CPUs we'll +# request is ARTIFICIAL_CPU_SCALING_FACTOR * ray.cluster_resources()["CPU"]. +ARTIFICIAL_CPU_SCALING_FACTOR = 1.2 + + +@ray.remote(num_cpus=0, max_restarts=-1, max_task_retries=-1) +class AutoscalingRequester: + """Actor to make resource requests to autoscaler for the datasets. + + The resource requests are set to timeout after RESOURCE_REQUEST_TIMEOUT seconds. 
+ For those live requests, we keep track of the last request made for each execution, + which overrides all previous requests it made; then sum the requested amounts + across all executions as the final request to the autoscaler. + """ + + def __init__(self): + # execution_id -> (List[Dict], expiration timestamp) + self._resource_requests = {} + # TTL for requests. + self._timeout = RESOURCE_REQUEST_TIMEOUT + + self._self_handle = ray.get_runtime_context().current_actor + + # Start a thread to purge expired requests periodically. + def purge_thread_run(): + while True: + time.sleep(PURGE_INTERVAL) + # Call purge_expired_requests() as an actor task, + # so we don't need to handle multi-threading. + ray.get(self._self_handle.purge_expired_requests.remote()) + + self._purge_thread = threading.Thread(target=purge_thread_run, daemon=True) + self._purge_thread.start() + + def purge_expired_requests(self): + self._purge() + ray.autoscaler.sdk.request_resources(bundles=self._aggregate_requests()) + + def request_resources(self, req: List[Dict], execution_id: str): + # Purge expired requests before making request to autoscaler. + self._purge() + # For the same execution_id, we track the latest resource request and + # the its expiration timestamp. + self._resource_requests[execution_id] = ( + req, + time.time() + self._timeout, + ) + # We aggregate the resource requests across all execution_id's to Ray + # autoscaler. + ray.autoscaler.sdk.request_resources(bundles=self._aggregate_requests()) + + def _purge(self): + # Purge requests that are stale. 
+ now = time.time() + for k, (_, t) in list(self._resource_requests.items()): + if t < now: + self._resource_requests.pop(k) + + def _aggregate_requests(self) -> List[Dict]: + req = [] + for _, (r, _) in self._resource_requests.items(): + req.extend(r) + + def get_cpus(req): + num_cpus = 0 + for r in req: + if "CPU" in r: + num_cpus += r["CPU"] + return num_cpus + + # Round up CPUs to exceed total cluster CPUs so it can actually upscale. + # This is to handle the issue where the autoscaling is driven by memory + # pressure (rather than CPUs) from streaming executor. In such case, simply + # asking for incremental CPUs (e.g. 1 CPU for each ready operator) may not + # actually be able to trigger autoscaling if existing CPUs in cluster can + # already satisfy the incremental CPUs request. + num_cpus = get_cpus(req) + if num_cpus > 0: + total = ray.cluster_resources() + if "CPU" in total and num_cpus <= total["CPU"]: + delta = ( + math.ceil(ARTIFICIAL_CPU_SCALING_FACTOR * total["CPU"]) - num_cpus + ) + req.extend([{"CPU": 1}] * delta) + + return req + + def _test_set_timeout(self, ttl): + """Set the timeout. This is for test only""" + self._timeout = ttl + + +# Creating/getting an actor from multiple threads is not safe. +# https://github.com/ray-project/ray/issues/41324 +_autoscaling_requester_lock: threading.RLock = threading.RLock() + + +def get_or_create_autoscaling_requester_actor(): + ctx = DataContext.get_current() + scheduling_strategy = ctx.scheduling_strategy + # Pin the stats actor to the local node so it fate-shares with the driver. + # Note: for Ray Client, the ray.get_runtime_context().get_node_id() should + # point to the head node. 
+ scheduling_strategy = NodeAffinitySchedulingStrategy( + ray.get_runtime_context().get_node_id(), + soft=False, + ) + with _autoscaling_requester_lock: + return AutoscalingRequester.options( + name="AutoscalingRequester", + namespace="AutoscalingRequester", + get_if_exists=True, + lifetime="detached", + scheduling_strategy=scheduling_strategy, + ).remote() diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/backpressure_policy/__init__.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/backpressure_policy/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a9d6ac177e97dda39e4f3b59c30d51efa30ddfc4 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/backpressure_policy/__init__.py @@ -0,0 +1,32 @@ +from typing import TYPE_CHECKING + +import ray +from .backpressure_policy import BackpressurePolicy +from .concurrency_cap_backpressure_policy import ConcurrencyCapBackpressurePolicy + +if TYPE_CHECKING: + from ray.data._internal.execution.streaming_executor_state import Topology + +# Default enabled backpressure policies and its config key. +# Use `DataContext.set_config` to config it. 
+ENABLED_BACKPRESSURE_POLICIES = [ + ConcurrencyCapBackpressurePolicy, +] +ENABLED_BACKPRESSURE_POLICIES_CONFIG_KEY = "backpressure_policies.enabled" + + +def get_backpressure_policies(topology: "Topology"): + data_context = ray.data.DataContext.get_current() + policies = data_context.get_config( + ENABLED_BACKPRESSURE_POLICIES_CONFIG_KEY, ENABLED_BACKPRESSURE_POLICIES + ) + + return [policy(topology) for policy in policies] + + +__all__ = [ + "BackpressurePolicy", + "ConcurrencyCapBackpressurePolicy", + "ENABLED_BACKPRESSURE_POLICIES_CONFIG_KEY", + "get_backpressure_policies", +] diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/backpressure_policy/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/backpressure_policy/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..30c092a4600a4b589df948302c0a037c941e6863 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/backpressure_policy/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/backpressure_policy/__pycache__/backpressure_policy.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/backpressure_policy/__pycache__/backpressure_policy.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b40ae77cf6fa4afe02f1a4da94ff252bdf1e97f7 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/backpressure_policy/__pycache__/backpressure_policy.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/backpressure_policy/__pycache__/concurrency_cap_backpressure_policy.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/backpressure_policy/__pycache__/concurrency_cap_backpressure_policy.cpython-311.pyc new file mode 
100644 index 0000000000000000000000000000000000000000..f6104096d5e45c84be9ca741b6d85d020f60ced4 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/backpressure_policy/__pycache__/concurrency_cap_backpressure_policy.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/backpressure_policy/backpressure_policy.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/backpressure_policy/backpressure_policy.py new file mode 100644 index 0000000000000000000000000000000000000000..6577936e1dd610f22ab038ab9aa15b8c181467b8 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/backpressure_policy/backpressure_policy.py @@ -0,0 +1,28 @@ +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from ray.data._internal.execution.interfaces.physical_operator import ( + PhysicalOperator, + ) + from ray.data._internal.execution.streaming_executor_state import Topology + + +class BackpressurePolicy(ABC): + """Interface for back pressure policies.""" + + @abstractmethod + def __init__(self, topology: "Topology"): + ... + + def can_add_input(self, op: "PhysicalOperator") -> bool: + """Determine if we can add a new input to the operator. If returns False, the + operator will be backpressured and will not be able to run new tasks. + Used in `streaming_executor_state.py::select_operator_to_run()`. + + Returns: True if we can add a new input to the operator, False otherwise. + + Note, if multiple backpressure policies are enabled, the operator will be + backpressured if any of the policies returns False. 
+ """ + return True diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/backpressure_policy/concurrency_cap_backpressure_policy.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/backpressure_policy/concurrency_cap_backpressure_policy.py new file mode 100644 index 0000000000000000000000000000000000000000..a52bd1f6ab9f7a6e544bb07a0cacd008eb219e35 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/backpressure_policy/concurrency_cap_backpressure_policy.py @@ -0,0 +1,43 @@ +import logging +from typing import TYPE_CHECKING + +from .backpressure_policy import BackpressurePolicy +from ray.data._internal.execution.operators.task_pool_map_operator import ( + TaskPoolMapOperator, +) + +if TYPE_CHECKING: + from ray.data._internal.execution.interfaces.physical_operator import ( + PhysicalOperator, + ) + from ray.data._internal.execution.streaming_executor_state import Topology + +logger = logging.getLogger(__name__) + + +class ConcurrencyCapBackpressurePolicy(BackpressurePolicy): + """A backpressure policy that caps the concurrency of each operator. + + The policy will limit the number of concurrently running tasks based on its + concurrency cap parameter. + + NOTE: Only support setting concurrency cap for `TaskPoolMapOperator` for now. + TODO(chengsu): Consolidate with actor scaling logic of `ActorPoolMapOperator`. 
+ """ + + def __init__(self, topology: "Topology"): + self._concurrency_caps: dict["PhysicalOperator", float] = {} + + for op, _ in topology.items(): + if isinstance(op, TaskPoolMapOperator) and op.get_concurrency() is not None: + self._concurrency_caps[op] = op.get_concurrency() + else: + self._concurrency_caps[op] = float("inf") + + logger.debug( + "ConcurrencyCapBackpressurePolicy initialized with: " + f"{self._concurrency_caps}" + ) + + def can_add_input(self, op: "PhysicalOperator") -> bool: + return op.metrics.num_tasks_running < self._concurrency_caps[op] diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/bundle_queue/__init__.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/bundle_queue/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..aa51465258e2838a6b55b928ad3c0d0361a90d38 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/bundle_queue/__init__.py @@ -0,0 +1,9 @@ +from .bundle_queue import BundleQueue +from .fifo_bundle_queue import FIFOBundleQueue + + +def create_bundle_queue() -> BundleQueue: + return FIFOBundleQueue() + + +__all__ = ["BundleQueue", "create_bundle_queue"] diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/bundle_queue/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/bundle_queue/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..942c347ecff5056e5e1f0a1e7e5ba5e6c85a8584 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/bundle_queue/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/bundle_queue/__pycache__/bundle_queue.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/bundle_queue/__pycache__/bundle_queue.cpython-311.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..d8b3baff6045d1ffe584b578f53d04227346cc41 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/bundle_queue/__pycache__/bundle_queue.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/bundle_queue/__pycache__/fifo_bundle_queue.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/bundle_queue/__pycache__/fifo_bundle_queue.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..37b97ee9b061fe816f8af56abb5fe43ed6d4438f Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/bundle_queue/__pycache__/fifo_bundle_queue.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/bundle_queue/bundle_queue.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/bundle_queue/bundle_queue.py new file mode 100644 index 0000000000000000000000000000000000000000..f11bacf14c333febd11d840b3f4f369491a74f96 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/bundle_queue/bundle_queue.py @@ -0,0 +1,62 @@ +import abc +from typing import TYPE_CHECKING, Optional + +if TYPE_CHECKING: + from ray.data._internal.execution.interfaces import RefBundle + + +class BundleQueue(abc.ABC): + @abc.abstractmethod + def __len__(self) -> int: + """Return the number of bundles in the queue.""" + ... + + @abc.abstractmethod + def __contains__(self, bundle: "RefBundle") -> bool: + """Return whether the bundle is in the queue.""" + ... + + @abc.abstractmethod + def add(self, bundle: "RefBundle") -> None: + """Add a bundle to the queue.""" + ... + + @abc.abstractmethod + def pop(self) -> "RefBundle": + """Remove and return the head of the queue. + + Raises: + IndexError: If the queue is empty. + """ + ... 
+ + @abc.abstractmethod + def peek(self) -> Optional["RefBundle"]: + """Return the head of the queue without removing it. + + If the queue is empty, return `None`. + """ + ... + + @abc.abstractmethod + def remove(self, bundle: "RefBundle"): + """Remove a bundle from the queue.""" + ... + + @abc.abstractmethod + def clear(self): + """Remove all bundles from the queue.""" + ... + + @abc.abstractmethod + def estimate_size_bytes(self) -> int: + """Return an estimate of the total size of objects in the queue.""" + ... + + @abc.abstractmethod + def is_empty(self): + """Return whether this queue and all of its internal data structures are empty. + + This method is used for testing. + """ + ... diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/execution_callback.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/execution_callback.py new file mode 100644 index 0000000000000000000000000000000000000000..8cc61a581088aafed4a36a4a94d16d9fd6cab1f4 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/execution_callback.py @@ -0,0 +1,40 @@ +from typing import List + +from ray.data.context import DataContext + +EXECUTION_CALLBACKS_CONFIG_KEY = "execution_callbacks" + + +class ExecutionCallback: + """Callback interface for execution events.""" + + def before_execution_starts(self): + """Called before the Dataset execution starts.""" + ... + + def after_execution_succeeds(self): + """Called after the Dataset execution succeeds.""" + ... + + def after_execution_fails(self, error: Exception): + """Called after the Dataset execution fails.""" + ... 
+ + +def get_execution_callbacks(context: DataContext) -> List[ExecutionCallback]: + """Get all ExecutionCallbacks from the DataContext.""" + return context.get_config(EXECUTION_CALLBACKS_CONFIG_KEY, []) + + +def add_execution_callback(callback: ExecutionCallback, context: DataContext): + """Add an ExecutionCallback to the DataContext.""" + execution_callbacks = context.get_config(EXECUTION_CALLBACKS_CONFIG_KEY, []) + execution_callbacks.append(callback) + context.set_config(EXECUTION_CALLBACKS_CONFIG_KEY, execution_callbacks) + + +def remove_execution_callback(callback: ExecutionCallback, context: DataContext): + """Remove an ExecutionCallback from the DataContext.""" + execution_callbacks = context.get_config(EXECUTION_CALLBACKS_CONFIG_KEY, []) + execution_callbacks.remove(callback) + context.set_config(EXECUTION_CALLBACKS_CONFIG_KEY, execution_callbacks) diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/__init__.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..32d84b64abddc45fe96e54e370ab189d57567d5e --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/__init__.py @@ -0,0 +1,19 @@ +from .common import NodeIdStr +from .execution_options import ExecutionOptions, ExecutionResources +from .executor import Executor, OutputIterator +from .physical_operator import PhysicalOperator +from .ref_bundle import RefBundle +from .task_context import TaskContext +from .transform_fn import AllToAllTransformFn + +__all__ = [ + "AllToAllTransformFn", + "ExecutionOptions", + "ExecutionResources", + "Executor", + "NodeIdStr", + "OutputIterator", + "PhysicalOperator", + "RefBundle", + "TaskContext", +] diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/common.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/common.py new file 
mode 100644 index 0000000000000000000000000000000000000000..a337c90e7dcce2c62f39bb917e31114a814f195a --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/common.py @@ -0,0 +1,2 @@ +# Node id string returned by `ray.get_runtime_context().get_node_id()`. +NodeIdStr = str diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/execution_options.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/execution_options.py new file mode 100644 index 0000000000000000000000000000000000000000..8000901992bc1ba672dc893222975c95aad095ef --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/execution_options.py @@ -0,0 +1,300 @@ +import os +from typing import Dict, List, Optional, Union + +from .common import NodeIdStr +from ray.data._internal.execution.util import memory_string +from ray.util.annotations import DeveloperAPI + + +class ExecutionResources: + """Specifies resources usage or resource limits for execution. + + By default this class represents resource usage. Use `for_limits` or + set `default_to_inf` to True to create an object that represents resource limits. + """ + + def __init__( + self, + cpu: Optional[float] = None, + gpu: Optional[float] = None, + object_store_memory: Optional[float] = None, + default_to_inf: bool = False, + ): + """Initializes ExecutionResources. + Args: + cpu: Amount of logical CPU slots. + gpu: Amount of logical GPU slots. + object_store_memory: Amount of object store memory. + default_to_inf: When the object represents resource usage, this flag + should be set to False. And missing values will default to 0. + When the object represents resource limits, this flag should be + set to True. And missing values will default to infinity. 
+ """ + self._cpu = cpu + self._gpu = gpu + self._object_store_memory = object_store_memory + self._default_to_inf = default_to_inf + + @classmethod + def from_resource_dict( + cls, + resource_dict: Dict[str, float], + default_to_inf: bool = False, + ): + """Create an ExecutionResources object from a resource dict.""" + return ExecutionResources( + cpu=resource_dict.get("CPU", None), + gpu=resource_dict.get("GPU", None), + object_store_memory=resource_dict.get("object_store_memory", None), + default_to_inf=default_to_inf, + ) + + @classmethod + def for_limits( + cls, + cpu: Optional[float] = None, + gpu: Optional[float] = None, + object_store_memory: Optional[float] = None, + ) -> "ExecutionResources": + """Create an ExecutionResources object that represents resource limits. + Args: + cpu: Amount of logical CPU slots. + gpu: Amount of logical GPU slots. + object_store_memory: Amount of object store memory. + """ + return ExecutionResources( + cpu=cpu, + gpu=gpu, + object_store_memory=object_store_memory, + default_to_inf=True, + ) + + @property + def cpu(self) -> float: + if self._cpu is not None: + return self._cpu + return 0.0 if not self._default_to_inf else float("inf") + + @cpu.setter + def cpu(self, value: float): + self._cpu = value + + @property + def gpu(self) -> float: + if self._gpu is not None: + return self._gpu + return 0.0 if not self._default_to_inf else float("inf") + + @gpu.setter + def gpu(self, value: float): + self._gpu = value + + @property + def object_store_memory(self) -> float: + if self._object_store_memory is not None: + return self._object_store_memory + return 0.0 if not self._default_to_inf else float("inf") + + @object_store_memory.setter + def object_store_memory(self, value: float): + self._object_store_memory = value + + def __repr__(self): + return ( + f"ExecutionResources(cpu={self.cpu:.1f}, gpu={self.gpu:.1f}, " + f"object_store_memory={self.object_store_memory_str()})" + ) + + def __eq__(self, other: "ExecutionResources") -> 
bool: + return ( + self.cpu == other.cpu + and self.gpu == other.gpu + and self.object_store_memory == other.object_store_memory + ) + + @classmethod + def zero(cls) -> "ExecutionResources": + """Returns an ExecutionResources object with zero resources.""" + return ExecutionResources(0.0, 0.0, 0.0) + + def is_zero(self) -> bool: + """Returns True if all resources are zero.""" + return self.cpu == 0.0 and self.gpu == 0.0 and self.object_store_memory == 0.0 + + def is_non_negative(self) -> bool: + """Returns True if all resources are non-negative.""" + return self.cpu >= 0 and self.gpu >= 0 and self.object_store_memory >= 0 + + def object_store_memory_str(self) -> str: + """Returns a human-readable string for the object store memory field.""" + if self.object_store_memory == float("inf"): + return "inf" + return memory_string(self.object_store_memory) + + def copy(self) -> "ExecutionResources": + """Returns a copy of this ExecutionResources object.""" + return ExecutionResources( + self._cpu, self._gpu, self._object_store_memory, self._default_to_inf + ) + + def add(self, other: "ExecutionResources") -> "ExecutionResources": + """Adds execution resources. + + Returns: + A new ExecutionResource object with summed resources. + """ + return ExecutionResources( + self.cpu + other.cpu, + self.gpu + other.gpu, + self.object_store_memory + other.object_store_memory, + ) + + def subtract(self, other: "ExecutionResources") -> "ExecutionResources": + """Subtracts execution resources. + + Returns: + A new ExecutionResource object with subtracted resources. 
+ """ + return ExecutionResources( + self.cpu - other.cpu, + self.gpu - other.gpu, + self.object_store_memory - other.object_store_memory, + ) + + def max(self, other: "ExecutionResources") -> "ExecutionResources": + """Returns the maximum for each resource type.""" + return ExecutionResources( + cpu=max(self.cpu, other.cpu), + gpu=max(self.gpu, other.gpu), + object_store_memory=max( + self.object_store_memory, other.object_store_memory + ), + ) + + def min(self, other: "ExecutionResources") -> "ExecutionResources": + """Returns the minimum for each resource type.""" + return ExecutionResources( + cpu=min(self.cpu, other.cpu), + gpu=min(self.gpu, other.gpu), + object_store_memory=min( + self.object_store_memory, other.object_store_memory + ), + ) + + def satisfies_limit(self, limit: "ExecutionResources") -> bool: + """Return if this resource struct meets the specified limits. + + Note that None for a field means no limit. + """ + return ( + self.cpu <= limit.cpu + and self.gpu <= limit.gpu + and self.object_store_memory <= limit.object_store_memory + ) + + def scale(self, f: float) -> "ExecutionResources": + """Return copy with all set values scaled by `f`.""" + if f < 0: + raise ValueError("Scaling factor must be non-negative.") + if f == 0: + # Explicitly handle the zero case, because `0 * inf` is undefined. + return ExecutionResources.zero() + return ExecutionResources( + cpu=self.cpu * f, + gpu=self.gpu * f, + object_store_memory=self.object_store_memory * f, + ) + + +@DeveloperAPI +class ExecutionOptions: + """Common options for execution. + + Some options may not be supported on all executors (e.g., resource limits). + + Attributes: + resource_limits: Set a soft limit on the resource usage during execution. + Autodetected by default. + exclude_resources: Amount of resources to exclude from Ray Data. + Set this if you have other workloads running on the same cluster. 
+ Note, + - If using Ray Data with Ray Train, training resources will be + automatically excluded. + - For each resource type, resource_limits and exclude_resources can + not be both set. + locality_with_output: Set this to prefer running tasks on the same node as the + output node (node driving the execution). It can also be set to a list of + node ids to spread the outputs across those nodes. Off by default. + preserve_order: Set this to preserve the ordering between blocks processed by + operators. Off by default. + actor_locality_enabled: Whether to enable locality-aware task dispatch to + actors (off by default). This parameter applies to both stateful map and + streaming_split operations. + verbose_progress: Whether to report progress individually per operator. By + default, only AllToAll operators and global progress is reported. This + option is useful for performance debugging. On by default. + """ + + def __init__( + self, + resource_limits: Optional[ExecutionResources] = None, + exclude_resources: Optional[ExecutionResources] = None, + locality_with_output: Union[bool, List[NodeIdStr]] = False, + preserve_order: bool = False, + # TODO(hchen): Re-enable `actor_locality_enabled` by default after fixing + # https://github.com/ray-project/ray/issues/43466 + actor_locality_enabled: bool = False, + verbose_progress: Optional[bool] = None, + ): + if resource_limits is None: + resource_limits = ExecutionResources.for_limits() + self.resource_limits = resource_limits + if exclude_resources is None: + exclude_resources = ExecutionResources.zero() + self.exclude_resources = exclude_resources + self.locality_with_output = locality_with_output + self.preserve_order = preserve_order + self.actor_locality_enabled = actor_locality_enabled + if verbose_progress is None: + verbose_progress = bool( + int(os.environ.get("RAY_DATA_VERBOSE_PROGRESS", "1")) + ) + self.verbose_progress = verbose_progress + + def __repr__(self) -> str: + return ( + 
f"ExecutionOptions(resource_limits={self.resource_limits}, " + f"exclude_resources={self.exclude_resources}, " + f"locality_with_output={self.locality_with_output}, " + f"preserve_order={self.preserve_order}, " + f"actor_locality_enabled={self.actor_locality_enabled}, " + f"verbose_progress={self.verbose_progress})" + ) + + @property + def resource_limits(self) -> ExecutionResources: + return self._resource_limits + + @resource_limits.setter + def resource_limits(self, value: ExecutionResources) -> None: + self._resource_limits = ExecutionResources.for_limits( + cpu=value._cpu, + gpu=value._gpu, + object_store_memory=value._object_store_memory, + ) + + def is_resource_limits_default(self): + """Returns True if resource_limits is the default value.""" + return self._resource_limits == ExecutionResources.for_limits() + + def validate(self) -> None: + """Validate the options.""" + for attr in ["cpu", "gpu", "object_store_memory"]: + if ( + getattr(self.resource_limits, attr) != float("inf") + and getattr(self.exclude_resources, attr, 0) > 0 + ): + raise ValueError( + "resource_limits and exclude_resources cannot " + f" both be set for {attr} resource." + ) diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/executor.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/executor.py new file mode 100644 index 0000000000000000000000000000000000000000..007346b60f29473252b888e717d67149e58343c1 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/executor.py @@ -0,0 +1,77 @@ +from typing import Iterable, Iterator, Optional + +from .execution_options import ExecutionOptions +from .physical_operator import PhysicalOperator +from .ref_bundle import RefBundle +from ray.data._internal.stats import DatasetStats + + +class OutputIterator(Iterator[RefBundle]): + """Iterator used to access the output of an Executor execution. + + This is a blocking iterator. 
Datasets guarantees that all its iterators are + thread-safe (i.e., multiple threads can block on them at the same time). + """ + + def __init__(self, base: Iterable[RefBundle]): + self._it = iter(base) + + def get_next(self, output_split_idx: Optional[int] = None) -> RefBundle: + """Can be used to pull outputs by a specified output index. + + This is used to support the streaming_split() API, where the output of a + streaming execution is to be consumed by multiple processes. + + Args: + output_split_idx: The output split index to get results for. This arg is + only allowed for iterators created by `Dataset.streaming_split()`. + + Raises: + StopIteration if there are no more outputs to return. + """ + if output_split_idx is not None: + raise NotImplementedError() + return next(self._it) + + def __next__(self) -> RefBundle: + return self.get_next() + + +class Executor: + """Abstract class for executors, which implement physical operator execution. + + Subclasses: + StreamingExecutor + """ + + def __init__(self, options: ExecutionOptions): + """Create the executor.""" + options.validate() + self._options = options + + def execute( + self, dag: PhysicalOperator, initial_stats: Optional[DatasetStats] = None + ) -> OutputIterator: + """Start execution. + + Args: + dag: The operator graph to execute. + initial_stats: The DatasetStats to prepend to the stats returned by the + executor. These stats represent actions done to compute inputs. + """ + raise NotImplementedError + + def shutdown(self): + """Shutdown an executor, which may still be running. + + This should interrupt execution and clean up any used resources. + """ + pass + + def get_stats(self) -> DatasetStats: + """Return stats for the execution so far. + + This is generally called after `execute` has completed, but may be called + while iterating over `execute` results for streaming execution. 
+ """ + raise NotImplementedError diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/op_runtime_metrics.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/op_runtime_metrics.py new file mode 100644 index 0000000000000000000000000000000000000000..c70c6fbecd60fd22fb65d81214e025d291ca3c20 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/op_runtime_metrics.py @@ -0,0 +1,574 @@ +import time +from dataclasses import Field, dataclass, field +from enum import Enum +from typing import TYPE_CHECKING, Any, Dict, List, Optional + +import ray +from ray.data._internal.execution.bundle_queue import create_bundle_queue +from ray.data._internal.execution.interfaces.ref_bundle import RefBundle +from ray.data._internal.memory_tracing import trace_allocation + +if TYPE_CHECKING: + from ray.data._internal.execution.interfaces.physical_operator import ( + PhysicalOperator, + ) + + +# A metadata key used to mark a dataclass field as a metric. +_IS_FIELD_METRIC_KEY = "__is_metric" +# Metadata keys used to store information about a metric. +_METRIC_FIELD_DESCRIPTION_KEY = "__metric_description" +_METRIC_FIELD_METRICS_GROUP_KEY = "__metric_metrics_group" +_METRIC_FIELD_IS_MAP_ONLY_KEY = "__metric_is_map_only" + +_METRICS: List["MetricDefinition"] = [] + + +class MetricsGroup(Enum): + INPUTS = "inputs" + OUTPUTS = "outputs" + TASKS = "tasks" + OBJECT_STORE_MEMORY = "object_store_memory" + MISC = "misc" + + +@dataclass(frozen=True) +class MetricDefinition: + """Metadata for a metric. + + Args: + name: The name of the metric. + description: A human-readable description of the metric, also used as the chart + description on the Ray Data dashboard. + metrics_group: The group of the metric, used to organize metrics into groups in + 'StatsActor' and on the Ray Data dashboard. + map_only: Whether the metric is only measured for 'MapOperators'. 
+ """ + + name: str + description: str + metrics_group: str + # TODO: Let's refactor this parameter so it isn't tightly coupled with a specific + # operator type (MapOperator). + map_only: bool = False + + +def metric_field( + *, + description: str, + metrics_group: str, + map_only: bool = False, + **field_kwargs, +): + """A dataclass field that represents a metric.""" + metadata = field_kwargs.get("metadata", {}) + + metadata[_IS_FIELD_METRIC_KEY] = True + + metadata[_METRIC_FIELD_DESCRIPTION_KEY] = description + metadata[_METRIC_FIELD_METRICS_GROUP_KEY] = metrics_group + metadata[_METRIC_FIELD_IS_MAP_ONLY_KEY] = map_only + + return field(metadata=metadata, **field_kwargs) + + +def metric_property( + *, + description: str, + metrics_group: str, + map_only: bool = False, +): + """A property that represents a metric.""" + + def wrap(func): + metric = MetricDefinition( + name=func.__name__, + description=description, + metrics_group=metrics_group, + map_only=map_only, + ) + + _METRICS.append(metric) + + return property(func) + + return wrap + + +@dataclass +class RunningTaskInfo: + inputs: RefBundle + num_outputs: int + bytes_outputs: int + + +class OpRuntimesMetricsMeta(type): + def __init__(cls, name, bases, dict): + # NOTE: `Field.name` isn't set until the dataclass is created, so we can't + # create the metrics in `metric_field` directly. + super().__init__(name, bases, dict) + + # Iterate over the attributes and methods of 'OpRuntimeMetrics'. + for name, value in dict.items(): + # If an attribute is a dataclass field and has _IS_FIELD_METRIC_KEY in its + # metadata, then create a metric from the field metadata and add it to the + # list of metrics. See also the 'metric_field' function. 
+ if isinstance(value, Field) and value.metadata.get(_IS_FIELD_METRIC_KEY): + metric = MetricDefinition( + name=name, + description=value.metadata[_METRIC_FIELD_DESCRIPTION_KEY], + metrics_group=value.metadata[_METRIC_FIELD_METRICS_GROUP_KEY], + map_only=value.metadata[_METRIC_FIELD_IS_MAP_ONLY_KEY], + ) + _METRICS.append(metric) + + +@dataclass +class OpRuntimeMetrics(metaclass=OpRuntimesMetricsMeta): + """Runtime metrics for a 'PhysicalOperator'. + + Metrics are updated dynamically during the execution of the Dataset. + This class can be used for either observablity or scheduling purposes. + + DO NOT modify the fields of this class directly. Instead, use the provided + callback methods. + """ + + # TODO(hchen): Fields tagged with "map_only" currently only work for MapOperator. + # We should make them work for all operators by unifying the task execution code. + + # === Inputs-related metrics === + num_inputs_received: int = metric_field( + default=0, + description="Number of input blocks received by operator.", + metrics_group=MetricsGroup.INPUTS, + ) + bytes_inputs_received: int = metric_field( + default=0, + description="Byte size of input blocks received by operator.", + metrics_group=MetricsGroup.INPUTS, + ) + num_task_inputs_processed: int = metric_field( + default=0, + description=( + "Number of input blocks that operator's tasks have finished processing." + ), + metrics_group=MetricsGroup.INPUTS, + map_only=True, + ) + bytes_task_inputs_processed: int = metric_field( + default=0, + description=( + "Byte size of input blocks that operator's tasks have finished processing." 
+ ), + metrics_group=MetricsGroup.INPUTS, + map_only=True, + ) + bytes_inputs_of_submitted_tasks: int = metric_field( + default=0, + description="Byte size of input blocks passed to submitted tasks.", + metrics_group=MetricsGroup.INPUTS, + map_only=True, + ) + + # === Outputs-related metrics === + num_task_outputs_generated: int = metric_field( + default=0, + description="Number of output blocks generated by tasks.", + metrics_group=MetricsGroup.OUTPUTS, + map_only=True, + ) + bytes_task_outputs_generated: int = metric_field( + default=0, + description="Byte size of output blocks generated by tasks.", + metrics_group=MetricsGroup.OUTPUTS, + map_only=True, + ) + rows_task_outputs_generated: int = metric_field( + default=0, + description="Number of output rows generated by tasks.", + metrics_group=MetricsGroup.OUTPUTS, + map_only=True, + ) + num_outputs_taken: int = metric_field( + default=0, + description=( + "Number of output blocks that are already taken by downstream operators." + ), + metrics_group=MetricsGroup.OUTPUTS, + ) + bytes_outputs_taken: int = metric_field( + default=0, + description=( + "Byte size of output blocks that are already taken by downstream operators." + ), + metrics_group=MetricsGroup.OUTPUTS, + ) + num_outputs_of_finished_tasks: int = metric_field( + default=0, + description="Number of generated output blocks that are from finished tasks.", + metrics_group=MetricsGroup.OUTPUTS, + map_only=True, + ) + bytes_outputs_of_finished_tasks: int = metric_field( + default=0, + description=( + "Byte size of generated output blocks that are from finished tasks." 
+ ), + metrics_group=MetricsGroup.OUTPUTS, + map_only=True, + ) + + # === Tasks-related metrics === + num_tasks_submitted: int = metric_field( + default=0, + description="Number of submitted tasks.", + metrics_group=MetricsGroup.TASKS, + map_only=True, + ) + num_tasks_running: int = metric_field( + default=0, + description="Number of running tasks.", + metrics_group=MetricsGroup.TASKS, + map_only=True, + ) + num_tasks_have_outputs: int = metric_field( + default=0, + description="Number of tasks that already have output.", + metrics_group=MetricsGroup.TASKS, + map_only=True, + ) + num_tasks_finished: int = metric_field( + default=0, + description="Number of finished tasks.", + metrics_group=MetricsGroup.TASKS, + map_only=True, + ) + num_tasks_failed: int = metric_field( + default=0, + description="Number of failed tasks.", + metrics_group=MetricsGroup.TASKS, + map_only=True, + ) + block_generation_time: float = metric_field( + default=0, + description="Time spent generating blocks in tasks.", + metrics_group=MetricsGroup.TASKS, + map_only=True, + ) + task_submission_backpressure_time: float = metric_field( + default=0, + description="Time spent in task submission backpressure.", + metrics_group=MetricsGroup.TASKS, + ) + + # === Object store memory metrics === + obj_store_mem_internal_inqueue_blocks: int = metric_field( + default=0, + description="Number of blocks in operator's internal input queue.", + metrics_group=MetricsGroup.OBJECT_STORE_MEMORY, + ) + obj_store_mem_internal_outqueue_blocks: int = metric_field( + default=0, + description="Number of blocks in the operator's internal output queue.", + metrics_group=MetricsGroup.OBJECT_STORE_MEMORY, + ) + obj_store_mem_freed: int = metric_field( + default=0, + description="Byte size of freed memory in object store.", + metrics_group=MetricsGroup.OBJECT_STORE_MEMORY, + map_only=True, + ) + obj_store_mem_spilled: int = metric_field( + default=0, + description="Byte size of spilled memory in object store.", + 
metrics_group=MetricsGroup.OBJECT_STORE_MEMORY, + map_only=True, + ) + obj_store_mem_used: int = metric_field( + default=0, + description="Byte size of used memory in object store.", + metrics_group=MetricsGroup.OBJECT_STORE_MEMORY, + ) + + # === Miscellaneous metrics === + # Use "metrics_group: "misc" in the metadata for new metrics in this section. + + def __init__(self, op: "PhysicalOperator"): + from ray.data._internal.execution.operators.map_operator import MapOperator + + self._op = op + self._is_map = isinstance(op, MapOperator) + self._running_tasks: Dict[int, RunningTaskInfo] = {} + self._extra_metrics: Dict[str, Any] = {} + # Start time of current pause due to task submission backpressure + self._task_submission_backpressure_start_time = -1 + + self._internal_inqueue = create_bundle_queue() + self._internal_outqueue = create_bundle_queue() + self._pending_task_inputs = create_bundle_queue() + + @property + def extra_metrics(self) -> Dict[str, Any]: + """Return a dict of extra metrics.""" + return self._extra_metrics + + @classmethod + def get_metrics(self) -> List[MetricDefinition]: + return list(_METRICS) + + def as_dict(self): + """Return a dict representation of the metrics.""" + result = [] + for metric in self.get_metrics(): + if not self._is_map and metric.map_only: + continue + value = getattr(self, metric.name) + result.append((metric.name, value)) + + # TODO: record resource usage in OpRuntimeMetrics, + # avoid calling self._op.current_processor_usage() + resource_usage = self._op.current_processor_usage() + result.extend( + [ + ("cpu_usage", resource_usage.cpu or 0), + ("gpu_usage", resource_usage.gpu or 0), + ] + ) + result.extend(self._extra_metrics.items()) + return dict(result) + + @metric_property( + description="Average number of blocks generated per task.", + metrics_group=MetricsGroup.OUTPUTS, + map_only=True, + ) + def average_num_outputs_per_task(self) -> Optional[float]: + """Average number of output blocks per task, or None if no 
task has finished.""" + if self.num_tasks_finished == 0: + return None + else: + return self.num_outputs_of_finished_tasks / self.num_tasks_finished + + @metric_property( + description="Average size of task output in bytes.", + metrics_group=MetricsGroup.OUTPUTS, + map_only=True, + ) + def average_bytes_per_output(self) -> Optional[float]: + """Average size in bytes of output blocks.""" + if self.num_task_outputs_generated == 0: + return None + else: + return self.bytes_task_outputs_generated / self.num_task_outputs_generated + + @metric_property( + description="Byte size of input blocks in the operator's internal input queue.", + metrics_group=MetricsGroup.OBJECT_STORE_MEMORY, + ) + def obj_store_mem_internal_inqueue(self) -> int: + return self._internal_inqueue.estimate_size_bytes() + + @metric_property( + description=( + "Byte size of output blocks in the operator's internal output queue." + ), + metrics_group=MetricsGroup.OBJECT_STORE_MEMORY, + ) + def obj_store_mem_internal_outqueue(self) -> int: + return self._internal_outqueue.estimate_size_bytes() + + @metric_property( + description="Byte size of input blocks used by pending tasks.", + metrics_group=MetricsGroup.OBJECT_STORE_MEMORY, + map_only=True, + ) + def obj_store_mem_pending_task_inputs(self) -> int: + return self._pending_task_inputs.estimate_size_bytes() + + @property + def obj_store_mem_pending_task_outputs(self) -> Optional[float]: + """Estimated size in bytes of output blocks in Ray generator buffers. + + If an estimate isn't available, this property returns ``None``. + """ + per_task_output = self.obj_store_mem_max_pending_output_per_task + if per_task_output is None: + return None + + # Ray Data launches multiple tasks per actor, but only one task runs at a + # time per actor. So, the number of actually running tasks is capped by the + # number of active actors. 
+ from ray.data._internal.execution.operators.actor_pool_map_operator import ( + ActorPoolMapOperator, + ) + + num_tasks_running = self.num_tasks_running + if isinstance(self._op, ActorPoolMapOperator): + num_tasks_running = min( + num_tasks_running, self._op._actor_pool.num_active_actors() + ) + + return num_tasks_running * per_task_output + + @property + def obj_store_mem_max_pending_output_per_task(self) -> Optional[float]: + """Estimated size in bytes of output blocks in a task's generator buffer.""" + context = self._op.data_context + if context._max_num_blocks_in_streaming_gen_buffer is None: + return None + + bytes_per_output = self.average_bytes_per_output + if bytes_per_output is None: + bytes_per_output = context.target_max_block_size + + num_pending_outputs = context._max_num_blocks_in_streaming_gen_buffer + if self.average_num_outputs_per_task is not None: + num_pending_outputs = min( + num_pending_outputs, self.average_num_outputs_per_task + ) + return bytes_per_output * num_pending_outputs + + @metric_property( + description="Average size of task inputs in bytes.", + metrics_group=MetricsGroup.INPUTS, + map_only=True, + ) + def average_bytes_inputs_per_task(self) -> Optional[float]: + """Average size in bytes of ref bundles passed to tasks, or ``None`` if no + tasks have been submitted.""" + if self.num_tasks_submitted == 0: + return None + else: + return self.bytes_inputs_of_submitted_tasks / self.num_tasks_submitted + + @metric_property( + description="Average total output size of task in bytes.", + metrics_group=MetricsGroup.OUTPUTS, + map_only=True, + ) + def average_bytes_outputs_per_task(self) -> Optional[float]: + """Average size in bytes of output blocks per task, + or None if no task has finished.""" + if self.num_tasks_finished == 0: + return None + else: + return self.bytes_outputs_of_finished_tasks / self.num_tasks_finished + + def on_input_received(self, input: RefBundle): + """Callback when the operator receives a new input.""" + 
self.num_inputs_received += 1 + self.bytes_inputs_received += input.size_bytes() + + def on_input_queued(self, input: RefBundle): + """Callback when the operator queues an input.""" + self.obj_store_mem_internal_inqueue_blocks += len(input.blocks) + self._internal_inqueue.add(input) + + def on_input_dequeued(self, input: RefBundle): + """Callback when the operator dequeues an input.""" + self.obj_store_mem_internal_inqueue_blocks -= len(input.blocks) + input_size = input.size_bytes() + self._internal_inqueue.remove(input) + assert self.obj_store_mem_internal_inqueue >= 0, ( + self._op, + self.obj_store_mem_internal_inqueue, + input_size, + ) + + def on_output_queued(self, output: RefBundle): + """Callback when an output is queued by the operator.""" + self.obj_store_mem_internal_outqueue_blocks += len(output.blocks) + self._internal_outqueue.add(output) + + def on_output_dequeued(self, output: RefBundle): + """Callback when an output is dequeued by the operator.""" + self.obj_store_mem_internal_outqueue_blocks -= len(output.blocks) + output_size = output.size_bytes() + self._internal_outqueue.remove(output) + assert self.obj_store_mem_internal_outqueue >= 0, ( + self._op, + self.obj_store_mem_internal_outqueue, + output_size, + ) + + def on_toggle_task_submission_backpressure(self, in_backpressure): + if in_backpressure and self._task_submission_backpressure_start_time == -1: + # backpressure starting, start timer + self._task_submission_backpressure_start_time = time.perf_counter() + elif self._task_submission_backpressure_start_time != -1: + # backpressure stopping, stop timer + self.task_submission_backpressure_time += ( + time.perf_counter() - self._task_submission_backpressure_start_time + ) + self._task_submission_backpressure_start_time = -1 + + def on_output_taken(self, output: RefBundle): + """Callback when an output is taken from the operator.""" + self.num_outputs_taken += 1 + self.bytes_outputs_taken += output.size_bytes() + + def 
on_task_submitted(self, task_index: int, inputs: RefBundle): + """Callback when the operator submits a task.""" + self.num_tasks_submitted += 1 + self.num_tasks_running += 1 + self.bytes_inputs_of_submitted_tasks += inputs.size_bytes() + self._pending_task_inputs.add(inputs) + self._running_tasks[task_index] = RunningTaskInfo(inputs, 0, 0) + + def on_task_output_generated(self, task_index: int, output: RefBundle): + """Callback when a new task generates an output.""" + num_outputs = len(output) + output_bytes = output.size_bytes() + + self.num_task_outputs_generated += num_outputs + self.bytes_task_outputs_generated += output_bytes + + task_info = self._running_tasks[task_index] + if task_info.num_outputs == 0: + self.num_tasks_have_outputs += 1 + task_info.num_outputs += num_outputs + task_info.bytes_outputs += output_bytes + + for block_ref, meta in output.blocks: + assert meta.exec_stats and meta.exec_stats.wall_time_s + self.block_generation_time += meta.exec_stats.wall_time_s + assert meta.num_rows is not None + self.rows_task_outputs_generated += meta.num_rows + trace_allocation(block_ref, "operator_output") + + def on_task_finished(self, task_index: int, exception: Optional[Exception]): + """Callback when a task is finished.""" + self.num_tasks_running -= 1 + self.num_tasks_finished += 1 + if exception is not None: + self.num_tasks_failed += 1 + + task_info = self._running_tasks[task_index] + self.num_outputs_of_finished_tasks += task_info.num_outputs + self.bytes_outputs_of_finished_tasks += task_info.bytes_outputs + + inputs = self._running_tasks[task_index].inputs + self.num_task_inputs_processed += len(inputs) + total_input_size = inputs.size_bytes() + self.bytes_task_inputs_processed += total_input_size + input_size = inputs.size_bytes() + self._pending_task_inputs.remove(inputs) + assert self.obj_store_mem_pending_task_inputs >= 0, ( + self._op, + self.obj_store_mem_pending_task_inputs, + input_size, + ) + + ctx = self._op.data_context + if 
ctx.enable_get_object_locations_for_metrics: + locations = ray.experimental.get_object_locations(inputs.block_refs) + for block, meta in inputs.blocks: + if locations[block].get("did_spill", False): + assert meta.size_bytes is not None + self.obj_store_mem_spilled += meta.size_bytes + + self.obj_store_mem_freed += total_input_size + + inputs.destroy_if_owned() + del self._running_tasks[task_index] diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/physical_operator.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/physical_operator.py new file mode 100644 index 0000000000000000000000000000000000000000..dff5ac476b5127567a2993928ce1aa73fb2f1d10 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/physical_operator.py @@ -0,0 +1,535 @@ +from abc import ABC, abstractmethod +from typing import Any, Callable, Dict, Iterator, List, Optional, Union + +import ray +from .ref_bundle import RefBundle +from ray._raylet import ObjectRefGenerator +from ray.data._internal.execution.autoscaler.autoscaling_actor_pool import ( + AutoscalingActorPool, +) +from ray.data._internal.execution.interfaces.execution_options import ( + ExecutionOptions, + ExecutionResources, +) +from ray.data._internal.execution.interfaces.op_runtime_metrics import OpRuntimeMetrics +from ray.data._internal.logical.interfaces import LogicalOperator, Operator +from ray.data._internal.stats import StatsDict +from ray.data.context import DataContext + +# TODO(hchen): Ray Core should have a common interface for these two types. +Waitable = Union[ray.ObjectRef, ObjectRefGenerator] + + +class OpTask(ABC): + """Abstract class that represents a task that is created by an PhysicalOperator. + + The task can be either a regular task or an actor task. 
+ """ + + def __init__(self, task_index: int): + self._task_index = task_index + + def task_index(self) -> int: + """Return the index of the task.""" + return self._task_index + + @abstractmethod + def get_waitable(self) -> Waitable: + """Return the ObjectRef or ObjectRefGenerator to wait on.""" + pass + + +class DataOpTask(OpTask): + """Represents an OpTask that handles Block data.""" + + def __init__( + self, + task_index: int, + streaming_gen: ObjectRefGenerator, + output_ready_callback: Callable[[RefBundle], None], + task_done_callback: Callable[[Optional[Exception]], None], + ): + """ + Args: + streaming_gen: The streaming generator of this task. It should yield blocks. + output_ready_callback: The callback to call when a new RefBundle is output + from the generator. + task_done_callback: The callback to call when the task is done. + """ + super().__init__(task_index) + # TODO(hchen): Right now, the streaming generator is required to yield a Block + # and a BlockMetadata each time. We should unify task submission with an unified + # interface. So each individual operator don't need to take care of the + # BlockMetadata. + self._streaming_gen = streaming_gen + self._output_ready_callback = output_ready_callback + self._task_done_callback = task_done_callback + + def get_waitable(self) -> ObjectRefGenerator: + return self._streaming_gen + + def on_data_ready(self, max_bytes_to_read: Optional[int]) -> int: + """Callback when data is ready to be read from the streaming generator. + + Args: + max_bytes_to_read: Max bytes of blocks to read. If None, all available + will be read. + Returns: The number of blocks read. + """ + bytes_read = 0 + while max_bytes_to_read is None or bytes_read < max_bytes_to_read: + try: + block_ref = self._streaming_gen._next_sync(0) + if block_ref.is_nil(): + # The generator currently doesn't have new output. + # And it's not stopped yet. 
+ break + except StopIteration: + self._task_done_callback(None) + break + + try: + meta = ray.get(next(self._streaming_gen)) + except StopIteration: + # The generator should always yield 2 values (block and metadata) + # each time. If we get a StopIteration here, it means an error + # happened in the task. + # And in this case, the block_ref is the exception object. + # TODO(hchen): Ray Core should have a better interface for + # detecting and obtaining the exception. + try: + ray.get(block_ref) + assert False, "Above ray.get should raise an exception." + except Exception as ex: + self._task_done_callback(ex) + raise ex from None + self._output_ready_callback( + RefBundle([(block_ref, meta)], owns_blocks=True) + ) + bytes_read += meta.size_bytes + return bytes_read + + +class MetadataOpTask(OpTask): + """Represents an OpTask that only handles metadata, instead of Block data.""" + + def __init__( + self, + task_index: int, + object_ref: ray.ObjectRef, + task_done_callback: Callable[[], None], + ): + """ + Args: + object_ref: The ObjectRef of the task. + task_done_callback: The callback to call when the task is done. + """ + super().__init__(task_index) + self._object_ref = object_ref + self._task_done_callback = task_done_callback + + def get_waitable(self) -> ray.ObjectRef: + return self._object_ref + + def on_task_finished(self): + """Callback when the task is finished.""" + self._task_done_callback() + + +class PhysicalOperator(Operator): + """Abstract class for physical operators. + + An operator transforms one or more input streams of RefBundles into a single + output stream of RefBundles. + + Physical operators are stateful and non-serializable; they live on the driver side + of the Dataset only. 
+ + Here's a simple example of implementing a basic "Map" operator: + + class MapOperator(PhysicalOperator): + def __init__(self): + self.active_tasks = [] + + def add_input(self, refs, _): + self.active_tasks.append(map_task.remote(refs)) + + def has_next(self): + ready, _ = ray.wait(self.active_tasks, timeout=0) + return len(ready) > 0 + + def get_next(self): + ready, remaining = ray.wait(self.active_tasks, num_returns=1) + self.active_tasks = remaining + return ready[0] + + Note that the above operator fully supports both bulk and streaming execution, + since `add_input` and `get_next` can be called in any order. In bulk execution + (now deprecated), all inputs would be added up-front, but in streaming + execution (now the default execution mode) the calls could be interleaved. + """ + + def __init__( + self, + name: str, + input_dependencies: List["PhysicalOperator"], + data_context: DataContext, + target_max_block_size: Optional[int], + ): + super().__init__(name, input_dependencies) + + for x in input_dependencies: + assert isinstance(x, PhysicalOperator), x + self._inputs_complete = not input_dependencies + self._target_max_block_size = target_max_block_size + self._started = False + self._in_task_submission_backpressure = False + self._in_task_output_backpressure = False + self._metrics = OpRuntimeMetrics(self) + self._estimated_num_output_bundles = None + self._estimated_output_num_rows = None + self._execution_completed = False + # The LogicalOperator(s) which were translated to create this PhysicalOperator. + # Set via `PhysicalOperator.set_logical_operators()`. + self._logical_operators: List[LogicalOperator] = [] + self._data_context = data_context + + def __reduce__(self): + raise ValueError("Operator is not serializable.") + + @property + def data_context(self) -> DataContext: + return self._data_context + + # Override the following 3 methods to correct type hints. 
+ + @property + def input_dependencies(self) -> List["PhysicalOperator"]: + return super().input_dependencies # type: ignore + + @property + def output_dependencies(self) -> List["PhysicalOperator"]: + return super().output_dependencies # type: ignore + + def post_order_iter(self) -> Iterator["PhysicalOperator"]: + return super().post_order_iter() # type: ignore + + def set_logical_operators( + self, + *logical_ops: LogicalOperator, + ): + self._logical_operators = list(logical_ops) + + @property + def target_max_block_size(self) -> Optional[int]: + """ + Target max block size output by this operator. If this returns None, + then the default from DataContext should be used. + """ + return self._target_max_block_size + + @property + def actual_target_max_block_size(self) -> int: + """ + The actual target max block size output by this operator. + """ + target_max_block_size = self._target_max_block_size + if target_max_block_size is None: + target_max_block_size = self.data_context.target_max_block_size + return target_max_block_size + + def set_target_max_block_size(self, target_max_block_size: Optional[int]): + self._target_max_block_size = target_max_block_size + + def mark_execution_completed(self): + """Manually mark this operator has completed execution.""" + self._execution_completed = True + + def completed(self) -> bool: + """Return True when this operator is completed. + + An operator is completed the operator has stopped execution and all + outputs are taken. + """ + if not self._execution_completed: + if self._inputs_complete and self.num_active_tasks() == 0: + # If all inputs are complete and there are no active tasks, + # then the operator has completed execution. 
+ self._execution_completed = True + return self._execution_completed and not self.has_next() + + def get_stats(self) -> StatsDict: + """Return recorded execution stats for use with DatasetStats.""" + raise NotImplementedError + + @property + def metrics(self) -> OpRuntimeMetrics: + """Returns the runtime metrics of this operator.""" + self._metrics._extra_metrics = self._extra_metrics() + return self._metrics + + def _extra_metrics(self) -> Dict[str, Any]: + """Subclasses should override this method to report extra metrics + that are specific to them.""" + return {} + + def progress_str(self) -> str: + """Return any extra status to be displayed in the operator progress bar. + + For example, ` actors` to show current number of actors in an actor pool. + """ + return "" + + def num_outputs_total(self) -> Optional[int]: + """Returns the total number of output bundles of this operator, + or ``None`` if unable to provide a reasonable estimate (for example, + if no tasks have finished yet). + + The value returned may be an estimate based off the consumption so far. + This is useful for reporting progress. + + Subclasses should either override this method, or update + ``self._estimated_num_output_bundles`` appropriately. + """ + return self._estimated_num_output_bundles + + def num_output_rows_total(self) -> Optional[int]: + """Returns the total number of output rows of this operator, + or ``None`` if unable to provide a reasonable estimate (for example, + if no tasks have finished yet). + + The value returned may be an estimate based off the consumption so far. + This is useful for reporting progress. + + Subclasses should either override this method, or update + ``self._estimated_output_num_rows`` appropriately. + """ + return self._estimated_output_num_rows + + def start(self, options: ExecutionOptions) -> None: + """Called by the executor when execution starts for an operator. + + Args: + options: The global options used for the overall execution. 
+ """ + self._started = True + + def should_add_input(self) -> bool: + """Return whether it is desirable to add input to this operator right now. + + Operators can customize the implementation of this method to apply additional + backpressure (e.g., waiting for internal actors to be created). + """ + return True + + def add_input(self, refs: RefBundle, input_index: int) -> None: + """Called when an upstream result is available. + + Inputs may be added in any order, and calls to `add_input` may be interleaved + with calls to `get_next` / `has_next` to implement streaming execution. + + Subclasses should override `_add_input_inner` instead of this method. + + Args: + refs: The ref bundle that should be added as input. + input_index: The index identifying the input dependency producing the + input. For most operators, this is always `0` since there is only + one upstream input operator. + """ + self._metrics.on_input_received(refs) + self._add_input_inner(refs, input_index) + + def _add_input_inner(self, refs: RefBundle, input_index: int) -> None: + """Subclasses should override this method to implement `add_input`.""" + raise NotImplementedError + + def input_done(self, input_index: int) -> None: + """Called when the upstream operator at index `input_index` has completed(). + + After this is called, the executor guarantees that no more inputs will be added + via `add_input` for the given input index. + """ + pass + + def all_inputs_done(self) -> None: + """Called when all upstream operators have completed(). + + After this is called, the executor guarantees that no more inputs will be added + via `add_input` for any input index. + """ + self._inputs_complete = True + + def has_next(self) -> bool: + """Returns when a downstream output is available. + + When this returns true, it is safe to call `get_next()`. + """ + raise NotImplementedError + + def get_next(self) -> RefBundle: + """Get the next downstream output. 
+ + It is only allowed to call this if `has_next()` has returned True. + + Subclasses should override `_get_next_inner` instead of this method. + """ + output = self._get_next_inner() + self._metrics.on_output_taken(output) + return output + + def _get_next_inner(self) -> RefBundle: + """Subclasses should override this method to implement `get_next`.""" + raise NotImplementedError + + def get_active_tasks(self) -> List[OpTask]: + """Get a list of the active tasks of this operator. + + Subclasses should return *all* running normal/actor tasks. The + StreamingExecutor will wait on these tasks and trigger callbacks. + """ + return [] + + def num_active_tasks(self) -> int: + """Return the number of active tasks. + + This method is used for 2 purposes: + * Determine if this operator is completed. + * Displaying active task info in the progress bar. + Thus, the return value can be less than `len(get_active_tasks())`, + if some tasks are not needed for the above purposes. E.g., for the + actor pool map operator, readiness checking tasks can be excluded + from `num_active_tasks`, but they should be included in + `get_active_tasks`. + + Subclasses can override this as a performance optimization. + """ + return len(self.get_active_tasks()) + + def throttling_disabled(self) -> bool: + """Whether to disable resource throttling for this operator. + + This should return True for operators that only manipulate bundle metadata + (e.g., the OutputSplitter operator). This hints to the execution engine that + these operators should not be throttled based on resource usage. + """ + return False + + def internal_queue_size(self) -> int: + """If the operator has an internal input queue, return its size. + + This is used to report tasks pending submission to actor pools. + """ + return 0 + + def shutdown(self) -> None: + """Abort execution and release all resources used by this operator. 
+ + This release any Ray resources acquired by this operator such as active + tasks, actors, and objects. + """ + if not self._started: + raise ValueError("Operator must be started before being shutdown.") + + def current_processor_usage(self) -> ExecutionResources: + """Returns the current estimated CPU and GPU usage of this operator, excluding + object store memory. + + This method is called by the executor to decide how to allocate processors + between different operators. + """ + return ExecutionResources(0, 0, 0) + + def running_processor_usage(self) -> ExecutionResources: + """Returns the estimated running CPU and GPU usage of this operator, excluding + object store memory. + + This method is called by the resource manager and the streaming + executor to display the number of currently running CPUs and GPUs in the + progress bar. + + Note, this method returns `current_processor_usage() - + pending_processor_usage()` by default. Subclasses should only override + `pending_processor_usage()` if needed. + """ + usage = self.current_processor_usage() + usage = usage.subtract(self.pending_processor_usage()) + return usage + + def pending_processor_usage(self) -> ExecutionResources: + """Returns the estimated pending CPU and GPU usage of this operator, excluding + object store memory. + + This method is called by the resource manager and the streaming + executor to display the number of currently pending actors in the + progress bar. + """ + return ExecutionResources(0, 0, 0) + + def base_resource_usage(self) -> ExecutionResources: + """Returns the minimum amount of resources required for execution. + + For example, an operator that creates an actor pool requiring 8 GPUs could + return ExecutionResources(gpu=8) as its base usage. + """ + return ExecutionResources() + + def incremental_resource_usage(self) -> ExecutionResources: + """Returns the incremental resources required for processing another input. 
+ + For example, an operator that launches a task per input could return + ExecutionResources(cpu=1) as its incremental usage. + """ + return ExecutionResources() + + def notify_in_task_submission_backpressure(self, in_backpressure: bool) -> None: + """Called periodically from the executor to update internal in backpressure + status for stats collection purposes. + + Args: + in_backpressure: Value this operator's in_backpressure should be set to. + """ + # only update on change to in_backpressure + if self._in_task_submission_backpressure != in_backpressure: + self._metrics.on_toggle_task_submission_backpressure(in_backpressure) + self._in_task_submission_backpressure = in_backpressure + + def get_autoscaling_actor_pools(self) -> List[AutoscalingActorPool]: + """Return a list of `AutoscalingActorPool`s managed by this operator.""" + return [] + + def implements_accurate_memory_accounting(self) -> bool: + """Return whether this operator implements accurate memory accounting. + + An operator that implements accurate memory accounting should should properly + report its memory usage via the following APIs: + - `self._metrics.on_input_queued`. + - `self._metrics.on_input_dequeued`. + - `self._metrics.on_output_queued`. + - `self._metrics.on_output_dequeued`. + """ + # TODO(hchen): Currently we only enable `ReservationOpResourceAllocator` when + # all operators in the dataset have implemented accurate memory accounting. + # Eventually all operators should implement accurate memory accounting. + return False + + def supports_fusion(self) -> bool: + """Returns ```True``` if this operator can be fused with other operators.""" + return False + + def update_resource_usage(self) -> None: + """Updates resource usage of this operator at runtime. + + This method will be called at runtime in each StreamingExecutor iteration. + Subclasses can override it to account for dynamic resource usage updates due to + restarting actors, retrying tasks, lost objects, etc. 
+ """ + pass + + def actor_info_progress_str(self) -> str: + """Returns Actor progress strings for Alive, Restarting and Pending Actors. + + This method will be called in summary_str API in OpState. Subcallses can + override it to return Actor progress strings for Alive, Restarting and Pending + Actors. + """ + return "" diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/ref_bundle.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/ref_bundle.py new file mode 100644 index 0000000000000000000000000000000000000000..758b22215051e62035cc06dd594d29a32169253a --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/ref_bundle.py @@ -0,0 +1,136 @@ +from dataclasses import dataclass +from typing import Iterator, List, Optional, Tuple + +import ray +from .common import NodeIdStr +from ray.data._internal.memory_tracing import trace_deallocation +from ray.data.block import Block, BlockMetadata +from ray.data.context import DataContext +from ray.types import ObjectRef + + +@dataclass +class RefBundle: + """A group of data block references and their metadata. + + Operators take in and produce streams of RefBundles. + + Most commonly a RefBundle consists of a single block object reference. + In some cases, e.g., due to block splitting, or for a reduce task, there may + be more than one block. + + Block bundles have ownership semantics, i.e., shared ownership (similar to C++ + shared_ptr, multiple operators share the same block bundle), or unique ownership + (similar to C++ unique_ptr, only one operator owns the block bundle). This + allows operators to know whether they can destroy blocks when they don't need + them. Destroying blocks eagerly is more efficient than waiting for Python GC / + Ray reference counting to kick in. + """ + + # The size_bytes must be known in the metadata, num_rows is optional. 
+ blocks: Tuple[Tuple[ObjectRef[Block], BlockMetadata]] + + # Whether we own the blocks (can safely destroy them). + owns_blocks: bool + + # This attribute is used by the split() operator to assign bundles to logical + # output splits. It is otherwise None. + output_split_idx: Optional[int] = None + + # Cached location, used for get_cached_location(). + _cached_location: Optional[NodeIdStr] = None + + def __post_init__(self): + if not isinstance(self.blocks, tuple): + object.__setattr__(self, "blocks", tuple(self.blocks)) + for b in self.blocks: + assert isinstance(b, tuple), b + assert len(b) == 2, b + assert isinstance(b[0], ray.ObjectRef), b + assert isinstance(b[1], BlockMetadata), b + if b[1].size_bytes is None: + raise ValueError( + "The size in bytes of the block must be known: {}".format(b) + ) + + def __setattr__(self, key, value): + if hasattr(self, key) and key in ["blocks", "owns_blocks"]: + raise ValueError(f"The `{key}` field of RefBundle cannot be updated.") + object.__setattr__(self, key, value) + + @property + def block_refs(self) -> List[ObjectRef[Block]]: + """List of block references in this bundle.""" + return [block_ref for block_ref, _ in self.blocks] + + @property + def metadata(self) -> List[BlockMetadata]: + """List of block metadata in this bundle.""" + return [metadata for _, metadata in self.blocks] + + def num_rows(self) -> Optional[int]: + """Number of rows present in this bundle, if known.""" + total = 0 + for m in self.metadata: + if m.num_rows is None: + return None + else: + total += m.num_rows + return total + + def size_bytes(self) -> int: + """Size of the blocks of this bundle in bytes.""" + return sum(m.size_bytes for m in self.metadata) + + def destroy_if_owned(self) -> int: + """Clears the object store memory for these blocks if owned. + + Returns: + The number of bytes freed. 
+ """ + should_free = self.owns_blocks and DataContext.get_current().eager_free + for block_ref in self.block_refs: + trace_deallocation( + block_ref, "RefBundle.destroy_if_owned", free=should_free + ) + return self.size_bytes() if should_free else 0 + + def get_cached_location(self) -> Optional[NodeIdStr]: + """Return a location for this bundle's data, if possible. + + Caches the resolved location so multiple calls to this are efficient. + """ + if self._cached_location is None: + # Only consider the first block in the bundle for now. TODO(ekl) consider + # taking into account other blocks. + ref = self.block_refs[0] + # This call is pretty fast for owned objects (~5k/s), so we don't need to + # batch it for now. + locs = ray.experimental.get_object_locations([ref]) + nodes = locs[ref]["node_ids"] + if nodes: + self._cached_location = nodes[0] + else: + self._cached_location = "" + if self._cached_location: + return self._cached_location + else: + return None # Return None if cached location is "". 
+ + def __eq__(self, other) -> bool: + return self is other + + def __hash__(self) -> int: + return id(self) + + def __len__(self) -> int: + return len(self.blocks) + + +def _ref_bundles_iterator_to_block_refs_list( + ref_bundles: Iterator[RefBundle], +) -> List[ObjectRef[Block]]: + """Convert an iterator of RefBundles to a list of Block object references.""" + return [ + block_ref for ref_bundle in ref_bundles for block_ref in ref_bundle.block_refs + ] diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/task_context.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/task_context.py new file mode 100644 index 0000000000000000000000000000000000000000..094faf2440e01a1f92917f2d4ecb690fbb482257 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/task_context.py @@ -0,0 +1,44 @@ +from dataclasses import dataclass, field +from typing import TYPE_CHECKING, Any, Dict, Optional + +from ray.data._internal.progress_bar import ProgressBar + +if TYPE_CHECKING: + from ray.data._internal.execution.operators.map_transformer import MapTransformer + + +@dataclass +class TaskContext: + """This describes the information of a task running block transform.""" + + # The index of task. Each task has a unique task index within the same + # operator. + task_idx: int + + # The dictionary of sub progress bar to update. The key is name of sub progress + # bar. Note this is only used on driver side. + # TODO(chengsu): clean it up from TaskContext with new optimizer framework. + sub_progress_bar_dict: Optional[Dict[str, ProgressBar]] = None + + # NOTE(hchen): `upstream_map_transformer` and `upstream_map_ray_remote_args` + # are only used for `RandomShuffle`. DO NOT use them for other operators. + # Ideally, they should be handled by the optimizer, and should be transparent + # to the specific operators. + # But for `RandomShuffle`, the AllToAllOperator doesn't do the shuffle itself. 
+ # It uses `ExchangeTaskScheduler` to launch new tasks to do the shuffle. + # That's why we need to pass them to `ExchangeTaskScheduler`. + # TODO(hchen): Use a physical operator to do the shuffle directly. + + # The underlying function called in a MapOperator; this is used when fusing + # an AllToAllOperator with an upstream MapOperator. + upstream_map_transformer: Optional["MapTransformer"] = None + + # The Ray remote arguments of the fused upstream MapOperator. + # This should be set if upstream_map_transformer is set. + upstream_map_ray_remote_args: Optional[Dict[str, Any]] = None + + # The target maximum number of bytes to include in the task's output block. + target_max_block_size: Optional[int] = None + + # Additional keyword arguments passed to the task. + kwargs: Dict[str, Any] = field(default_factory=dict) diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/transform_fn.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/transform_fn.py new file mode 100644 index 0000000000000000000000000000000000000000..6a4e13d8a08cc2cb4db008b8128b60c90716cae6 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/transform_fn.py @@ -0,0 +1,10 @@ +from typing import Callable, List, Tuple + +from .ref_bundle import RefBundle +from .task_context import TaskContext +from ray.data._internal.stats import StatsDict + +# Block transform function applied in AllToAllOperator. 
+AllToAllTransformFn = Callable[ + [List[RefBundle], TaskContext], Tuple[List[RefBundle], StatsDict] +] diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/legacy_compat.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/legacy_compat.py new file mode 100644 index 0000000000000000000000000000000000000000..3ef7ca0d08979f8e3ca8735f6557d7afba5142e5 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/legacy_compat.py @@ -0,0 +1,181 @@ +"""This file contains temporary helper functions for legacy plan/executor interaction. + +It should be deleted once we fully move to the new executor backend. +""" + +from typing import Iterator, Optional, Tuple + +from ray.data._internal.block_list import BlockList +from ray.data._internal.execution.interfaces import ( + Executor, + PhysicalOperator, + RefBundle, +) +from ray.data._internal.execution.interfaces.executor import OutputIterator +from ray.data._internal.logical.optimizers import get_execution_plan +from ray.data._internal.logical.util import record_operators_usage +from ray.data._internal.plan import ExecutionPlan +from ray.data._internal.stats import DatasetStats +from ray.data._internal.util import unify_block_metadata_schema +from ray.data.block import BlockMetadata + +# Warn about tasks larger than this. +TASK_SIZE_WARN_THRESHOLD_BYTES = 100000 + + +def execute_to_legacy_bundle_iterator( + executor: Executor, + plan: ExecutionPlan, + dag_rewrite=None, +) -> Iterator[RefBundle]: + """Execute a plan with the new executor and return a bundle iterator. + + Args: + executor: The executor to use. + plan: The legacy plan to execute. + dag_rewrite: Callback that can be used to mutate the DAG prior to execution. + This is currently used as a legacy hack to inject the OutputSplit operator + for `Dataset.streaming_split()`. + + Returns: + The output as a bundle iterator. 
+ """ + dag, stats = _get_execution_dag( + executor, + plan, + preserve_order=False, + ) + if dag_rewrite: + dag = dag_rewrite(dag) + + bundle_iter = executor.execute(dag, initial_stats=stats) + + class CacheMetadataIterator(OutputIterator): + """Wrapper for `bundle_iterator` above. + + For a given iterator which yields output RefBundles, + collect the metadata from each output bundle, and yield the + original RefBundle. Only after the entire iterator is exhausted, + we cache the resulting metadata to the execution plan.""" + + def __init__(self, base_iterator: OutputIterator): + # Note: the base_iterator should be of type StreamIterator, + # defined within `StreamingExecutor.execute()`. It must + # support the `get_next()` method. + self._base_iterator = base_iterator + self._collected_metadata = BlockMetadata( + num_rows=0, + size_bytes=0, + schema=None, + input_files=None, + exec_stats=None, + ) + + def get_next(self, output_split_idx: Optional[int] = None) -> RefBundle: + try: + bundle = self._base_iterator.get_next(output_split_idx) + self._collect_metadata(bundle) + return bundle + except StopIteration: + # Once the iterator is completely exhausted, we are done + # collecting metadata. We can add this cached metadata to the plan. 
+ plan._snapshot_metadata = self._collected_metadata + raise + + def _collect_metadata(self, bundle: RefBundle) -> RefBundle: + """Collect the metadata from each output bundle and accumulate + results, so we can access important information, such as + row count, schema, etc., after iteration completes.""" + self._collected_metadata.num_rows += bundle.num_rows() + self._collected_metadata.size_bytes += bundle.size_bytes() + self._collected_metadata.schema = unify_block_metadata_schema( + [self._collected_metadata, *bundle.metadata] + ) + return bundle + + bundle_iter = CacheMetadataIterator(bundle_iter) + return bundle_iter + + +def execute_to_legacy_block_list( + executor: Executor, + plan: ExecutionPlan, + dataset_uuid: str, + preserve_order: bool, +) -> BlockList: + """Execute a plan with the new executor and translate it into a legacy block list. + + Args: + executor: The executor to use. + plan: The legacy plan to execute. + dataset_uuid: UUID of the dataset for this execution. + preserve_order: Whether to preserve order in execution. + + Returns: + The output as a legacy block list. + """ + dag, stats = _get_execution_dag( + executor, + plan, + preserve_order, + ) + bundles = executor.execute(dag, initial_stats=stats) + block_list = _bundles_to_block_list(bundles) + # Set the stats UUID after execution finishes. + _set_stats_uuid_recursive(executor.get_stats(), dataset_uuid) + return block_list + + +def _get_execution_dag( + executor: Executor, + plan: ExecutionPlan, + preserve_order: bool, +) -> Tuple[PhysicalOperator, DatasetStats]: + """Get the physical operators DAG from a plan.""" + # Record usage of logical operators if available. + if hasattr(plan, "_logical_plan") and plan._logical_plan is not None: + record_operators_usage(plan._logical_plan.dag) + + # Get DAG of physical operators and input statistics. 
+ dag = get_execution_plan(plan._logical_plan).dag + stats = _get_initial_stats_from_plan(plan) + + # Enforce to preserve ordering if the plan has operators + # required to do so, such as Zip and Sort. + if preserve_order or plan.require_preserve_order(): + executor._options.preserve_order = True + + return dag, stats + + +def _get_initial_stats_from_plan(plan: ExecutionPlan) -> DatasetStats: + if plan._snapshot_bundle is not None: + return plan._snapshot_stats + # For Datasets created from "read_xxx", `plan._in_stats` contains useless data. + # For Datasets created from "from_xxx", we need to use `plan._in_stats` as + # the initial stats. Because the `FromXxx` logical operators will be translated to + # "InputDataBuffer" physical operators, which will be ignored when generating + # stats, see `StreamingExecutor._generate_stats`. + # TODO(hchen): Unify the logic by saving the initial stats in `InputDataBuffer + if plan.has_lazy_input(): + return DatasetStats(metadata={}, parent=None) + else: + return plan._in_stats + + +def _bundles_to_block_list(bundles: Iterator[RefBundle]) -> BlockList: + blocks, metadata = [], [] + owns_blocks = True + for ref_bundle in bundles: + if not ref_bundle.owns_blocks: + owns_blocks = False + blocks.extend(ref_bundle.block_refs) + metadata.extend(ref_bundle.metadata) + return BlockList(blocks, metadata, owned_by_consumer=owns_blocks) + + +def _set_stats_uuid_recursive(stats: DatasetStats, dataset_uuid: str) -> None: + if not stats.dataset_uuid: + stats.dataset_uuid = dataset_uuid + for parent in stats.parents or []: + _set_stats_uuid_recursive(parent, dataset_uuid) diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__init__.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git 
a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..558d623705ebbc26715cd78ef3d3f1c4c1714631 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/actor_pool_map_operator.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/actor_pool_map_operator.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5fffcf8a3ad8fd23c15fcc06594527e62b932b17 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/actor_pool_map_operator.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/aggregate_num_rows.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/aggregate_num_rows.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f7d2c7919976c09ec84ca8ee48f8121222a72189 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/aggregate_num_rows.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/base_physical_operator.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/base_physical_operator.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5d30669a7129fd1d8d3791abe4942dfbb24cffc5 Binary files /dev/null and 
b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/base_physical_operator.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/input_data_buffer.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/input_data_buffer.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..298292165d941b16b07c7621bfa92676fcf0bd73 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/input_data_buffer.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/limit_operator.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/limit_operator.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6a512dcf0a22387c3319206c19c22bb5204fcfa8 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/limit_operator.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/map_operator.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/map_operator.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4dc5236fda229f1bf6423312ba18b99ef4a86f94 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/map_operator.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/map_transformer.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/map_transformer.cpython-311.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..8607ed8ce80000981ab0cf72fba95905717a3141 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/map_transformer.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/output_splitter.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/output_splitter.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8aebec64b074ec15ab5d052d4db8f0b5defbe9e2 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/output_splitter.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/task_pool_map_operator.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/task_pool_map_operator.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..93f5f21b75a0dd48e235f6b152e5060c2af02ac7 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/task_pool_map_operator.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/union_operator.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/union_operator.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..818cc09371bcfd3f36fba4b6e648fcf4fc832827 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/union_operator.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/zip_operator.cpython-311.pyc 
b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/zip_operator.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d37831a9d92bb64bfc4d0f265d60c413237fe46e Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/zip_operator.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/actor_pool_map_operator.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/actor_pool_map_operator.py new file mode 100644 index 0000000000000000000000000000000000000000..678ff6c0d5bbd4ddc5004c0f69cc88d3bc318815 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/actor_pool_map_operator.py @@ -0,0 +1,777 @@ +import logging +from dataclasses import dataclass +from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, Union + +import ray +from ray.actor import ActorHandle +from ray.core.generated import gcs_pb2 +from ray.data._internal.compute import ActorPoolStrategy +from ray.data._internal.execution.autoscaler import AutoscalingActorPool +from ray.data._internal.execution.bundle_queue import create_bundle_queue +from ray.data._internal.execution.interfaces import ( + ExecutionOptions, + ExecutionResources, + NodeIdStr, + PhysicalOperator, + RefBundle, + TaskContext, +) +from ray.data._internal.execution.operators.map_operator import MapOperator, _map_task +from ray.data._internal.execution.operators.map_transformer import MapTransformer +from ray.data._internal.execution.util import locality_string +from ray.data._internal.remote_fn import _add_system_error_to_retry_exceptions +from ray.data.block import Block, BlockMetadata +from ray.data.context import DataContext +from ray.types import ObjectRef + +logger = logging.getLogger(__name__) + +# Higher values here are better for prefetching and locality. 
It's ok for this to be +# fairly high since streaming backpressure prevents us from overloading actors. +DEFAULT_MAX_TASKS_IN_FLIGHT = 4 + + +class ActorPoolMapOperator(MapOperator): + """A MapOperator implementation that executes tasks on an actor pool. + + This class manages the state of a pool of actors used for task execution, as well + as dispatch of tasks to those actors. + + It operates in two modes. In bulk mode, tasks are queued internally and executed + when the operator has free actor slots. In streaming mode, the streaming executor + only adds input when `should_add_input() = True` (i.e., there are free slots). + This allows for better control of backpressure (e.g., suppose we go over memory + limits after adding put, then there isn't any way to "take back" the inputs prior + to actual execution). + """ + + def __init__( + self, + map_transformer: MapTransformer, + input_op: PhysicalOperator, + data_context: DataContext, + target_max_block_size: Optional[int], + compute_strategy: ActorPoolStrategy, + name: str = "ActorPoolMap", + min_rows_per_bundle: Optional[int] = None, + supports_fusion: bool = True, + ray_remote_args_fn: Optional[Callable[[], Dict[str, Any]]] = None, + ray_remote_args: Optional[Dict[str, Any]] = None, + ): + """Create an ActorPoolMapOperator instance. + + Args: + transform_fn: The function to apply to each ref bundle input. + init_fn: The callable class to instantiate on each actor. + input_op: Operator generating input data for this op. + compute_strategy: ComputeStrategy used for this operator. + name: The name of this operator. + target_max_block_size: The target maximum number of bytes to + include in an output block. + min_rows_per_bundle: The number of rows to gather per batch passed to the + transform_fn, or None to use the block size. Setting the batch size is + important for the performance of GPU-accelerated transform functions. + The actual rows passed may be less if the dataset is small. 
+ supports_fusion: Whether this operator supports fusion with other operators. + ray_remote_args_fn: A function that returns a dictionary of remote args + passed to each map worker. The purpose of this argument is to generate + dynamic arguments for each actor/task, and will be called each time + prior to initializing the worker. Args returned from this dict will + always override the args in ``ray_remote_args``. Note: this is an + advanced, experimental feature. + ray_remote_args: Customize the ray remote args for this op's tasks. + See :func:`ray.remote` for details. + """ + super().__init__( + map_transformer, + input_op, + data_context, + name, + target_max_block_size, + min_rows_per_bundle, + supports_fusion, + ray_remote_args_fn, + ray_remote_args, + ) + self._ray_actor_task_remote_args = {} + actor_task_errors = self.data_context.actor_task_retry_on_errors + if actor_task_errors: + self._ray_actor_task_remote_args["retry_exceptions"] = actor_task_errors + _add_system_error_to_retry_exceptions(self._ray_actor_task_remote_args) + data_context = self.data_context + if data_context._max_num_blocks_in_streaming_gen_buffer is not None: + # The `_generator_backpressure_num_objects` parameter should be + # `2 * _max_num_blocks_in_streaming_gen_buffer` because we yield + # 2 objects for each block: the block and the block metadata. + self._ray_actor_task_remote_args["_generator_backpressure_num_objects"] = ( + 2 * data_context._max_num_blocks_in_streaming_gen_buffer + ) + self._min_rows_per_bundle = min_rows_per_bundle + self._ray_remote_args_fn = ray_remote_args_fn + self._ray_remote_args = self._apply_default_remote_args( + self._ray_remote_args, data_context + ) + + per_actor_resource_usage = ExecutionResources( + cpu=self._ray_remote_args.get("num_cpus", 0), + gpu=self._ray_remote_args.get("num_gpus", 0), + ) + self._actor_pool = _ActorPool( + compute_strategy, self._start_actor, per_actor_resource_usage + ) + # A queue of bundles awaiting dispatch to actors. 
+ self._bundle_queue = create_bundle_queue() + # Cached actor class. + self._cls = None + # Whether no more submittable bundles will be added. + self._inputs_done = False + + def internal_queue_size(self) -> int: + return len(self._bundle_queue) + + def start(self, options: ExecutionOptions): + self._actor_locality_enabled = options.actor_locality_enabled + super().start(options) + + # Create the actor workers and add them to the pool. + self._cls = ray.remote(**self._ray_remote_args)(_MapWorker) + self._actor_pool.scale_up(self._actor_pool.min_size()) + refs = self._actor_pool.get_pending_actor_refs() + + # We synchronously wait for the initial number of actors to start. This avoids + # situations where the scheduler is unable to schedule downstream operators + # due to lack of available actors, causing an initial "pileup" of objects on + # upstream operators, leading to a spike in memory usage prior to steady state. + logger.debug(f"{self._name}: Waiting for {len(refs)} pool actors to start...") + try: + timeout = self.data_context.wait_for_min_actors_s + ray.get(refs, timeout=timeout) + except ray.exceptions.GetTimeoutError: + raise ray.exceptions.GetTimeoutError( + "Timed out while starting actors. " + "This may mean that the cluster does not have " + "enough resources for the requested actor pool." + ) + + def should_add_input(self) -> bool: + return self._actor_pool.num_free_slots() > 0 + + def _start_actor(self): + """Start a new actor and add it to the actor pool as a pending actor.""" + assert self._cls is not None + ctx = self.data_context + if self._ray_remote_args_fn: + self._refresh_actor_cls() + actor = self._cls.remote( + ctx, + src_fn_name=self.name, + map_transformer=self._map_transformer, + ) + res_ref = actor.get_location.remote() + + def _task_done_callback(res_ref): + # res_ref is a future for a now-ready actor; move actor from pending to the + # active actor pool. 
+ has_actor = self._actor_pool.pending_to_running(res_ref) + if not has_actor: + # Actor has already been killed. + return + # A new actor has started, we try to dispatch queued tasks. + self._dispatch_tasks() + + self._submit_metadata_task( + res_ref, + lambda: _task_done_callback(res_ref), + ) + return actor, res_ref + + def _add_bundled_input(self, bundle: RefBundle): + self._bundle_queue.add(bundle) + self._metrics.on_input_queued(bundle) + # Try to dispatch all bundles in the queue, including this new bundle. + self._dispatch_tasks() + + def _dispatch_tasks(self): + """Try to dispatch tasks from the bundle buffer to the actor pool. + + This is called when: + * a new input bundle is added, + * a task finishes, + * a new worker has been created. + """ + while self._bundle_queue: + # Pick an actor from the pool. + if self._actor_locality_enabled: + actor = self._actor_pool.pick_actor(self._bundle_queue.peek()) + else: + actor = self._actor_pool.pick_actor() + if actor is None: + # No actors available for executing the next task. + break + # Submit the map task. + bundle = self._bundle_queue.pop() + self._metrics.on_input_dequeued(bundle) + input_blocks = [block for block, _ in bundle.blocks] + ctx = TaskContext( + task_idx=self._next_data_task_idx, + target_max_block_size=self.actual_target_max_block_size, + ) + gen = actor.submit.options( + num_returns="streaming", + name=self.name, + **self._ray_actor_task_remote_args, + ).remote( + self.data_context, + ctx, + *input_blocks, + **self.get_map_task_kwargs(), + ) + + def _task_done_callback(actor_to_return): + # Return the actor that was running the task to the pool. + self._actor_pool.return_actor(actor_to_return) + # Dipsatch more tasks. 
+ self._dispatch_tasks() + + from functools import partial + + self._submit_data_task( + gen, bundle, partial(_task_done_callback, actor_to_return=actor) + ) + + def _refresh_actor_cls(self): + """When `self._ray_remote_args_fn` is specified, this method should + be called prior to initializing the new worker in order to get new + remote args passed to the worker. It updates `self.cls` with the same + `_MapWorker` class, but with the new remote args from + `self._ray_remote_args_fn`.""" + assert self._ray_remote_args_fn, "_ray_remote_args_fn must be provided" + remote_args = self._ray_remote_args.copy() + new_remote_args = self._ray_remote_args_fn() + + # Override args from user-defined remote args function. + new_and_overriden_remote_args = {} + for k, v in new_remote_args.items(): + remote_args[k] = v + new_and_overriden_remote_args[k] = v + self._cls = ray.remote(**remote_args)(_MapWorker) + return new_and_overriden_remote_args + + def all_inputs_done(self): + # Call base implementation to handle any leftover bundles. This may or may not + # trigger task dispatch. + super().all_inputs_done() + + # Mark inputs as done so future task dispatch will kill all inactive workers + # once the bundle queue is exhausted. + self._inputs_done = True + + def shutdown(self): + # We kill all actors in the pool on shutdown, even if they are busy doing work. + self._actor_pool.kill_all_actors() + super().shutdown() + + # Warn if the user specified a batch or block size that prevents full + # parallelization across the actor pool. We only know this information after + # execution has completed. + min_workers = self._actor_pool.min_size() + if len(self._output_metadata) < min_workers: + # The user created a stream that has too few blocks to begin with. + logger.warning( + "To ensure full parallelization across an actor pool of size " + f"{min_workers}, the Dataset should consist of at least " + f"{min_workers} distinct blocks. 
Consider increasing " + "the parallelism when creating the Dataset." + ) + + def progress_str(self) -> str: + if self._actor_locality_enabled: + return locality_string( + self._actor_pool._locality_hits, + self._actor_pool._locality_misses, + ) + return "[locality off]" + + def base_resource_usage(self) -> ExecutionResources: + min_workers = self._actor_pool.min_size() + return ExecutionResources( + cpu=self._ray_remote_args.get("num_cpus", 0) * min_workers, + gpu=self._ray_remote_args.get("num_gpus", 0) * min_workers, + ) + + def current_processor_usage(self) -> ExecutionResources: + # Both pending and running actors count towards our current resource usage. + num_active_workers = self._actor_pool.current_size() + return ExecutionResources( + cpu=self._ray_remote_args.get("num_cpus", 0) * num_active_workers, + gpu=self._ray_remote_args.get("num_gpus", 0) * num_active_workers, + ) + + def pending_processor_usage(self) -> ExecutionResources: + # Both pending and restarting actors count towards pending processor usage + num_pending_workers = ( + self._actor_pool.num_pending_actors() + + self._actor_pool.num_restarting_actors() + ) + return ExecutionResources( + cpu=self._ray_remote_args.get("num_cpus", 0) * num_pending_workers, + gpu=self._ray_remote_args.get("num_gpus", 0) * num_pending_workers, + ) + + def incremental_resource_usage(self) -> ExecutionResources: + # Submitting tasks to existing actors doesn't require additional + # CPU/GPU resources. 
+ return ExecutionResources( + cpu=0, + gpu=0, + object_store_memory=self._metrics.obj_store_mem_max_pending_output_per_task + or 0, + ) + + def _extra_metrics(self) -> Dict[str, Any]: + res = {} + if self._actor_locality_enabled: + res["locality_hits"] = self._actor_pool._locality_hits + res["locality_misses"] = self._actor_pool._locality_misses + res["pending_actors"] = self._actor_pool.num_pending_actors() + res["restarting_actors"] = self._actor_pool.num_restarting_actors() + return res + + @staticmethod + def _apply_default_remote_args( + ray_remote_args: Dict[str, Any], data_context: DataContext + ) -> Dict[str, Any]: + """Apply defaults to the actor creation remote args.""" + ray_remote_args = ray_remote_args.copy() + if "scheduling_strategy" not in ray_remote_args: + ray_remote_args["scheduling_strategy"] = data_context.scheduling_strategy + # Enable actor fault tolerance by default, with infinite actor recreations and + # up to N retries per task. The user can customize this in map_batches via + # extra kwargs (e.g., map_batches(..., max_restarts=0) to disable). + if "max_restarts" not in ray_remote_args: + ray_remote_args["max_restarts"] = -1 + if ( + "max_task_retries" not in ray_remote_args + and ray_remote_args.get("max_restarts") != 0 + ): + ray_remote_args["max_task_retries"] = -1 + return ray_remote_args + + def get_autoscaling_actor_pools(self) -> List[AutoscalingActorPool]: + return [self._actor_pool] + + def update_resource_usage(self) -> None: + """Updates resources usage.""" + for actor in self._actor_pool.get_running_actor_refs(): + actor_state = actor._get_local_state() + if actor_state is None: + # actor._get_local_state can return None if the state is Unknown + continue + elif actor_state != gcs_pb2.ActorTableData.ActorState.ALIVE: + # The actors can be either ALIVE or RESTARTING here because they will + # be restarted indefinitely until execution finishes. 
+ assert actor_state == gcs_pb2.ActorTableData.ActorState.RESTARTING + self._actor_pool.update_running_actor_state(actor, True) + else: + self._actor_pool.update_running_actor_state(actor, False) + + def actor_info_progress_str(self) -> str: + """Returns Actor progress strings for Alive, Restarting and Pending Actors.""" + return self._actor_pool.actor_info_progress_str() + + +class _MapWorker: + """An actor worker for MapOperator.""" + + def __init__( + self, + ctx: DataContext, + src_fn_name: str, + map_transformer: MapTransformer, + ): + DataContext._set_current(ctx) + self.src_fn_name: str = src_fn_name + self._map_transformer = map_transformer + # Initialize state for this actor. + self._map_transformer.init() + + def get_location(self) -> NodeIdStr: + return ray.get_runtime_context().get_node_id() + + def submit( + self, + data_context: DataContext, + ctx: TaskContext, + *blocks: Block, + **kwargs: Dict[str, Any], + ) -> Iterator[Union[Block, List[BlockMetadata]]]: + yield from _map_task( + self._map_transformer, + data_context, + ctx, + *blocks, + **kwargs, + ) + + def __repr__(self): + return f"MapWorker({self.src_fn_name})" + + +@dataclass +class _ActorState: + """Actor state""" + + # Number of tasks in flight per actor + num_tasks_in_flight: int + + # Node id of each ready actor + actor_location: str + + # Is Actor state restarting or alive + is_restarting: bool + + +class _ActorPool(AutoscalingActorPool): + """A pool of actors for map task execution. + + This class is in charge of tracking the number of in-flight tasks per actor, + providing the least heavily loaded actor to the operator, and killing idle + actors when the operator is done submitting work to the pool. 
+ """ + + def __init__( + self, + compute_strategy: ActorPoolStrategy, + create_actor_fn: Callable[[], Tuple[ActorHandle, ObjectRef[Any]]], + per_actor_resource_usage: ExecutionResources, + ): + self._min_size: int = compute_strategy.min_size + self._max_size: int = compute_strategy.max_size + self._max_tasks_in_flight: int = ( + compute_strategy.max_tasks_in_flight_per_actor + or DEFAULT_MAX_TASKS_IN_FLIGHT + ) + self._create_actor_fn = create_actor_fn + self._per_actor_resource_usage = per_actor_resource_usage + assert self._min_size >= 1 + assert self._max_size >= self._min_size + assert self._max_tasks_in_flight >= 1 + assert self._create_actor_fn is not None + + # Actors that have started running, including alive and restarting actors. + self._running_actors: Dict[ray.actor.ActorHandle, _ActorState] = {} + # Actors that are not yet ready (still pending creation). + self._pending_actors: Dict[ObjectRef, ray.actor.ActorHandle] = {} + # Whether actors that become idle should be eagerly killed. This is False until + # the first call to kill_idle_actors(). + self._should_kill_idle_actors = False + # Track locality matching stats. 
+ self._locality_hits: int = 0 + self._locality_misses: int = 0 + + # === Overriding methods of AutoscalingActorPool === + + def min_size(self) -> int: + return self._min_size + + def max_size(self) -> int: + return self._max_size + + def current_size(self) -> int: + return self.num_pending_actors() + self.num_running_actors() + + def num_running_actors(self) -> int: + return len(self._running_actors) + + def num_restarting_actors(self) -> int: + """Restarting actors are all the running actors not in ALIVE state.""" + return sum( + actor_state.is_restarting for actor_state in self._running_actors.values() + ) + + def num_active_actors(self) -> int: + """Active actors are all the running actors with inflight tasks.""" + return sum( + 1 if actor_state.num_tasks_in_flight > 0 else 0 + for actor_state in self._running_actors.values() + ) + + def num_alive_actors(self) -> int: + """Alive actors are all the running actors in ALIVE state.""" + return sum( + not actor_state.is_restarting + for actor_state in self._running_actors.values() + ) + + def num_pending_actors(self) -> int: + return len(self._pending_actors) + + def max_tasks_in_flight_per_actor(self) -> int: + return self._max_tasks_in_flight + + def current_in_flight_tasks(self) -> int: + return sum( + actor_state.num_tasks_in_flight + for actor_state in self._running_actors.values() + ) + + def scale_up(self, num_actors: int) -> int: + for _ in range(num_actors): + actor, ready_ref = self._create_actor_fn() + self.add_pending_actor(actor, ready_ref) + return num_actors + + def scale_down(self, num_actors: int) -> int: + num_killed = 0 + for _ in range(num_actors): + if self.kill_inactive_actor(): + num_killed += 1 + return num_killed + + # === End of overriding methods of AutoscalingActorPool === + + def update_running_actor_state( + self, actor: ray.actor.ActorHandle, is_restarting: bool + ): + """Update running actor state. + + Args: + actor: The running actor that needs state update. 
+ is_restarting: Whether running actor is restarting or alive. + """ + assert actor in self._running_actors + self._running_actors[actor].is_restarting = is_restarting + + def add_pending_actor(self, actor: ray.actor.ActorHandle, ready_ref: ray.ObjectRef): + """Adds a pending actor to the pool. + + This actor won't be pickable until it is marked as running via a + pending_to_running() call. + + Args: + actor: The not-yet-ready actor to add as pending to the pool. + ready_ref: The ready future for the actor. + """ + # The caller shouldn't add new actors to the pool after invoking + # kill_inactive_actors(). + assert not self._should_kill_idle_actors + self._pending_actors[ready_ref] = actor + + def pending_to_running(self, ready_ref: ray.ObjectRef) -> bool: + """Mark the actor corresponding to the provided ready future as running, making + the actor pickable. + + Args: + ready_ref: The ready future for the actor that we wish to mark as running. + + Returns: + Whether the actor was still pending. This can return False if the actor had + already been killed. + """ + if ready_ref not in self._pending_actors: + # The actor has been removed from the pool before becoming running. + return False + actor = self._pending_actors.pop(ready_ref) + self._running_actors[actor] = _ActorState( + num_tasks_in_flight=0, + actor_location=ray.get(ready_ref), + is_restarting=False, + ) + return True + + def pick_actor( + self, locality_hint: Optional[RefBundle] = None + ) -> Optional[ray.actor.ActorHandle]: + """Picks an actor for task submission based on busyness and locality. + + None will be returned if all actors are either at capacity (according to + max_tasks_in_flight) or are still pending. + + Args: + locality_hint: Try to pick an actor that is local for this bundle. + """ + if not self._running_actors: + # Actor pool is empty or all actors are still pending. 
+ return None + + if locality_hint: + preferred_loc = self._get_location(locality_hint) + else: + preferred_loc = None + + # Filter out actors that are invalid, i.e. actors with number of tasks in + # flight >= _max_tasks_in_flight or actor_state is not ALIVE. + valid_actors = [ + actor + for actor in self._running_actors + if self._running_actors[actor].num_tasks_in_flight + < self._max_tasks_in_flight + and not self._running_actors[actor].is_restarting + ] + + if not valid_actors: + # All actors are at capacity or actor state is not ALIVE. + return None + + def penalty_key(actor): + """Returns the key that should be minimized for the best actor. + + We prioritize actors with argument locality, and those that are not busy, + in that order. + """ + busyness = self._running_actors[actor].num_tasks_in_flight + requires_remote_fetch = ( + self._running_actors[actor].actor_location != preferred_loc + ) + return requires_remote_fetch, busyness + + # Pick the best valid actor based on the penalty key + actor = min(valid_actors, key=penalty_key) + + if locality_hint: + if self._running_actors[actor].actor_location == preferred_loc: + self._locality_hits += 1 + else: + self._locality_misses += 1 + self._running_actors[actor].num_tasks_in_flight += 1 + return actor + + def return_actor(self, actor: ray.actor.ActorHandle): + """Returns the provided actor to the pool.""" + assert actor in self._running_actors + assert self._running_actors[actor].num_tasks_in_flight > 0 + self._running_actors[actor].num_tasks_in_flight -= 1 + if ( + self._should_kill_idle_actors + and self._running_actors[actor].num_tasks_in_flight == 0 + ): + self._remove_actor(actor) + + def get_pending_actor_refs(self) -> List[ray.ObjectRef]: + return list(self._pending_actors.keys()) + + def get_running_actor_refs(self) -> List[ray.ObjectRef]: + return list(self._running_actors.keys()) + + def num_idle_actors(self) -> int: + """Return the number of idle actors in the pool.""" + return sum( + 1 if 
running_actor.num_tasks_in_flight == 0 else 0 + for running_actor in self._running_actors.values() + ) + + def num_free_slots(self) -> int: + """Return the number of free slots for task execution.""" + if not self._running_actors: + return 0 + return sum( + max(0, self._max_tasks_in_flight - running_actor.num_tasks_in_flight) + for running_actor in self._running_actors.values() + ) + + def kill_inactive_actor(self) -> bool: + """Kills a single pending or idle actor, if any actors are pending/idle. + + Returns whether an inactive actor was actually killed. + """ + # We prioritize killing pending actors over idle actors to reduce actor starting + # churn. + killed = self._maybe_kill_pending_actor() + if not killed: + # If no pending actor was killed, so kill actor. + killed = self._maybe_kill_idle_actor() + return killed + + def _maybe_kill_pending_actor(self) -> bool: + if self._pending_actors: + # At least one pending actor, so kill first one. + ready_ref = next(iter(self._pending_actors.keys())) + self._remove_actor(self._pending_actors[ready_ref]) + del self._pending_actors[ready_ref] + return True + # No pending actors, so indicate to the caller that no actors were killed. + return False + + def _maybe_kill_idle_actor(self) -> bool: + for actor, running_actor in self._running_actors.items(): + if running_actor.num_tasks_in_flight == 0: + # At least one idle actor, so kill first one found. + self._remove_actor(actor) + return True + # No idle actors, so indicate to the caller that no actors were killed. + return False + + def kill_all_inactive_actors(self): + """Kills all currently inactive actors and ensures that all actors that become + idle in the future will be eagerly killed. + + This is called once the operator is done submitting work to the pool, and this + function is idempotent. Adding new pending actors after calling this function + will raise an error. 
+ """ + self._kill_all_pending_actors() + self._kill_all_idle_actors() + + def kill_all_actors(self): + """Kills all actors, including running/active actors. + + This is called once the operator is shutting down. + """ + self._kill_all_pending_actors() + self._kill_all_running_actors() + + def _kill_all_pending_actors(self): + for _, actor in self._pending_actors.items(): + self._remove_actor(actor) + self._pending_actors.clear() + + def _kill_all_idle_actors(self): + idle_actors = [ + actor + for actor, running_actor in self._running_actors.items() + if running_actor.num_tasks_in_flight == 0 + ] + for actor in idle_actors: + self._remove_actor(actor) + self._should_kill_idle_actors = True + + def _kill_all_running_actors(self): + actors = list(self._running_actors.keys()) + for actor in actors: + self._remove_actor(actor) + + def _remove_actor(self, actor: ray.actor.ActorHandle): + """Remove the given actor from the pool.""" + # NOTE: we remove references to the actor and let ref counting + # garbage collect the actor, instead of using ray.kill. + # Because otherwise the actor cannot be restarted upon lineage reconstruction. + if actor in self._running_actors: + del self._running_actors[actor] + + def _get_location(self, bundle: RefBundle) -> Optional[NodeIdStr]: + """Ask Ray for the node id of the given bundle. + + This method may be overriden for testing. + + Returns: + A node id associated with the bundle, or None if unknown. 
+ """ + return bundle.get_cached_location() + + def actor_info_progress_str(self) -> str: + """Returns Actor progress strings for Alive, Restarting and Pending Actors.""" + alive = self.num_alive_actors() + pending = self.num_pending_actors() + restarting = self.num_restarting_actors() + total = alive + pending + restarting + if total == alive: + return f"; Actors: {total}" + else: + return ( + f"; Actors: {total} (alive {alive}, restarting {restarting}, " + f"pending {pending})" + ) + + def per_actor_resource_usage(self) -> ExecutionResources: + """Per actor resource usage.""" + return self._per_actor_resource_usage diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/aggregate_num_rows.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/aggregate_num_rows.py new file mode 100644 index 0000000000000000000000000000000000000000..59b64d1a40208b0a66de8d8af762703a113f5ebc --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/aggregate_num_rows.py @@ -0,0 +1,62 @@ +import ray +from ray.data._internal.delegating_block_builder import DelegatingBlockBuilder +from ray.data._internal.execution.interfaces import PhysicalOperator, RefBundle +from ray.data._internal.stats import StatsDict +from ray.data.block import BlockAccessor +from ray.data.context import DataContext + + +class AggregateNumRows(PhysicalOperator): + """Count number of rows in input bundles. + + This operator aggregates the number of rows in input bundles using the bundles' + block metadata. It outputs a single row with the specified column name. 
+ """ + + def __init__( + self, + input_dependencies, + data_context: DataContext, + column_name: str, + ): + super().__init__( + "AggregateNumRows", + input_dependencies, + data_context, + target_max_block_size=None, + ) + + self._column_name = column_name + + self._num_rows = 0 + self._has_outputted = False + + def has_next(self) -> bool: + return self._inputs_complete and not self._has_outputted + + def _get_next_inner(self) -> RefBundle: + assert self._inputs_complete + + builder = DelegatingBlockBuilder() + builder.add({self._column_name: self._num_rows}) + block = builder.build() + block_ref = ray.put(block) + + metadata = BlockAccessor.for_block(block).get_metadata() + bundle = RefBundle([(block_ref, metadata)], owns_blocks=True) + + self._has_outputted = True + return bundle + + def get_stats(self) -> StatsDict: + return {} + + def _add_input_inner(self, refs, input_index) -> None: + assert refs.num_rows() is not None + self._num_rows += refs.num_rows() + + def throttling_disabled(self) -> bool: + return True + + def implements_accurate_memory_accounting(self) -> bool: + return True diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/base_physical_operator.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/base_physical_operator.py new file mode 100644 index 0000000000000000000000000000000000000000..1ad9b069e87be3fe2acd9b47cbf245dd9f44db0d --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/base_physical_operator.py @@ -0,0 +1,176 @@ +from typing import List, Optional + +from ray.data._internal.execution.interfaces import ( + AllToAllTransformFn, + PhysicalOperator, + RefBundle, + TaskContext, +) +from ray.data._internal.logical.interfaces import LogicalOperator +from ray.data._internal.progress_bar import ProgressBar +from ray.data._internal.stats import StatsDict +from ray.data.context import DataContext + + +class OneToOneOperator(PhysicalOperator): + 
"""An operator that has one input and one output dependency. + + This operator serves as the base for map, filter, limit, etc. + """ + + def __init__( + self, + name: str, + input_op: PhysicalOperator, + data_context: DataContext, + target_max_block_size: Optional[int], + ): + """Create a OneToOneOperator. + Args: + input_op: Operator generating input data for this op. + name: The name of this operator. + target_max_block_size: The target maximum number of bytes to + include in an output block. + """ + super().__init__(name, [input_op], data_context, target_max_block_size) + + @property + def input_dependency(self) -> PhysicalOperator: + return self.input_dependencies[0] + + +class AllToAllOperator(PhysicalOperator): + """A blocking operator that executes once its inputs are complete. + + This operator implements distributed sort / shuffle operations, etc. + """ + + def __init__( + self, + bulk_fn: AllToAllTransformFn, + input_op: PhysicalOperator, + data_context: DataContext, + target_max_block_size: Optional[int], + num_outputs: Optional[int] = None, + sub_progress_bar_names: Optional[List[str]] = None, + name: str = "AllToAll", + ): + """Create an AllToAllOperator. + Args: + bulk_fn: The blocking transformation function to run. The inputs are the + list of input ref bundles, and the outputs are the output ref bundles + and a stats dict. + input_op: Operator generating input data for this op. + num_outputs: The number of expected output bundles for progress bar. + sub_progress_bar_names: The names of internal sub progress bars. + name: The name of this operator. 
+ """ + self._bulk_fn = bulk_fn + self._next_task_index = 0 + self._num_outputs = num_outputs + self._output_rows = 0 + self._sub_progress_bar_names = sub_progress_bar_names + self._sub_progress_bar_dict = None + self._input_buffer: List[RefBundle] = [] + self._output_buffer: List[RefBundle] = [] + self._stats: StatsDict = {} + super().__init__(name, [input_op], data_context, target_max_block_size) + + def num_outputs_total(self) -> Optional[int]: + return ( + self._num_outputs + if self._num_outputs + else self.input_dependencies[0].num_outputs_total() + ) + + def num_output_rows_total(self) -> Optional[int]: + return ( + self._output_rows + if self._output_rows + else self.input_dependencies[0].num_output_rows_total() + ) + + def _add_input_inner(self, refs: RefBundle, input_index: int) -> None: + assert not self.completed() + assert input_index == 0, input_index + self._input_buffer.append(refs) + + def all_inputs_done(self) -> None: + ctx = TaskContext( + task_idx=self._next_task_index, + sub_progress_bar_dict=self._sub_progress_bar_dict, + target_max_block_size=self.actual_target_max_block_size, + ) + self._output_buffer, self._stats = self._bulk_fn(self._input_buffer, ctx) + self._next_task_index += 1 + self._input_buffer.clear() + super().all_inputs_done() + + def has_next(self) -> bool: + return len(self._output_buffer) > 0 + + def _get_next_inner(self) -> RefBundle: + bundle = self._output_buffer.pop(0) + self._output_rows += bundle.num_rows() + return bundle + + def get_stats(self) -> StatsDict: + return self._stats + + def get_transformation_fn(self) -> AllToAllTransformFn: + return self._bulk_fn + + def progress_str(self) -> str: + return f"{self.num_output_rows_total() or 0} rows output" + + def initialize_sub_progress_bars(self, position: int) -> int: + """Initialize all internal sub progress bars, and return the number of bars.""" + if self._sub_progress_bar_names is not None: + self._sub_progress_bar_dict = {} + for name in 
self._sub_progress_bar_names: + bar = ProgressBar( + name, + self.num_output_rows_total() or 1, + unit="row", + position=position, + ) + # NOTE: call `set_description` to trigger the initial print of progress + # bar on console. + bar.set_description(f" *- {name}") + self._sub_progress_bar_dict[name] = bar + position += 1 + return len(self._sub_progress_bar_dict) + else: + return 0 + + def close_sub_progress_bars(self): + """Close all internal sub progress bars.""" + if self._sub_progress_bar_dict is not None: + for sub_bar in self._sub_progress_bar_dict.values(): + sub_bar.close() + + def supports_fusion(self): + return True + + +class NAryOperator(PhysicalOperator): + """An operator that has multiple input dependencies and one output. + + This operator serves as the base for union, zip, etc. + """ + + def __init__( + self, + data_context: DataContext, + *input_ops: LogicalOperator, + ): + """Create a OneToOneOperator. + Args: + input_op: Operator generating input data for this op. + name: The name of this operator. 
+ """ + input_names = ", ".join([op._name for op in input_ops]) + op_name = f"{self.__class__.__name__}({input_names})" + super().__init__( + op_name, list(input_ops), data_context, target_max_block_size=None + ) diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/input_data_buffer.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/input_data_buffer.py new file mode 100644 index 0000000000000000000000000000000000000000..195711e35cf1abbc720c8ca8923452a3c86d8a1d --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/input_data_buffer.py @@ -0,0 +1,89 @@ +from typing import Callable, List, Optional + +from ray.data._internal.execution.interfaces import ( + ExecutionOptions, + PhysicalOperator, + RefBundle, +) +from ray.data._internal.stats import StatsDict +from ray.data.context import DataContext + + +class InputDataBuffer(PhysicalOperator): + """Defines the input data for the operator DAG. + + For example, this may hold cached blocks from a previous Dataset execution, or + the arguments for read tasks. + """ + + def __init__( + self, + data_context: DataContext, + input_data: Optional[List[RefBundle]] = None, + input_data_factory: Optional[Callable[[int], List[RefBundle]]] = None, + num_output_blocks: Optional[int] = None, + ): + """Create an InputDataBuffer. + + Args: + input_data: The list of bundles to output from this operator. + input_data_factory: The factory to get input data, if input_data is None. + num_output_blocks: The number of output blocks. If not specified, progress + bars total will be set based on num output bundles instead. + """ + super().__init__("Input", [], data_context, target_max_block_size=None) + if input_data is not None: + assert input_data_factory is None + # Copy the input data to avoid mutating the original list. 
+ self._input_data = input_data[:] + self._is_input_initialized = True + self._initialize_metadata() + else: + # Initialize input lazily when execution is started. + assert input_data_factory is not None + self._input_data_factory = input_data_factory + self._is_input_initialized = False + self._input_data_index = 0 + + def start(self, options: ExecutionOptions) -> None: + if not self._is_input_initialized: + self._input_data = self._input_data_factory( + self.actual_target_max_block_size + ) + self._is_input_initialized = True + self._initialize_metadata() + # InputDataBuffer does not take inputs from other operators, + # so we record input metrics here + for bundle in self._input_data: + self._metrics.on_input_received(bundle) + super().start(options) + + def has_next(self) -> bool: + return self._input_data_index < len(self._input_data) + + def _get_next_inner(self) -> RefBundle: + # We can't pop the input data. If we do, Ray might garbage collect the block + # references, and Ray won't be able to reconstruct downstream objects. 
+ bundle = self._input_data[self._input_data_index] + self._input_data_index += 1 + return bundle + + def get_stats(self) -> StatsDict: + return {} + + def _add_input_inner(self, refs, input_index) -> None: + raise ValueError("Inputs are not allowed for this operator.") + + def _initialize_metadata(self): + assert self._input_data is not None and self._is_input_initialized + self._estimated_num_output_bundles = len(self._input_data) + + block_metadata = [] + for bundle in self._input_data: + block_metadata.extend(bundle.metadata) + self._stats = { + "input": block_metadata, + } + + def implements_accurate_memory_accounting(self) -> bool: + return True diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/limit_operator.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/limit_operator.py new file mode 100644 index 0000000000000000000000000000000000000000..47e93ae3f22c40966507490eeaaedddd466ec2b8 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/limit_operator.py @@ -0,0 +1,133 @@ +import copy +from collections import deque +from typing import Deque, List, Optional, Tuple + +import ray +from ray.data._internal.execution.interfaces import PhysicalOperator, RefBundle +from ray.data._internal.execution.operators.base_physical_operator import ( + OneToOneOperator, +) +from ray.data._internal.remote_fn import cached_remote_fn +from ray.data._internal.stats import StatsDict +from ray.data.block import Block, BlockAccessor, BlockMetadata +from ray.data.context import DataContext +from ray.types import ObjectRef + + +class LimitOperator(OneToOneOperator): + """Physical operator for limit.""" + + def __init__( + self, + limit: int, + input_op: PhysicalOperator, + data_context: DataContext, + ): + self._limit = limit + self._consumed_rows = 0 + self._buffer: Deque[RefBundle] = deque() + self._name = f"limit={limit}" + self._output_metadata: List[BlockMetadata] = [] + 
self._cur_output_bundles = 0 + super().__init__(self._name, input_op, data_context, target_max_block_size=None) + if self._limit <= 0: + self.mark_execution_completed() + + def _limit_reached(self) -> bool: + return self._consumed_rows >= self._limit + + def _add_input_inner(self, refs: RefBundle, input_index: int) -> None: + assert not self.completed() + assert input_index == 0, input_index + if self._limit_reached(): + return + out_blocks: List[ObjectRef[Block]] = [] + out_metadata: List[BlockMetadata] = [] + for block, metadata in refs.blocks: + num_rows = metadata.num_rows + assert num_rows is not None + if self._consumed_rows + num_rows <= self._limit: + out_blocks.append(block) + out_metadata.append(metadata) + self._output_metadata.append(metadata) + self._consumed_rows += num_rows + else: + # Slice the last block. + def slice_fn(block, metadata, num_rows) -> Tuple[Block, BlockMetadata]: + block = BlockAccessor.for_block(block).slice(0, num_rows, copy=True) + metadata = copy.deepcopy(metadata) + metadata.num_rows = num_rows + metadata.size_bytes = BlockAccessor.for_block(block).size_bytes() + return block, metadata + + block, metadata_ref = cached_remote_fn( + slice_fn, num_cpus=0, num_returns=2 + ).remote( + block, + metadata, + self._limit - self._consumed_rows, + ) + out_blocks.append(block) + metadata = ray.get(metadata_ref) + out_metadata.append(metadata) + self._output_metadata.append(metadata) + self._consumed_rows = self._limit + break + self._cur_output_bundles += 1 + out_refs = RefBundle( + list(zip(out_blocks, out_metadata)), + owns_blocks=refs.owns_blocks, + ) + self._buffer.append(out_refs) + self._metrics.on_output_queued(out_refs) + if self._limit_reached(): + self.mark_execution_completed() + + # We cannot estimate if we have only consumed empty blocks, + # or if the input dependency's total number of output bundles is unknown. 
+ num_inputs = self.input_dependencies[0].num_outputs_total() + if self._consumed_rows > 0 and num_inputs is not None: + # Estimate number of output bundles + # Check the case where _limit > # of input rows + estimated_total_output_rows = min( + self._limit, self._consumed_rows / self._cur_output_bundles * num_inputs + ) + # _consumed_rows / _limit is roughly equal to + # _cur_output_bundles / total output blocks + self._estimated_num_output_bundles = round( + estimated_total_output_rows + / self._consumed_rows + * self._cur_output_bundles + ) + + def has_next(self) -> bool: + return len(self._buffer) > 0 + + def _get_next_inner(self) -> RefBundle: + output = self._buffer.popleft() + self._metrics.on_output_dequeued(output) + return output + + def get_stats(self) -> StatsDict: + return {self._name: self._output_metadata} + + def num_outputs_total(self) -> Optional[int]: + # Before execution is completed, we don't know how many output + # bundles we will have. We estimate based off the consumption so far. 
+ if self._execution_completed: + return self._cur_output_bundles + return self._estimated_num_output_bundles + + def num_output_rows_total(self) -> Optional[int]: + # The total number of rows is simply the limit or the number + # of input rows, whichever is smaller + input_num_rows = self.input_dependencies[0].num_output_rows_total() + if input_num_rows is None: + return None + return min(self._limit, input_num_rows) + + def throttling_disabled(self) -> bool: + return True + + def implements_accurate_memory_accounting(self) -> bool: + return True diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/map_operator.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/map_operator.py new file mode 100644 index 0000000000000000000000000000000000000000..49169ca750ca7db9e84952b283fe738b1a53e8e1 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/map_operator.py @@ -0,0 +1,711 @@ +import copy +import functools +import itertools +import logging +from abc import ABC, abstractmethod +from collections import defaultdict, deque +from typing import ( + Any, + Callable, + Deque, + Dict, + Iterator, + List, + Optional, + Set, + Tuple, + Union, +) + +import ray +from ray import ObjectRef +from ray._raylet import ObjectRefGenerator +from ray.data._internal.compute import ( + ActorPoolStrategy, + ComputeStrategy, + TaskPoolStrategy, +) +from ray.data._internal.execution.interfaces import ( + ExecutionOptions, + ExecutionResources, + PhysicalOperator, + RefBundle, + TaskContext, +) +from ray.data._internal.execution.interfaces.physical_operator import ( + DataOpTask, + MetadataOpTask, + OpTask, +) +from ray.data._internal.execution.operators.base_physical_operator import ( + OneToOneOperator, +) +from ray.data._internal.execution.operators.map_transformer import ( + ApplyAdditionalSplitToOutputBlocks, + MapTransformer, +) +from ray.data._internal.stats import StatsDict +from 
ray.data.block import Block, BlockAccessor, BlockExecStats, BlockMetadata +from ray.data.context import DataContext +from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy + +logger = logging.getLogger(__name__) + + +class MapOperator(OneToOneOperator, ABC): + """A streaming operator that maps input bundles 1:1 to output bundles. + + This operator implements the distributed map operation, supporting both task + and actor compute strategies. + """ + + def __init__( + self, + map_transformer: MapTransformer, + input_op: PhysicalOperator, + data_context: DataContext, + name: str, + target_max_block_size: Optional[int], + min_rows_per_bundle: Optional[int], + supports_fusion: bool, + ray_remote_args_fn: Optional[Callable[[], Dict[str, Any]]], + ray_remote_args: Optional[Dict[str, Any]], + ): + # NOTE: This constructor should not be called directly; use MapOperator.create() + # instead. + # NOTE: This constructor must be called by subclasses. + + self._map_transformer = map_transformer + self._supports_fusion = supports_fusion + self._ray_remote_args = _canonicalize_ray_remote_args(ray_remote_args or {}) + self._ray_remote_args_fn = ray_remote_args_fn + self._ray_remote_args_factory_actor_locality = None + self._remote_args_for_metrics = copy.deepcopy(self._ray_remote_args) + + # Bundles block references up to the min_rows_per_bundle target. + self._block_ref_bundler = _BlockRefBundler(min_rows_per_bundle) + + # Queue for task outputs, either ordered or unordered (this is set by start()). + self._output_queue: _OutputQueue = None + # Output metadata, added to on get_next(). + self._output_metadata: List[BlockMetadata] = [] + # All active `DataOpTask`s. + self._data_tasks: Dict[int, DataOpTask] = {} + self._next_data_task_idx = 0 + # All active `MetadataOpTask`s. + self._metadata_tasks: Dict[int, MetadataOpTask] = {} + self._next_metadata_task_idx = 0 + # Keep track of all finished streaming generators. 
+ super().__init__(name, input_op, data_context, target_max_block_size) + + # If set, then all output blocks will be split into + # this many sub-blocks. This is to avoid having + # too-large blocks, which may reduce parallelism for + # the subsequent operator. + self._additional_split_factor = None + # Callback functions that generate additional task kwargs + # for the map task. + self._map_task_kwargs_fns: List[Callable[[], Dict[str, Any]]] = [] + + def add_map_task_kwargs_fn(self, map_task_kwargs_fn: Callable[[], Dict[str, Any]]): + """Add a callback function that generates additional kwargs for the map tasks. + In the map tasks, the kwargs can be accessible via `TaskContext.kwargs`. + """ + self._map_task_kwargs_fns.append(map_task_kwargs_fn) + + def get_map_task_kwargs(self) -> Dict[str, Any]: + """Get the kwargs for the map task. + Subclasses should pass the returned kwargs to the map tasks. + In the map tasks, the kwargs can be accessible via `TaskContext.kwargs`. + """ + kwargs = {} + for fn in self._map_task_kwargs_fns: + kwargs.update(fn()) + return kwargs + + def get_additional_split_factor(self) -> int: + if self._additional_split_factor is None: + return 1 + return self._additional_split_factor + + def set_additional_split_factor(self, k: int): + self._additional_split_factor = k + + @property + def name(self) -> str: + name = super().name + if self._additional_split_factor is not None: + name += f"->SplitBlocks({self._additional_split_factor})" + return name + + @classmethod + def create( + cls, + map_transformer: MapTransformer, + input_op: PhysicalOperator, + data_context: DataContext, + target_max_block_size: Optional[int] = None, + name: str = "Map", + # TODO(ekl): slim down ComputeStrategy to only specify the compute + # config and not contain implementation code. 
+ compute_strategy: Optional[ComputeStrategy] = None, + min_rows_per_bundle: Optional[int] = None, + supports_fusion: bool = True, + ray_remote_args_fn: Optional[Callable[[], Dict[str, Any]]] = None, + ray_remote_args: Optional[Dict[str, Any]] = None, + ) -> "MapOperator": + """Create a MapOperator. + + This factory creates the MapOperator pool implementation that corresponds to the + compute argument: + - If None or TaskPoolStrategy -> TaskPoolMapOperator + - If ActorPoolStrategy -> ActorPoolMapOperator + + Args: + transform_fn: The function to apply to each ref bundle input. + input_op: Operator generating input data for this op. + init_fn: The callable class to instantiate if using ActorPoolMapOperator. + name: The name of this operator. + compute_strategy: Customize the compute strategy for this op. + target_max_block_size: The target maximum number of bytes to + include in an output block. + min_rows_per_bundle: The number of rows to gather per batch passed to the + transform_fn, or None to use the block size. Setting the batch size is + important for the performance of GPU-accelerated transform functions. + The actual rows passed may be less if the dataset is small. + supports_fusion: Whether this operator supports fusion with other operators. + ray_remote_args_fn: A function that returns a dictionary of remote args + passed to each map worker. The purpose of this argument is to generate + dynamic arguments for each actor/task, and will be called each time + prior to initializing the worker. Args returned from this dict will + always override the args in ``ray_remote_args``. Note: this is an + advanced, experimental feature. + ray_remote_args: Customize the :func:`ray.remote` args for this op's tasks. 
+ """ + if compute_strategy is None: + compute_strategy = TaskPoolStrategy() + + if isinstance(compute_strategy, TaskPoolStrategy): + from ray.data._internal.execution.operators.task_pool_map_operator import ( + TaskPoolMapOperator, + ) + + return TaskPoolMapOperator( + map_transformer, + input_op, + data_context, + name=name, + target_max_block_size=target_max_block_size, + min_rows_per_bundle=min_rows_per_bundle, + concurrency=compute_strategy.size, + supports_fusion=supports_fusion, + ray_remote_args_fn=ray_remote_args_fn, + ray_remote_args=ray_remote_args, + ) + elif isinstance(compute_strategy, ActorPoolStrategy): + from ray.data._internal.execution.operators.actor_pool_map_operator import ( + ActorPoolMapOperator, + ) + + return ActorPoolMapOperator( + map_transformer, + input_op, + data_context, + target_max_block_size=target_max_block_size, + compute_strategy=compute_strategy, + name=name, + min_rows_per_bundle=min_rows_per_bundle, + supports_fusion=supports_fusion, + ray_remote_args_fn=ray_remote_args_fn, + ray_remote_args=ray_remote_args, + ) + else: + raise ValueError(f"Unsupported execution strategy {compute_strategy}") + + def start(self, options: "ExecutionOptions"): + super().start(options) + # Create output queue with desired ordering semantics. 
+ if options.preserve_order: + self._output_queue = _OrderedOutputQueue() + else: + self._output_queue = _UnorderedOutputQueue() + + if options.locality_with_output: + if isinstance(options.locality_with_output, list): + locs = options.locality_with_output + else: + locs = [ray.get_runtime_context().get_node_id()] + + class RoundRobinAssign: + def __init__(self, locs): + self.locs = locs + self.i = 0 + + def __call__(self, args): + args = copy.deepcopy(args) + args["scheduling_strategy"] = NodeAffinitySchedulingStrategy( + self.locs[self.i], + soft=True, + _spill_on_unavailable=True, + ) + self.i += 1 + self.i %= len(self.locs) + return args + + self._ray_remote_args_factory_actor_locality = RoundRobinAssign(locs) + + map_transformer = self._map_transformer + # Apply additional block split if needed. + if self.get_additional_split_factor() > 1: + split_transformer = MapTransformer( + [ApplyAdditionalSplitToOutputBlocks(self.get_additional_split_factor())] + ) + map_transformer = map_transformer.fuse(split_transformer) + # Put the function def in the object store to avoid repeated serialization + # in case it's large (i.e., closure captures large objects). + self._map_transformer_ref = ray.put(map_transformer) + + def _add_input_inner(self, refs: RefBundle, input_index: int): + assert input_index == 0, input_index + + # Add RefBundle to the bundler. + self._block_ref_bundler.add_bundle(refs) + self._metrics.on_input_queued(refs) + + if self._block_ref_bundler.has_bundle(): + # The ref bundler combines one or more RefBundles into a new larger + # RefBundle. Rather than dequeuing the new RefBundle, which was never + # enqueued in the first place, we dequeue the original RefBundles. 
+ input_refs, bundled_input = self._block_ref_bundler.get_next_bundle() + for bundle in input_refs: + self._metrics.on_input_dequeued(bundle) + + # If the bundler has a full bundle, add it to the operator's task submission + # queue + self._add_bundled_input(bundled_input) + + def _get_runtime_ray_remote_args( + self, input_bundle: Optional[RefBundle] = None + ) -> Dict[str, Any]: + ray_remote_args = copy.deepcopy(self._ray_remote_args) + + # Override parameters from user provided remote args function. + if self._ray_remote_args_fn: + new_remote_args = self._ray_remote_args_fn() + for k, v in new_remote_args.items(): + ray_remote_args[k] = v + # For tasks with small args, we will use SPREAD by default to optimize for + # compute load-balancing. For tasks with large args, we will use DEFAULT to + # allow the Ray locality scheduler a chance to optimize task placement. + if "scheduling_strategy" not in ray_remote_args: + ctx = self.data_context + if input_bundle and input_bundle.size_bytes() > ctx.large_args_threshold: + ray_remote_args[ + "scheduling_strategy" + ] = ctx.scheduling_strategy_large_args + # Takes precedence over small args case. This is to let users know + # when the large args case is being triggered. + self._remote_args_for_metrics = copy.deepcopy(ray_remote_args) + else: + ray_remote_args["scheduling_strategy"] = ctx.scheduling_strategy + # Only save to metrics if we haven't already done so. + if "scheduling_strategy" not in self._remote_args_for_metrics: + self._remote_args_for_metrics = copy.deepcopy(ray_remote_args) + # This should take precedence over previously set scheduling strategy, as it + # implements actor-based locality overrides. + if self._ray_remote_args_factory_actor_locality: + return self._ray_remote_args_factory_actor_locality(ray_remote_args) + return ray_remote_args + + @abstractmethod + def _add_bundled_input(self, refs: RefBundle): + """Add a pre-bundled upstream output to this operator. 
+ + Unlike the add_input() arg, this RefBundle has already been further bundled by + _block_ref_bundler up to the target size, meaning that this bundle is ready for + task submission. + + This must be implemented by subclasses. + + Args: + refs: The fully-bundled ref bundle that should be added as input. + """ + raise NotImplementedError + + def _submit_data_task( + self, + gen: ObjectRefGenerator, + inputs: RefBundle, + task_done_callback: Optional[Callable[[], None]] = None, + ): + """Submit a new data-handling task.""" + # TODO(hchen): + # 1. Move this to the base PhyscialOperator class. + # 2. This method should only take a block-processing function as input, + # instead of a streaming generator. The logic of submitting ray tasks + # can also be capsulated in the base class. + task_index = self._next_data_task_idx + self._next_data_task_idx += 1 + self._metrics.on_task_submitted(task_index, inputs) + + def _output_ready_callback(task_index, output: RefBundle): + # Since output is streamed, it should only contain one block. + assert len(output) == 1 + self._metrics.on_task_output_generated(task_index, output) + + # Notify output queue that the task has produced an new output. 
+ self._output_queue.notify_task_output_ready(task_index, output) + self._metrics.on_output_queued(output) + + def _task_done_callback(task_index: int, exception: Optional[Exception]): + self._metrics.on_task_finished(task_index, exception) + + # Estimate number of tasks and rows from inputs received and tasks + # submitted so far + upstream_op_num_outputs = self.input_dependencies[0].num_outputs_total() + if upstream_op_num_outputs: + estimated_num_tasks = ( + upstream_op_num_outputs + / self._metrics.num_inputs_received + * self._next_data_task_idx + ) + self._estimated_num_output_bundles = round( + estimated_num_tasks + * self._metrics.num_outputs_of_finished_tasks + / self._metrics.num_tasks_finished + ) + self._estimated_output_num_rows = round( + estimated_num_tasks + * self._metrics.rows_task_outputs_generated + / self._metrics.num_tasks_finished + ) + + self._data_tasks.pop(task_index) + # Notify output queue that this task is complete. + self._output_queue.notify_task_completed(task_index) + if task_done_callback: + task_done_callback() + + self._data_tasks[task_index] = DataOpTask( + task_index, + gen, + lambda output: _output_ready_callback(task_index, output), + functools.partial(_task_done_callback, task_index), + ) + + def _submit_metadata_task( + self, result_ref: ObjectRef, task_done_callback: Callable[[], None] + ): + """Submit a new metadata-handling task.""" + # TODO(hchen): Move this to the base PhyscialOperator class. 
+ task_index = self._next_metadata_task_idx + self._next_metadata_task_idx += 1 + + def _task_done_callback(): + self._metadata_tasks.pop(task_index) + task_done_callback() + + self._metadata_tasks[task_index] = MetadataOpTask( + task_index, result_ref, _task_done_callback + ) + + def get_active_tasks(self) -> List[OpTask]: + return list(self._metadata_tasks.values()) + list(self._data_tasks.values()) + + def all_inputs_done(self): + self._block_ref_bundler.done_adding_bundles() + if self._block_ref_bundler.has_bundle(): + # Handle any leftover bundles in the bundler. + _, bundled_input = self._block_ref_bundler.get_next_bundle() + self._add_bundled_input(bundled_input) + super().all_inputs_done() + + def has_next(self) -> bool: + assert self._started + return self._output_queue.has_next() + + def _get_next_inner(self) -> RefBundle: + assert self._started + bundle = self._output_queue.get_next() + self._metrics.on_output_dequeued(bundle) + self._output_metadata.extend(bundle.metadata) + return bundle + + @abstractmethod + def progress_str(self) -> str: + raise NotImplementedError + + def _extra_metrics(self) -> Dict[str, Any]: + return {"ray_remote_args": dict(sorted(self._remote_args_for_metrics.items()))} + + def get_stats(self) -> StatsDict: + return {self._name: self._output_metadata} + + def get_map_transformer(self) -> MapTransformer: + return self._map_transformer + + def shutdown(self): + self._data_tasks.clear() + self._metadata_tasks.clear() + + @abstractmethod + def current_processor_usage(self) -> ExecutionResources: + raise NotImplementedError + + @abstractmethod + def pending_processor_usage(self) -> ExecutionResources: + raise NotImplementedError + + @abstractmethod + def base_resource_usage(self) -> ExecutionResources: + raise NotImplementedError + + @abstractmethod + def incremental_resource_usage(self) -> ExecutionResources: + raise NotImplementedError + + def implements_accurate_memory_accounting(self) -> bool: + return True + + def 
supports_fusion(self) -> bool: + return self._supports_fusion + + def num_active_tasks(self) -> int: + # Override `num_active_tasks` to only include data tasks and exclude + # metadata tasks, which are used by the actor-pool map operator to + # check if a newly created actor is ready. + # The reasons are because: + # 1. `PhysicalOperator.completed` checks `num_active_tasks`. The operator + # should be considered completed if there are still pending actors. + # 2. The number of active tasks in the progress bar will be more accurate + # to reflect the actual data processing tasks. + return len(self._data_tasks) + + +def _map_task( + map_transformer: MapTransformer, + data_context: DataContext, + ctx: TaskContext, + *blocks: Block, + **kwargs: Dict[str, Any], +) -> Iterator[Union[Block, List[BlockMetadata]]]: + """Remote function for a single operator task. + + Args: + fn: The callable that takes Iterator[Block] as input and returns + Iterator[Block] as output. + blocks: The concrete block values from the task ref bundle. + + Returns: + A generator of blocks, followed by the list of BlockMetadata for the blocks + as the last generator return. + """ + DataContext._set_current(data_context) + ctx.kwargs.update(kwargs) + stats = BlockExecStats.builder() + map_transformer.set_target_max_block_size(ctx.target_max_block_size) + for b_out in map_transformer.apply_transform(iter(blocks), ctx): + # TODO(Clark): Add input file propagation from input blocks. + m_out = BlockAccessor.for_block(b_out).get_metadata() + m_out.exec_stats = stats.build() + m_out.exec_stats.udf_time_s = map_transformer.udf_time() + m_out.exec_stats.task_idx = ctx.task_idx + yield b_out + yield m_out + stats = BlockExecStats.builder() + + +class _BlockRefBundler: + """Rebundles RefBundles to get them close to a particular number of rows.""" + + def __init__(self, min_rows_per_bundle: Optional[int]): + """Creates a BlockRefBundler. + + Args: + min_rows_per_bundle: The target number of rows per bundle. 
Note that we + bundle up to this target, but only exceed it if not doing so would + result in an empty bundle. + """ + self._min_rows_per_bundle = min_rows_per_bundle + self._bundle_buffer: List[RefBundle] = [] + self._bundle_buffer_size = 0 + self._finalized = False + + def add_bundle(self, bundle: RefBundle): + """Add a bundle to the bundler.""" + self._bundle_buffer.append(bundle) + self._bundle_buffer_size += self._get_bundle_size(bundle) + + def has_bundle(self) -> bool: + """Returns whether the bundler has a bundle.""" + return self._bundle_buffer and ( + self._min_rows_per_bundle is None + or self._bundle_buffer_size >= self._min_rows_per_bundle + or (self._finalized and self._bundle_buffer_size > 0) + ) + + def get_next_bundle(self) -> Tuple[List[RefBundle], RefBundle]: + """Gets the next bundle. + + Returns: + A two-tuple. The first element is a list of bundles that were combined into + the output bundle. The second element is the output bundle. + """ + assert self.has_bundle() + if self._min_rows_per_bundle is None: + # Short-circuit if no bundle row target was defined. + assert len(self._bundle_buffer) == 1 + bundle = self._bundle_buffer[0] + self._bundle_buffer = [] + self._bundle_buffer_size = 0 + return [bundle], bundle + leftover = [] + output_buffer = [] + output_buffer_size = 0 + buffer_filled = False + for bundle in self._bundle_buffer: + bundle_size = self._get_bundle_size(bundle) + if buffer_filled: + # Buffer has been filled, save it in the leftovers. + leftover.append(bundle) + elif ( + output_buffer_size + bundle_size <= self._min_rows_per_bundle + or output_buffer_size == 0 + ): + # Bundle fits in buffer, or bundle doesn't fit but the buffer still + # needs a non-empty bundle. + output_buffer.append(bundle) + output_buffer_size += bundle_size + else: + # Bundle doesn't fit in a buffer that already has at least one non-empty + # bundle, so we add it to the leftovers. + leftover.append(bundle) + # Add all remaining bundles to the leftovers. 
+ buffer_filled = True + self._bundle_buffer = leftover + self._bundle_buffer_size = sum( + self._get_bundle_size(bundle) for bundle in leftover + ) + return list(output_buffer), _merge_ref_bundles(*output_buffer) + + def done_adding_bundles(self): + """Indicate that no more RefBundles will be added to this bundler.""" + self._finalized = True + + @staticmethod + def _get_bundle_size(bundle: RefBundle): + return bundle.num_rows() if bundle.num_rows() is not None else float("inf") + + +def _merge_ref_bundles(*bundles: RefBundle) -> RefBundle: + """Merge N ref bundles into a single bundle of multiple blocks.""" + # Check that at least one bundle is non-null. + assert any(bundle is not None for bundle in bundles) + blocks = list( + itertools.chain( + block for bundle in bundles if bundle is not None for block in bundle.blocks + ) + ) + owns_blocks = all(bundle.owns_blocks for bundle in bundles if bundle is not None) + return RefBundle(blocks, owns_blocks) + + +class _OutputQueue(ABC): + """Interface for swapping between different output order modes.""" + + @abstractmethod + def notify_task_output_ready(self, task_index: int, output: RefBundle): + """Called when a task's output is ready.""" + pass + + def notify_task_completed(self, task_index: int): + """Called when a previously pending task completes.""" + pass + + @abstractmethod + def has_next(self) -> bool: + pass + + @abstractmethod + def get_next(self) -> RefBundle: + pass + + +class _OrderedOutputQueue(_OutputQueue): + """An queue that returns finished tasks in submission order.""" + + def __init__(self): + self._task_outputs: Dict[int, Deque[RefBundle]] = defaultdict(lambda: deque()) + self._current_output_index: int = 0 + self._completed_tasks: Set[int] = set() + + def notify_task_output_ready(self, task_index: int, output: RefBundle): + self._task_outputs[task_index].append(output) + + def _move_to_next_task(self): + """Move the outut index to the next task. 
+ + This method should only be called when the current task is complete and all + outputs have been taken. + """ + assert len(self._task_outputs[self._current_output_index]) == 0 + assert self._current_output_index in self._completed_tasks + del self._task_outputs[self._current_output_index] + self._completed_tasks.remove(self._current_output_index) + self._current_output_index += 1 + + def notify_task_completed(self, task_index: int): + assert task_index >= self._current_output_index + self._completed_tasks.add(task_index) + if task_index == self._current_output_index: + if len(self._task_outputs[task_index]) == 0: + self._move_to_next_task() + + def has_next(self) -> bool: + return len(self._task_outputs[self._current_output_index]) > 0 + + def get_next(self) -> RefBundle: + next_bundle = self._task_outputs[self._current_output_index].popleft() + if len(self._task_outputs[self._current_output_index]) == 0: + if self._current_output_index in self._completed_tasks: + self._move_to_next_task() + return next_bundle + + +class _UnorderedOutputQueue(_OutputQueue): + """An queue that does not guarantee output order of finished tasks.""" + + def __init__(self): + self._queue: Deque[RefBundle] = deque() + + def notify_task_output_ready(self, _: int, output: RefBundle): + self._queue.append(output) + + def has_next(self) -> bool: + return len(self._queue) > 0 + + def get_next(self) -> RefBundle: + return self._queue.popleft() + + +def _canonicalize_ray_remote_args(ray_remote_args: Dict[str, Any]) -> Dict[str, Any]: + """Enforce rules on ray remote args for map tasks. + + Namely, args must explicitly specify either CPU or GPU, not both. Disallowing + mixed resources avoids potential starvation and deadlock issues during scheduling, + and should not be a serious limitation for users. 
+ """ + ray_remote_args = ray_remote_args.copy() + + if ray_remote_args.get("num_cpus") and ray_remote_args.get("num_gpus"): + logger.warning( + "Specifying both num_cpus and num_gpus for map tasks is experimental, " + "and may result in scheduling or stability issues. " + "Please report any issues to the Ray team: " + "https://github.com/ray-project/ray/issues/new/choose" + ) + + if "num_cpus" not in ray_remote_args and "num_gpus" not in ray_remote_args: + ray_remote_args["num_cpus"] = 1 + + return ray_remote_args diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/map_transformer.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/map_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..d3135fce59a4785f0066544f3130750263205256 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/map_transformer.py @@ -0,0 +1,460 @@ +import itertools +import time +from abc import abstractmethod +from enum import Enum +from typing import Any, Callable, Dict, Iterable, List, Optional, TypeVar, Union + +from ray.data._internal.block_batching.block_batching import batch_blocks +from ray.data._internal.execution.interfaces.task_context import TaskContext +from ray.data._internal.output_buffer import BlockOutputBuffer +from ray.data.block import Block, BlockAccessor, DataBatch + +# Allowed input/output data types for a MapTransformFn. +Row = Dict[str, Any] +MapTransformFnData = Union[Block, Row, DataBatch] + +# Function signature of a MapTransformFn. 
class MapTransformFn:
    """Represents a single transform function in a MapTransformer.

    Subclasses implement `__call__` to map an iterable of input data items
    (blocks, rows, or batches, per `input_type`) to an iterable of output
    data items (per `output_type`).
    """

    def __init__(
        self,
        input_type: MapTransformFnDataType,
        output_type: MapTransformFnDataType,
        is_udf: bool = False,
    ):
        """
        Args:
            input_type: the type of the input data.
            output_type: the type of the output data.
            is_udf: whether this transform wraps a user-defined function
                (UDF steps are timed separately by the MapTransformer).
        """
        # NOTE(review): this binds the *builtin* `callable` — there is no
        # `callable` parameter in this signature. Looks like a leftover from
        # an older signature; confirm before relying on `self._callable`.
        self._callable = callable
        self._input_type = input_type
        self._output_type = output_type
        # Target output block size in bytes; None until
        # `set_target_max_block_size()` is called.
        self._target_max_block_size = None
        self._is_udf = is_udf

    @abstractmethod
    def __call__(
        self,
        input: Iterable[MapTransformFnData],
        ctx: TaskContext,
    ) -> Iterable[MapTransformFnData]:
        ...

    @property
    def input_type(self) -> MapTransformFnDataType:
        return self._input_type

    @property
    def output_type(self) -> MapTransformFnDataType:
        return self._output_type

    def set_target_max_block_size(self, target_max_block_size: int):
        # Used by block-building transforms to cap output block sizes.
        self._target_max_block_size = target_max_block_size
+ """ + self.set_transform_fns(transform_fns) + self._init_fn = init_fn if init_fn is not None else lambda: None + self._target_max_block_size = None + self._udf_time = 0 + + def set_transform_fns(self, transform_fns: List[MapTransformFn]) -> None: + """Set the transform functions.""" + assert len(transform_fns) > 0 + assert ( + transform_fns[0].input_type == MapTransformFnDataType.Block + ), "The first transform function must take blocks as input." + assert ( + transform_fns[-1].output_type == MapTransformFnDataType.Block + ), "The last transform function must output blocks." + + for i in range(len(transform_fns) - 1): + assert transform_fns[i].output_type == transform_fns[i + 1].input_type, ( + "The output type of the previous transform function must match " + "the input type of the next transform function." + ) + self._transform_fns = transform_fns + + def get_transform_fns(self) -> List[MapTransformFn]: + """Get the transform functions.""" + return self._transform_fns + + def set_target_max_block_size(self, target_max_block_size: int): + self._target_max_block_size = target_max_block_size + + def init(self) -> None: + """Initialize the transformer. + + Should be called before applying the transform. 
+ """ + self._init_fn() + + def _udf_timed_iter( + self, input: Iterable[MapTransformFnData] + ) -> Iterable[MapTransformFnData]: + while True: + try: + start = time.perf_counter() + output = next(input) + self._udf_time += time.perf_counter() - start + yield output + except StopIteration: + break + + def apply_transform( + self, + input_blocks: Iterable[Block], + ctx: TaskContext, + ) -> Iterable[Block]: + """Apply the transform functions to the input blocks.""" + assert ( + self._target_max_block_size is not None + ), "target_max_block_size must be set before running" + for transform_fn in self._transform_fns: + transform_fn.set_target_max_block_size(self._target_max_block_size) + + iter = input_blocks + # Apply the transform functions sequentially to the input iterable. + for transform_fn in self._transform_fns: + iter = transform_fn(iter, ctx) + if transform_fn._is_udf: + iter = self._udf_timed_iter(iter) + return iter + + def fuse(self, other: "MapTransformer") -> "MapTransformer": + """Fuse two `MapTransformer`s together.""" + assert self._target_max_block_size == other._target_max_block_size or ( + self._target_max_block_size is None or other._target_max_block_size is None + ) + target_max_block_size = ( + self._target_max_block_size or other._target_max_block_size + ) + + # Define them as standalone variables to avoid fused_init_fn capturing the + # entire `MapTransformer` object. 
class RowMapTransformFn(MapTransformFn):
    """A MapTransformFn that maps an iterable of rows to an iterable of rows."""

    def __init__(self, row_fn: MapTransformCallable[Row, Row], is_udf: bool = False):
        self._row_fn = row_fn
        super().__init__(
            MapTransformFnDataType.Row,
            MapTransformFnDataType.Row,
            is_udf=is_udf,
        )

    def __call__(self, input: Iterable[Row], ctx: TaskContext) -> Iterable[Row]:
        # Delegate to the wrapped row-level function, streaming rows through.
        for transformed_row in self._row_fn(input, ctx):
            yield transformed_row

    def __repr__(self) -> str:
        return f"RowMapTransformFn({self._row_fn})"
class BlocksToRowsMapTransformFn(MapTransformFn):
    """A MapTransformFn that converts input blocks to rows."""

    def __init__(self):
        super().__init__(
            MapTransformFnDataType.Block,
            MapTransformFnDataType.Row,
        )

    def __call__(self, blocks: Iterable[Block], _: TaskContext) -> Iterable[Row]:
        # Stream rows out of each block via its accessor, in block order.
        for block in blocks:
            accessor = BlockAccessor.for_block(block)
            yield from accessor.iter_rows(public_row_format=True)

    @classmethod
    def instance(cls) -> "BlocksToRowsMapTransformFn":
        """Returns the singleton instance."""
        cached = getattr(cls, "_instance", None)
        if cached is None:
            cached = cls()
            cls._instance = cached
        return cached

    def __repr__(self) -> str:
        return "BlocksToRowsMapTransformFn()"
+ first = None + + # Ensure that zero-copy batch views are copied so mutating UDFs don't error. + formatted_batch_iter = batch_blocks( + blocks=blocks, + stats=None, + batch_size=self._batch_size, + batch_format=self._batch_format, + ensure_copy=self._ensure_copy, + ) + + first = next(formatted_batch_iter, None) + if first is None: + # If the input blocks are all empty, then yield an empty block with same + # format as the input blocks. + return [empty_block] + else: + return itertools.chain([first], formatted_batch_iter) + + @property + def batch_size(self) -> Optional[int]: + return self._batch_size + + @property + def batch_format(self) -> str: + return self._batch_format + + @property + def zero_copy_batch(self) -> bool: + return not self._ensure_copy + + def __repr__(self) -> str: + return ( + f"BlocksToBatchesMapTransformFn(" + f"batch_size={self._batch_size}, " + f"batch_format={self._batch_format}, " + f"zero_copy_batch={self.zero_copy_batch}" + f")" + ) + + +class BuildOutputBlocksMapTransformFn(MapTransformFn): + """A MapTransformFn that converts UDF-returned data to output blocks.""" + + def __init__(self, input_type: MapTransformFnDataType): + """ + Args: + input_type: the type of input data. + """ + self._input_type = input_type + super().__init__( + input_type, + MapTransformFnDataType.Block, + ) + + def __call__( + self, + iter: Iterable[MapTransformFnData], + _: TaskContext, + ) -> Iterable[Block]: + """Convert UDF-returned data to output blocks. + + Args: + iter: the iterable of UDF-returned data, whose type + must match self._input_type. 
+ """ + assert ( + self._target_max_block_size is not None + ), "target_max_block_size must be set before running" + output_buffer = BlockOutputBuffer(self._target_max_block_size) + if self._input_type == MapTransformFnDataType.Block: + add_fn = output_buffer.add_block + elif self._input_type == MapTransformFnDataType.Batch: + add_fn = output_buffer.add_batch + else: + assert self._input_type == MapTransformFnDataType.Row + add_fn = output_buffer.add + for data in iter: + add_fn(data) + while output_buffer.has_next(): + yield output_buffer.next() + output_buffer.finalize() + while output_buffer.has_next(): + yield output_buffer.next() + + @classmethod + def for_rows(cls) -> "BuildOutputBlocksMapTransformFn": + """Return a BuildOutputBlocksMapTransformFn for row input.""" + return cls(MapTransformFnDataType.Row) + + @classmethod + def for_batches(cls) -> "BuildOutputBlocksMapTransformFn": + """Return a BuildOutputBlocksMapTransformFn for batch input.""" + return cls(MapTransformFnDataType.Batch) + + @classmethod + def for_blocks(cls) -> "BuildOutputBlocksMapTransformFn": + """Return a BuildOutputBlocksMapTransformFn for block input.""" + return cls(MapTransformFnDataType.Block) + + def __repr__(self) -> str: + return f"BuildOutputBlocksMapTransformFn(input_type={self._input_type})" + + +def _splitrange(n, k): + """Calculates array lens of np.array_split(). + + This is the equivalent of + `[len(x) for x in np.array_split(range(n), k)]`. + """ + base = n // k + output = [base] * k + rem = n - sum(output) + for i in range(len(output)): + if rem > 0: + output[i] += 1 + rem -= 1 + assert rem == 0, (rem, output, n, k) + assert sum(output) == n, (output, n, k) + return output + + +class ApplyAdditionalSplitToOutputBlocks(MapTransformFn): + """Do additional splits on output blocks.""" + + def __init__(self, additional_split_factor: int): + """ + Args: + additional_output_splits: The number of additional splits, must be + greater than 1. 
+ """ + assert additional_split_factor > 1 + self._additional_split_factor = additional_split_factor + super().__init__(MapTransformFnDataType.Block, MapTransformFnDataType.Block) + + def __call__(self, blocks: Iterable[Block], ctx: TaskContext) -> Iterable[Block]: + for block in blocks: + block = BlockAccessor.for_block(block) + offset = 0 + split_sizes = _splitrange(block.num_rows(), self._additional_split_factor) + for size in split_sizes: + # NOTE: copy=True is needed because this is an output block. If + # a block slice is put into the object store, the entire block + # will get serialized. + yield block.slice(offset, offset + size, copy=True) + offset += size diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/output_splitter.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/output_splitter.py new file mode 100644 index 0000000000000000000000000000000000000000..d571c92a4d164fc380c58b06937d411952394b7c --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/output_splitter.py @@ -0,0 +1,330 @@ +import math +import time +from collections import deque +from typing import Any, Dict, List, Optional, Tuple + +from ray.data._internal.execution.interfaces import ( + ExecutionOptions, + NodeIdStr, + PhysicalOperator, + RefBundle, +) +from ray.data._internal.execution.util import locality_string +from ray.data._internal.remote_fn import cached_remote_fn +from ray.data._internal.stats import StatsDict +from ray.data.block import Block, BlockAccessor, BlockMetadata +from ray.data.context import DataContext +from ray.types import ObjectRef + + +class OutputSplitter(PhysicalOperator): + """An operator that splits the given data into `n` output splits. + + The output bundles of this operator will have a `bundle.output_split_idx` attr + set to an integer from [0..n-1]. This operator tries to divide the rows evenly + across output splits. 
If the `equal` option is set, the operator will furthermore + guarantee an exact split of rows across outputs, truncating the Dataset. + + Implementation wise, this operator keeps an internal buffer of bundles. The buffer + has a minimum size calculated to enable a good locality hit rate, as well as ensure + we can satisfy the `equal` requirement. + + OutputSplitter does not provide any ordering guarantees. + """ + + def __init__( + self, + input_op: PhysicalOperator, + n: int, + equal: bool, + data_context: DataContext, + locality_hints: Optional[List[NodeIdStr]] = None, + ): + super().__init__( + f"split({n}, equal={equal})", + [input_op], + data_context, + target_max_block_size=None, + ) + self._equal = equal + # Buffer of bundles not yet assigned to output splits. + self._buffer: List[RefBundle] = [] + # The outputted bundles with output_split attribute set. + self._output_queue: deque[RefBundle] = deque() + # The number of rows output to each output split so far. + self._num_output: List[int] = [0 for _ in range(n)] + # The time of the overhead for the output splitter (operator level) + self._output_splitter_overhead_time = 0 + + if locality_hints is not None: + if n != len(locality_hints): + raise ValueError( + "Locality hints list must have length `n`: " + f"len({locality_hints}) != {n}" + ) + self._locality_hints = locality_hints + if locality_hints: + # To optimize locality, we should buffer a certain number of elements + # internally before dispatch to allow the locality algorithm a good chance + # of selecting a preferred location. We use a small multiple of `n` since + # it's reasonable to buffer a couple blocks per consumer. + self._min_buffer_size = 2 * n + else: + self._min_buffer_size = 0 + self._locality_hits = 0 + self._locality_misses = 0 + + def num_outputs_total(self) -> Optional[int]: + # OutputSplitter does not change the number of blocks, + # so we can return the number of blocks from the input op. 
+ return self.input_dependencies[0].num_outputs_total() + + def num_output_rows_total(self) -> Optional[int]: + # The total number of rows is the same as the number of input rows. + return self.input_dependencies[0].num_output_rows_total() + + def start(self, options: ExecutionOptions) -> None: + super().start(options) + # Force disable locality optimization. + if not options.actor_locality_enabled: + self._locality_hints = None + self._min_buffer_size = 0 + + def throttling_disabled(self) -> bool: + """Disables resource-based throttling. + + It doesn't make sense to throttle the inputs to this operator, since all that + would do is lower the buffer size and prevent us from emitting outputs / + reduce the locality hit rate. + """ + return True + + def has_next(self) -> bool: + return len(self._output_queue) > 0 + + def _get_next_inner(self) -> RefBundle: + output = self._output_queue.popleft() + self._metrics.on_output_dequeued(output) + return output + + def get_stats(self) -> StatsDict: + return {"split": []} # TODO(ekl) add split metrics? + + def _extra_metrics(self) -> Dict[str, Any]: + stats = {} + for i, num in enumerate(self._num_output): + stats[f"num_output_{i}"] = num + stats["output_splitter_overhead_time"] = self._output_splitter_overhead_time + return stats + + def _add_input_inner(self, bundle, input_index) -> None: + if bundle.num_rows() is None: + raise ValueError("OutputSplitter requires bundles with known row count") + self._buffer.append(bundle) + self._metrics.on_input_queued(bundle) + self._dispatch_bundles() + + def all_inputs_done(self) -> None: + super().all_inputs_done() + if not self._equal: + self._dispatch_bundles(dispatch_all=True) + assert not self._buffer, "Should have dispatched all bundles." + return + + # Otherwise: + # Need to finalize distribution of buffered data to output splits. 
+ buffer_size = sum(b.num_rows() for b in self._buffer) + max_n = max(self._num_output) + + # First calculate the min rows to add per output to equalize them. + allocation = [max_n - n for n in self._num_output] + remainder = buffer_size - sum(allocation) + # Invariant: buffer should always be large enough to equalize. + assert remainder >= 0, (remainder, buffer_size, allocation) + + # Equally distribute remaining rows in buffer to outputs. + x = remainder // len(allocation) + allocation = [a + x for a in allocation] + + # Execute the split. + for i, count in enumerate(allocation): + bundles = self._split_from_buffer(count) + for b in bundles: + b.output_split_idx = i + self._output_queue.append(b) + self._metrics.on_output_queued(b) + self._buffer = [] + + def internal_queue_size(self) -> int: + return len(self._buffer) + + def progress_str(self) -> str: + if self._locality_hints: + return locality_string(self._locality_hits, self._locality_misses) + else: + return "[locality disabled]" + + def _dispatch_bundles(self, dispatch_all: bool = False) -> None: + start_time = time.perf_counter() + # Dispatch all dispatchable bundles from the internal buffer. + # This may not dispatch all bundles when equal=True. + while self._buffer and ( + dispatch_all or len(self._buffer) >= self._min_buffer_size + ): + target_index = self._select_output_index() + target_bundle = self._pop_bundle_to_dispatch(target_index) + if self._can_safely_dispatch(target_index, target_bundle.num_rows()): + target_bundle.output_split_idx = target_index + self._num_output[target_index] += target_bundle.num_rows() + self._output_queue.append(target_bundle) + self._metrics.on_output_queued(target_bundle) + if self._locality_hints: + preferred_loc = self._locality_hints[target_index] + if self._get_location(target_bundle) == preferred_loc: + self._locality_hits += 1 + else: + self._locality_misses += 1 + else: + # Put it back and abort. 
+ self._buffer.insert(0, target_bundle) + self._metrics.on_input_queued(target_bundle) + break + self._output_splitter_overhead_time += time.perf_counter() - start_time + + def _select_output_index(self) -> int: + # Greedily dispatch to the consumer with the least data so far. + i, _ = min(enumerate(self._num_output), key=lambda t: t[1]) + return i + + def _pop_bundle_to_dispatch(self, target_index: int) -> RefBundle: + if self._locality_hints: + preferred_loc = self._locality_hints[target_index] + for bundle in self._buffer: + if self._get_location(bundle) == preferred_loc: + self._buffer.remove(bundle) + self._metrics.on_input_dequeued(bundle) + return bundle + + bundle = self._buffer.pop(0) + self._metrics.on_input_dequeued(bundle) + return bundle + + def _can_safely_dispatch(self, target_index: int, nrow: int) -> bool: + if not self._equal: + # If not in equals mode, dispatch away with no buffer requirements. + return True + output_distribution = self._num_output.copy() + output_distribution[target_index] += nrow + buffer_requirement = self._calculate_buffer_requirement(output_distribution) + buffer_size = sum(b.num_rows() for b in self._buffer) + return buffer_size >= buffer_requirement + + def _calculate_buffer_requirement(self, output_distribution: List[int]) -> int: + # Calculate the new number of rows that we'd need to equalize the row + # distribution after the bundle dispatch. 
+ max_n = max(output_distribution) + return sum([max_n - n for n in output_distribution]) + + def _split_from_buffer(self, nrow: int) -> List[RefBundle]: + output = [] + acc = 0 + while acc < nrow: + b = self._buffer.pop() + self._metrics.on_input_dequeued(b) + if acc + b.num_rows() <= nrow: + output.append(b) + acc += b.num_rows() + else: + left, right = _split(b, nrow - acc) + output.append(left) + acc += left.num_rows() + self._buffer.append(right) + self._metrics.on_input_queued(right) + assert acc == nrow, (acc, nrow) + + assert sum(b.num_rows() for b in output) == nrow, (acc, nrow) + return output + + def _get_location(self, bundle: RefBundle) -> Optional[NodeIdStr]: + """Ask Ray for the node id of the given bundle. + + This method may be overriden for testing. + + Returns: + A node id associated with the bundle, or None if unknown. + """ + return bundle.get_cached_location() + + def implements_accurate_memory_accounting(self) -> bool: + return True + + +def _split(bundle: RefBundle, left_size: int) -> Tuple[RefBundle, RefBundle]: + left_blocks, left_meta = [], [] + right_blocks, right_meta = [], [] + acc = 0 + for b, m in bundle.blocks: + if acc >= left_size: + right_blocks.append(b) + right_meta.append(m) + elif acc + m.num_rows <= left_size: + left_blocks.append(b) + left_meta.append(m) + acc += m.num_rows + else: + # Trouble case: split it up. 
def _split_meta(
    m: BlockMetadata, left_size: int
) -> Tuple[BlockMetadata, BlockMetadata]:
    """Split block metadata into metadata for a left/right pair of blocks.

    The left half gets `left_size` rows and a proportional (rounded-down)
    share of the byte size; the right half gets the remainder, so the two
    halves always sum to the original row and byte counts.
    """
    right_size = m.num_rows - left_size
    # Apportion bytes by row fraction, rounding the left share down.
    left_bytes = int(math.floor(m.size_bytes * (left_size / m.num_rows)))
    right_bytes = m.size_bytes - left_bytes
    left = BlockMetadata(
        num_rows=left_size,
        size_bytes=left_bytes,
        schema=m.schema,
        input_files=m.input_files,
        exec_stats=None,
    )
    right = BlockMetadata(
        num_rows=right_size,
        size_bytes=right_bytes,
        schema=m.schema,
        input_files=m.input_files,
        exec_stats=None,
    )
    return left, right
class TaskPoolMapOperator(MapOperator):
    """A MapOperator implementation that executes tasks on a task pool.

    Each bundled input is submitted as an independent Ray task (as opposed to
    being dispatched to a long-lived actor pool).
    """

    def __init__(
        self,
        map_transformer: MapTransformer,
        input_op: PhysicalOperator,
        data_context: DataContext,
        target_max_block_size: Optional[int],
        name: str = "TaskPoolMap",
        min_rows_per_bundle: Optional[int] = None,
        concurrency: Optional[int] = None,
        supports_fusion: bool = True,
        ray_remote_args_fn: Optional[Callable[[], Dict[str, Any]]] = None,
        ray_remote_args: Optional[Dict[str, Any]] = None,
    ):
        """Create a TaskPoolMapOperator instance.

        Args:
            map_transformer: The transformation to apply to each ref bundle input.
            input_op: Operator generating input data for this op.
            name: The name of this operator.
            target_max_block_size: The target maximum number of bytes to
                include in an output block.
            min_rows_per_bundle: The number of rows to gather per batch passed to the
                transform_fn, or None to use the block size. Setting the batch size is
                important for the performance of GPU-accelerated transform functions.
                The actual rows passed may be less if the dataset is small.
            concurrency: The maximum number of Ray tasks to use concurrently,
                or None to use as many tasks as possible.
            supports_fusion: Whether this operator supports fusion with other operators.
            ray_remote_args_fn: A function that returns a dictionary of remote args
                passed to each map worker. The purpose of this argument is to generate
                dynamic arguments for each actor/task, and will be called each time
                prior to initializing the worker. Args returned from this dict will
                always override the args in ``ray_remote_args``. Note: this is an
                advanced, experimental feature.
            ray_remote_args: Customize the :func:`ray.remote` args for this op's tasks.
        """
        super().__init__(
            map_transformer,
            input_op,
            data_context,
            name,
            target_max_block_size,
            min_rows_per_bundle,
            supports_fusion,
            ray_remote_args_fn,
            ray_remote_args,
        )
        self._concurrency = concurrency

        # NOTE: Unlike static Ray remote args, dynamic arguments extracted from the
        # blocks themselves are going to be passed inside `fn.options(...)`
        # invocation
        ray_remote_static_args = {
            **(self._ray_remote_args or {}),
            "num_returns": "streaming",
        }

        # Cached remote wrapper around the shared `_map_task` entry point.
        self._map_task = cached_remote_fn(_map_task, **ray_remote_static_args)

    def _add_bundled_input(self, bundle: RefBundle):
        # Submit the task as a normal Ray task.
        ctx = TaskContext(
            task_idx=self._next_data_task_idx,
            target_max_block_size=self.actual_target_max_block_size,
        )

        # Per-task args (e.g. locality) are computed from the input bundle.
        dynamic_ray_remote_args = self._get_runtime_ray_remote_args(input_bundle=bundle)
        dynamic_ray_remote_args["name"] = self.name

        data_context = self.data_context
        if data_context._max_num_blocks_in_streaming_gen_buffer is not None:
            # The `_generator_backpressure_num_objects` parameter should be
            # `2 * _max_num_blocks_in_streaming_gen_buffer` because we yield
            # 2 objects for each block: the block and the block metadata.
            dynamic_ray_remote_args["_generator_backpressure_num_objects"] = (
                2 * data_context._max_num_blocks_in_streaming_gen_buffer
            )

        gen = self._map_task.options(**dynamic_ray_remote_args).remote(
            self._map_transformer_ref,
            data_context,
            ctx,
            *bundle.block_refs,
            **self.get_map_task_kwargs(),
        )
        self._submit_data_task(gen, bundle)

    def shutdown(self):
        # Cancel all active tasks.
        for _, task in self._data_tasks.items():
            ray.cancel(task.get_waitable())
        # Wait until all tasks have failed or been cancelled.
        for _, task in self._data_tasks.items():
            try:
                ray.get(task.get_waitable())
            except ray.exceptions.RayError:
                # Cancellation either succeeded, or the task had already failed with
                # a different error, or cancellation failed. In all cases, we
                # swallow the exception.
                pass
        super().shutdown()

    def progress_str(self) -> str:
        # Task-pool operators have no extra progress detail to report.
        return ""

    def base_resource_usage(self) -> ExecutionResources:
        # No resources are held while idle; tasks are launched on demand.
        return ExecutionResources()

    def current_processor_usage(self) -> ExecutionResources:
        # Each active task consumes the per-task CPU/GPU request.
        num_active_workers = self.num_active_tasks()
        return ExecutionResources(
            cpu=self._ray_remote_args.get("num_cpus", 0) * num_active_workers,
            gpu=self._ray_remote_args.get("num_gpus", 0) * num_active_workers,
        )

    def pending_processor_usage(self) -> ExecutionResources:
        return ExecutionResources()

    def incremental_resource_usage(self) -> ExecutionResources:
        # Cost of launching one more task: its CPU/GPU request plus the
        # object-store memory its pending output may occupy.
        return ExecutionResources(
            cpu=self._ray_remote_args.get("num_cpus", 0),
            gpu=self._ray_remote_args.get("num_gpus", 0),
            object_store_memory=self._metrics.obj_store_mem_max_pending_output_per_task
            or 0,
        )

    def get_concurrency(self) -> Optional[int]:
        return self._concurrency
class UnionOperator(NAryOperator):
    """An operator that combines output blocks from
    two or more input operators into a single output."""

    def __init__(
        self,
        data_context: DataContext,
        *input_ops: PhysicalOperator,
    ):
        """Create a UnionOperator.

        Args:
            input_ops: Operators generating input data for this operator to union.
        """

        # By default, union does not preserve the order of output blocks.
        # To preserve the order, configure ExecutionOptions accordingly.
        self._preserve_order = False

        # Intermediary buffers used to store blocks from each input dependency.
        # Only used when `self._preserve_order` is True.
        self._input_buffers: List[List[RefBundle]] = [[] for _ in range(len(input_ops))]

        # The index of the input dependency that is currently the source of
        # the output buffer. New inputs from this input dependency will be added
        # directly to the output buffer. Only used when `self._preserve_order` is True.
        self._input_idx_to_output = 0

        # Bundles ready to be emitted downstream, in emission order.
        self._output_buffer: List[RefBundle] = []
        self._stats: StatsDict = {"Union": []}
        super().__init__(data_context, *input_ops)

    def start(self, options: ExecutionOptions):
        # Whether to preserve the order of the input data (both the
        # order of the input operators and the order of the blocks within).
        self._preserve_order = options.preserve_order
        super().start(options)

    def num_outputs_total(self) -> Optional[int]:
        # Sum of the inputs' block counts; unknown (None) if any input's is.
        num_outputs = 0
        for input_op in self.input_dependencies:
            input_num_outputs = input_op.num_outputs_total()
            if input_num_outputs is None:
                return None
            num_outputs += input_num_outputs
        return num_outputs

    def num_output_rows_total(self) -> Optional[int]:
        # Sum of the inputs' row counts; unknown (None) if any input's is.
        total_rows = 0
        for input_op in self.input_dependencies:
            input_num_rows = input_op.num_output_rows_total()
            if input_num_rows is None:
                return None
            total_rows += input_num_rows
        return total_rows

    def _add_input_inner(self, refs: RefBundle, input_index: int) -> None:
        assert not self.completed()
        assert 0 <= input_index <= len(self._input_dependencies), input_index

        if not self._preserve_order:
            # Unordered mode: emit bundles as soon as they arrive.
            self._output_buffer.append(refs)
        else:
            if input_index == self._input_idx_to_output:
                # This input is the current output source; emit directly.
                self._output_buffer.append(refs)
            else:
                # Hold bundles from later inputs until their turn comes.
                self._input_buffers[input_index].append(refs)

    def input_done(self, input_index: int) -> None:
        """When `self._preserve_order` is True, change the
        output buffer source to the next input dependency
        once the current input dependency calls `input_done()`."""
        if not self._preserve_order:
            return
        if not input_index == self._input_idx_to_output:
            return
        next_input_idx = self._input_idx_to_output + 1
        if next_input_idx < len(self._input_buffers):
            # Flush everything buffered for the next input, then make it
            # the live output source.
            self._output_buffer.extend(self._input_buffers[next_input_idx])
            self._input_buffers[next_input_idx].clear()
            self._input_idx_to_output = next_input_idx
        super().input_done(input_index)

    def all_inputs_done(self) -> None:
        # Note that in the case where order is not preserved, all inputs
        # are directly added to the output buffer as soon as they are received,
        # so there is no need to check any intermediary buffers.
        if self._preserve_order:
            for idx, input_buffer in enumerate(self._input_buffers):
                assert len(input_buffer) == 0, (
                    f"Input at index {idx} still has "
                    f"{len(input_buffer)} blocks remaining."
                )
        super().all_inputs_done()

    def has_next(self) -> bool:
        # Check if the output buffer still contains at least one block.
        return len(self._output_buffer) > 0

    def _get_next_inner(self) -> RefBundle:
        return self._output_buffer.pop(0)

    def get_stats(self) -> StatsDict:
        return self._stats
+ + Args: + left_input_ops: The input operator at left hand side. + right_input_op: The input operator at right hand side. + """ + self._left_buffer: List[RefBundle] = [] + self._right_buffer: List[RefBundle] = [] + self._output_buffer: List[RefBundle] = [] + self._stats: StatsDict = {} + super().__init__( + "Zip", + [left_input_op, right_input_op], + data_context, + target_max_block_size=None, + ) + + def num_outputs_total(self) -> Optional[int]: + left_num_outputs = self.input_dependencies[0].num_outputs_total() + right_num_outputs = self.input_dependencies[1].num_outputs_total() + if left_num_outputs is not None and right_num_outputs is not None: + return max(left_num_outputs, right_num_outputs) + elif left_num_outputs is not None: + return left_num_outputs + else: + return right_num_outputs + + def num_output_rows_total(self) -> Optional[int]: + left_num_rows = self.input_dependencies[0].num_output_rows_total() + right_num_rows = self.input_dependencies[1].num_output_rows_total() + if left_num_rows is not None and right_num_rows is not None: + return max(left_num_rows, right_num_rows) + elif left_num_rows is not None: + return left_num_rows + else: + return right_num_rows + + def _add_input_inner(self, refs: RefBundle, input_index: int) -> None: + assert not self.completed() + assert input_index == 0 or input_index == 1, input_index + if input_index == 0: + self._left_buffer.append(refs) + else: + self._right_buffer.append(refs) + + def all_inputs_done(self) -> None: + self._output_buffer, self._stats = self._zip( + self._left_buffer, self._right_buffer + ) + self._left_buffer.clear() + self._right_buffer.clear() + super().all_inputs_done() + + def has_next(self) -> bool: + return len(self._output_buffer) > 0 + + def _get_next_inner(self) -> RefBundle: + return self._output_buffer.pop(0) + + def get_stats(self) -> StatsDict: + return self._stats + + def _zip( + self, left_input: List[RefBundle], right_input: List[RefBundle] + ) -> Tuple[List[RefBundle], 
StatsDict]: + """Zip the RefBundles from `left_input` and `right_input` together. + + Zip is done in 2 steps: aligning blocks, and zipping blocks from + both sides. + + Aligning blocks (optional): check the blocks from `left_input` and + `right_input` are aligned or not, i.e. if having different number of blocks, or + having different number of rows in some blocks. If not aligned, repartition the + smaller input with `_split_at_indices` to align with larger input. + + Zipping blocks: after blocks from both sides are aligned, zip + blocks from both sides together in parallel. + """ + left_blocks_with_metadata = [] + for bundle in left_input: + for block, meta in bundle.blocks: + left_blocks_with_metadata.append((block, meta)) + right_blocks_with_metadata = [] + for bundle in right_input: + for block, meta in bundle.blocks: + right_blocks_with_metadata.append((block, meta)) + + left_block_rows, left_block_bytes = self._calculate_blocks_rows_and_bytes( + left_blocks_with_metadata + ) + right_block_rows, right_block_bytes = self._calculate_blocks_rows_and_bytes( + right_blocks_with_metadata + ) + + # Check that both sides have the same number of rows. + # TODO(Clark): Support different number of rows via user-directed + # dropping/padding. + total_left_rows = sum(left_block_rows) + total_right_rows = sum(right_block_rows) + if total_left_rows != total_right_rows: + raise ValueError( + "Cannot zip datasets of different number of rows: " + f"{total_left_rows}, {total_right_rows}" + ) + + # Whether the left and right input sides are inverted + input_side_inverted = False + if sum(right_block_bytes) > sum(left_block_bytes): + # Make sure that right side is smaller, so we minimize splitting + # work when aligning both sides. + # TODO(Clark): Improve this heuristic for minimizing splitting work, + # e.g. by generating the splitting plans for each route (via + # _generate_per_block_split_indices) and choosing the plan that splits + # the least cumulative bytes. 
+ left_blocks_with_metadata, right_blocks_with_metadata = ( + right_blocks_with_metadata, + left_blocks_with_metadata, + ) + left_block_rows, right_block_rows = right_block_rows, left_block_rows + input_side_inverted = True + + # Get the split indices that will align both sides. + indices = list(itertools.accumulate(left_block_rows)) + indices.pop(-1) + + # Split other at the alignment indices, such that for every block from + # left side, we have a list of blocks from right side that have the same + # cumulative number of rows as that left block. + # NOTE: _split_at_indices has a no-op fastpath if the blocks are already + # aligned. + aligned_right_blocks_with_metadata = _split_at_indices( + right_blocks_with_metadata, + indices, + block_rows=right_block_rows, + ) + del right_blocks_with_metadata + + left_blocks = [b for b, _ in left_blocks_with_metadata] + right_blocks_list = aligned_right_blocks_with_metadata[0] + del left_blocks_with_metadata, aligned_right_blocks_with_metadata + + zip_one_block = cached_remote_fn(_zip_one_block, num_returns=2) + + output_blocks = [] + output_metadata = [] + for left_block, right_blocks in zip(left_blocks, right_blocks_list): + # For each block from left side, zip it together with 1 or more blocks from + # right side. We're guaranteed to have that left_block has the same number + # of rows as right_blocks has cumulatively. + res, meta = zip_one_block.remote( + left_block, *right_blocks, inverted=input_side_inverted + ) + output_blocks.append(res) + output_metadata.append(meta) + + # Early release memory. + del left_blocks, right_blocks_list + + # TODO(ekl) it might be nice to have a progress bar here. 
+ output_metadata = ray.get(output_metadata) + output_refs = [] + input_owned = all(b.owns_blocks for b in left_input) + for block, meta in zip(output_blocks, output_metadata): + output_refs.append( + RefBundle( + [ + ( + block, + meta, + ) + ], + owns_blocks=input_owned, + ) + ) + stats = {self._name: output_metadata} + + # Clean up inputs. + for ref in left_input: + ref.destroy_if_owned() + for ref in right_input: + ref.destroy_if_owned() + + return output_refs, stats + + def _calculate_blocks_rows_and_bytes( + self, + blocks_with_metadata: BlockPartition, + ) -> Tuple[List[int], List[int]]: + """Calculate the number of rows and size in bytes for a list of blocks with + metadata. + """ + get_num_rows_and_bytes = cached_remote_fn(_get_num_rows_and_bytes) + block_rows = [] + block_bytes = [] + for block, metadata in blocks_with_metadata: + if metadata.num_rows is None or metadata.size_bytes is None: + # Need to fetch number of rows or size in bytes, so just fetch both. + num_rows, size_bytes = ray.get(get_num_rows_and_bytes.remote(block)) + # Cache on the block metadata. + metadata.num_rows = num_rows + metadata.size_bytes = size_bytes + block_rows.append(metadata.num_rows) + block_bytes.append(metadata.size_bytes) + return block_rows, block_bytes + + +def _zip_one_block( + block: Block, *other_blocks: Block, inverted: bool = False +) -> Tuple[Block, BlockMetadata]: + """Zip together `block` with `other_blocks`.""" + stats = BlockExecStats.builder() + # Concatenate other blocks. + # TODO(Clark): Extend BlockAccessor.zip() to work with N other blocks, + # so we don't need to do this concatenation. + builder = DelegatingBlockBuilder() + for other_block in other_blocks: + builder.add_block(other_block) + other_block = builder.build() + if inverted: + # Swap blocks if ordering was inverted during block alignment splitting. + block, other_block = other_block, block + # Zip block and other blocks. 
+ result = BlockAccessor.for_block(block).zip(other_block) + br = BlockAccessor.for_block(result) + return result, br.get_metadata(exec_stats=stats.build()) + + +def _get_num_rows_and_bytes(block: Block) -> Tuple[int, int]: + block = BlockAccessor.for_block(block) + return block.num_rows(), block.size_bytes() diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/resource_manager.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/resource_manager.py new file mode 100644 index 0000000000000000000000000000000000000000..914869380ec6c6fbec0d2bae8b52cf6b9b307dd4 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/resource_manager.py @@ -0,0 +1,651 @@ +import logging +import os +import time +from abc import ABC, abstractmethod +from collections import defaultdict +from typing import TYPE_CHECKING, Callable, Dict, Iterable, List, Optional + +from ray.data._internal.execution.interfaces.execution_options import ( + ExecutionOptions, + ExecutionResources, +) +from ray.data._internal.execution.interfaces.physical_operator import PhysicalOperator +from ray.data._internal.execution.operators.input_data_buffer import InputDataBuffer +from ray.data._internal.execution.util import memory_string +from ray.data.context import DataContext + +if TYPE_CHECKING: + from ray.data._internal.execution.streaming_executor_state import Topology + + +logger = logging.getLogger(__name__) +DEBUG_RESOURCE_MANAGER = os.environ.get("RAY_DATA_DEBUG_RESOURCE_MANAGER", "0") == "1" + + +class ResourceManager: + """A class that manages the resource usage of a streaming executor.""" + + # The interval in seconds at which the global resource limits are refreshed. + GLOBAL_LIMITS_UPDATE_INTERVAL_S = 10 + + # The fraction of the object store capacity that will be used as the default object + # store memory limit for the streaming executor, + # when `ReservationOpResourceAllocator` is enabled. 
+ DEFAULT_OBJECT_STORE_MEMORY_LIMIT_FRACTION = 0.5 + + # The fraction of the object store capacity that will be used as the default object + # store memory limit for the streaming executor, + # when `ReservationOpResourceAllocator` is not enabled. + DEFAULT_OBJECT_STORE_MEMORY_LIMIT_FRACTION_NO_RESERVATION = 0.25 + + def __init__( + self, + topology: "Topology", + options: ExecutionOptions, + get_total_resources: Callable[[], ExecutionResources], + data_context: DataContext, + ): + self._topology = topology + self._options = options + self._get_total_resources = get_total_resources + self._global_limits = ExecutionResources.zero() + self._global_limits_last_update_time = 0 + self._global_usage = ExecutionResources.zero() + self._global_running_usage = ExecutionResources.zero() + self._global_pending_usage = ExecutionResources.zero() + self._op_usages: Dict[PhysicalOperator, ExecutionResources] = {} + self._op_running_usages: Dict[PhysicalOperator, ExecutionResources] = {} + self._op_pending_usages: Dict[PhysicalOperator, ExecutionResources] = {} + # Object store memory usage internal to the operator, including the + # pending task outputs and op's internal output buffers. + self._mem_op_internal: Dict[PhysicalOperator, int] = defaultdict(int) + # Object store memory usage of the blocks that have been taken out of + # the operator, including the external output buffer in OpState, and the + # input buffers of the downstream operators. + self._mem_op_outputs: Dict[PhysicalOperator, int] = defaultdict(int) + # Whether to print debug information. + self._debug = DEBUG_RESOURCE_MANAGER + + self._downstream_fraction: Dict[PhysicalOperator, float] = {} + self._downstream_object_store_memory: Dict[PhysicalOperator, float] = {} + + self._op_resource_allocator: Optional["OpResourceAllocator"] = None + + if data_context.op_resource_reservation_enabled: + # We'll enable memory reservation if all operators have + # implemented accurate memory accounting. 
+ should_enable = all( + op.implements_accurate_memory_accounting() for op in topology + ) + if should_enable: + self._op_resource_allocator = ReservationOpResourceAllocator( + self, data_context.op_resource_reservation_ratio + ) + + self._object_store_memory_limit_fraction = ( + data_context.override_object_store_memory_limit_fraction + if data_context.override_object_store_memory_limit_fraction is not None + else ( + self.DEFAULT_OBJECT_STORE_MEMORY_LIMIT_FRACTION + if self.op_resource_allocator_enabled() + else self.DEFAULT_OBJECT_STORE_MEMORY_LIMIT_FRACTION_NO_RESERVATION + ) + ) + + def _estimate_object_store_memory(self, op, state) -> int: + # Don't count input refs towards dynamic memory usage, as they have been + # pre-created already outside this execution. + if isinstance(op, InputDataBuffer): + return 0 + + # Pending task outputs. + mem_op_internal = op.metrics.obj_store_mem_pending_task_outputs or 0 + # Op's internal output buffers. + mem_op_internal += op.metrics.obj_store_mem_internal_outqueue + + # Op's external output buffer. + mem_op_outputs = state.outqueue_memory_usage() + # Input buffers of the downstream operators. + for next_op in op.output_dependencies: + mem_op_outputs += ( + next_op.metrics.obj_store_mem_internal_inqueue + + next_op.metrics.obj_store_mem_pending_task_inputs + ) + + self._mem_op_internal[op] = mem_op_internal + self._mem_op_outputs[op] = mem_op_outputs + + return mem_op_internal + mem_op_outputs + + def update_usages(self): + """Recalculate resource usages.""" + # TODO(hchen): This method will be called frequently during the execution loop. + # And some computations are redundant. We should either remove redundant + # computations or remove this method entirely and compute usages on demand. 
+ self._global_usage = ExecutionResources(0, 0, 0) + self._global_running_usage = ExecutionResources(0, 0, 0) + self._global_pending_usage = ExecutionResources(0, 0, 0) + self._op_usages.clear() + self._op_running_usages.clear() + self._op_pending_usages.clear() + self._downstream_fraction.clear() + self._downstream_object_store_memory.clear() + + # Iterate from last to first operator. + num_ops_so_far = 0 + num_ops_total = len(self._topology) + for op, state in reversed(self._topology.items()): + # Update `self._op_usages`, `self._op_running_usages`, + # and `self._op_pending_usages`. + op.update_resource_usage() + op_usage = op.current_processor_usage() + op_running_usage = op.running_processor_usage() + op_pending_usage = op.pending_processor_usage() + + assert not op_usage.object_store_memory + assert not op_running_usage.object_store_memory + assert not op_pending_usage.object_store_memory + op_usage.object_store_memory = self._estimate_object_store_memory(op, state) + op_running_usage.object_store_memory = self._estimate_object_store_memory( + op, state + ) + self._op_usages[op] = op_usage + self._op_running_usages[op] = op_running_usage + self._op_pending_usages[op] = op_pending_usage + + # Update `self._global_usage`, `self._global_running_usage`, + # and `self._global_pending_usage`. + self._global_usage = self._global_usage.add(op_usage) + self._global_running_usage = self._global_running_usage.add( + op_running_usage + ) + self._global_pending_usage = self._global_pending_usage.add( + op_pending_usage + ) + + # Update `self._downstream_fraction` and `_downstream_object_store_memory`. + # Subtract one from denom to account for input buffer. 
+ f = (1.0 + num_ops_so_far) / max(1.0, num_ops_total - 1.0) + num_ops_so_far += 1 + self._downstream_fraction[op] = min(1.0, f) + self._downstream_object_store_memory[ + op + ] = self._global_usage.object_store_memory + + # Update operator's object store usage, which is used by + # DatasetStats and updated on the Ray Data dashboard. + op._metrics.obj_store_mem_used = op_usage.object_store_memory + + if self._op_resource_allocator is not None: + self._op_resource_allocator.update_usages() + + def get_global_usage(self) -> ExecutionResources: + """Return the global resource usage at the current time.""" + return self._global_usage + + def get_global_running_usage(self) -> ExecutionResources: + """Return the global running resource usage at the current time.""" + return self._global_running_usage + + def get_global_pending_usage(self) -> ExecutionResources: + """Return the global pending resource usage at the current time.""" + return self._global_pending_usage + + def get_global_limits(self) -> ExecutionResources: + """Return the global resource limits at the current time. + + This method autodetects any unspecified execution resource limits based on the + current cluster size, refreshing these values periodically to support cluster + autoscaling. 
+ """ + if ( + time.time() - self._global_limits_last_update_time + < self.GLOBAL_LIMITS_UPDATE_INTERVAL_S + ): + return self._global_limits + + self._global_limits_last_update_time = time.time() + default_limits = self._options.resource_limits + exclude = self._options.exclude_resources + total_resources = self._get_total_resources() + default_mem_fraction = self._object_store_memory_limit_fraction + total_resources.object_store_memory *= default_mem_fraction + self._global_limits = default_limits.min(total_resources).subtract(exclude) + return self._global_limits + + def get_op_usage(self, op: PhysicalOperator) -> ExecutionResources: + """Return the resource usage of the given operator at the current time.""" + return self._op_usages[op] + + def get_op_usage_str(self, op: PhysicalOperator) -> str: + """Return a human-readable string representation of the resource usage of + the given operator.""" + usage_str = f"{self._op_running_usages[op].cpu:.1f} CPU" + if self._op_running_usages[op].gpu: + usage_str += f", {self._op_running_usages[op].gpu:.1f} GPU" + usage_str += ( + f", {self._op_running_usages[op].object_store_memory_str()} object store" + ) + if self._debug: + usage_str += ( + f" (in={memory_string(self._mem_op_internal[op])}," + f"out={memory_string(self._mem_op_outputs[op])})" + ) + if ( + isinstance(self._op_resource_allocator, ReservationOpResourceAllocator) + and op in self._op_resource_allocator._op_budgets + ): + budget = self._op_resource_allocator._op_budgets[op] + usage_str += f", budget=(cpu={budget.cpu:.1f}" + usage_str += f",gpu={budget.gpu:.1f}" + usage_str += f",object store={budget.object_store_memory_str()})" + return usage_str + + def get_downstream_fraction(self, op: PhysicalOperator) -> float: + """Return the downstream fraction of the given operator.""" + return self._downstream_fraction[op] + + def get_downstream_object_store_memory(self, op: PhysicalOperator) -> float: + """Return the downstream object store memory usage of the given 
operator.""" + return self._downstream_object_store_memory[op] + + def op_resource_allocator_enabled(self) -> bool: + """Return whether OpResourceAllocator is enabled.""" + return self._op_resource_allocator is not None + + @property + def op_resource_allocator(self) -> "OpResourceAllocator": + """Return the OpResourceAllocator.""" + assert self._op_resource_allocator is not None + return self._op_resource_allocator + + +class OpResourceAllocator(ABC): + """An interface for dynamic operator resource allocation. + + This interface allows dynamically allocating available resources to each operator, + limiting how many tasks each operator can submit, and how much data each operator + can read from its running tasks. + """ + + def __init__(self, resource_manager: ResourceManager): + self._resource_manager = resource_manager + + @abstractmethod + def update_usages(self): + """Callback to update resource usages.""" + ... + + @abstractmethod + def can_submit_new_task(self, op: PhysicalOperator) -> bool: + """Return whether the given operator can submit a new task.""" + ... + + @abstractmethod + def max_task_output_bytes_to_read(self, op: PhysicalOperator) -> Optional[int]: + """Return the maximum bytes of pending task outputs can be read for + the given operator. None means no limit.""" + ... + + @abstractmethod + def get_budget(self, op: PhysicalOperator) -> ExecutionResources: + """Return the budget for the given operator.""" + ... + + +class ReservationOpResourceAllocator(OpResourceAllocator): + """An OpResourceAllocator implementation that reserves resources for each operator. + + This class reserves memory and CPU resources for eligible operators, and considers + runtime resource usages to limit the resources that each operator can use. + + It works in the following way: + 1. An operator is eligible for resource reservation, if it has enabled throttling + and hasn't completed. 
Ineligible operators are not throttled, but + their usage will be accounted for their upstream eligible operators. E.g., for + such a dataset "map1->limit->map2->streaming_split", we'll treat "map1->limit" as + a group and "map2->streaming_split" as another group. + 2. For each eligible operator, we reserve `reservation_ratio * global_resources / + num_eligible_ops` resources, half of which is reserved only for the operator + outputs, excluding pending task outputs. + 3. Non-reserved resources are shared among all operators. + 4. In each scheduling iteration, each eligible operator will get "remaining of their + own reserved resources" + "remaining of shared resources / num_eligible_ops" + resources. + + The `reservation_ratio` is set to 50% by default. Users can tune this value to + adjust how aggressive or conservative the resource allocation is. A higher value + will make the resource allocation more even, but may lead to underutilization and + worse performance. And vice versa. + """ + + class IdleDetector: + """Utility class for detecting idle operators. + + Note, stalling can happen when there are less resources than Data executor + expects. E.g., when some resources are preempted by non-Data code, see + `test_no_deadlock_on_resource_contention` as an example. + + This class is used to detect potential stalling and allow the execution + to make progress. + """ + + # The interval to detect idle operators. + # When downstream is idle, we'll allow reading at least one task output + # per this interval, + DETECTION_INTERVAL_S = 10.0 + # Print a warning if an operator is idle for this time. + WARN_ON_IDLE_TIME_S = 60.0 + # Whether a warning has been printed. 
+ _warn_printed = False + + def __init__(self): + # per-op fields + self.last_num_outputs = defaultdict(int) + self.last_output_time = defaultdict(lambda: time.time()) + self.last_detection_time = defaultdict(lambda: time.time()) + + def detect_idle(self, op: PhysicalOperator): + cur_time = time.time() + if cur_time - self.last_detection_time[op] > self.DETECTION_INTERVAL_S: + cur_num_outputs = op.metrics.num_task_outputs_generated + if cur_num_outputs > self.last_num_outputs[op]: + self.last_num_outputs[op] = cur_num_outputs + self.last_output_time[op] = cur_time + self.last_detection_time[op] = cur_time + else: + self.last_detection_time[op] = cur_time + self.print_warning_if_idle_for_too_long( + op, cur_time - self.last_output_time[op] + ) + return True + return False + + @classmethod + def print_warning_if_idle_for_too_long( + cls, op: PhysicalOperator, idle_time: float + ): + """Print a warning if an operator is idle for too long.""" + if idle_time < cls.WARN_ON_IDLE_TIME_S or cls._warn_printed: + return + cls._warn_printed = True + msg = ( + f"Operator {op} is running but has no outputs for {idle_time} seconds." + " Execution may be slower than expected.\n" + "Ignore this warning if your UDF is expected to be slow." + " Otherwise, this can happen when there are fewer cluster resources" + " available to Ray Data than expected." + " If you have non-Data tasks or actors running in the cluster, exclude" + " their resources from Ray Data with" + " `DataContext.get_current().execution_options.exclude_resources`." + " This message will only print once." + ) + logger.warning(msg) + + def __init__(self, resource_manager: ResourceManager, reservation_ratio: float): + super().__init__(resource_manager) + self._reservation_ratio = reservation_ratio + assert 0.0 <= self._reservation_ratio <= 1.0 + # Per-op reserved resources, excluding `_reserved_for_op_outputs`. 
+ self._op_reserved: Dict[PhysicalOperator, ExecutionResources] = {} + # Memory reserved exclusively for the outputs of each operator. + # "Op outputs" refer to blocks that have been taken out of an operator, + # i.e., `RessourceManager._mem_op_outputs`. + # + # Note, if we don't reserve memory for op outputs, all the budget may be used by + # the pending task outputs, and/or op's internal output buffers (the latter can + # happen when `preserve_order=True`). + # Then we'll have no budget to pull blocks from the op. + self._reserved_for_op_outputs: Dict[PhysicalOperator, float] = {} + # Total shared resources. + self._total_shared = ExecutionResources.zero() + # Resource budgets for each operator, excluding `_reserved_for_op_outputs`. + self._op_budgets: Dict[PhysicalOperator, ExecutionResources] = {} + # Whether each operator has reserved the minimum resources to run + # at least one task. + # This is used to avoid edge cases where the entire resource limits are not + # enough to run one task of each op. + # See `test_no_deadlock_on_small_cluster_resources` as an example. 
+ self._reserved_min_resources: Dict[PhysicalOperator, bool] = {} + + self._cached_global_limits = ExecutionResources.zero() + self._cached_num_eligible_ops = 0 + + self._idle_detector = self.IdleDetector() + + def _is_op_eligible(self, op: PhysicalOperator) -> bool: + """Whether the op is eligible for memory reservation.""" + return not op.throttling_disabled() and not op.completed() + + def _get_eligible_ops(self) -> List[PhysicalOperator]: + return [ + op for op in self._resource_manager._topology if self._is_op_eligible(op) + ] + + def _update_reservation(self): + global_limits = self._resource_manager.get_global_limits() + eligible_ops = self._get_eligible_ops() + + if ( + global_limits == self._cached_global_limits + and len(eligible_ops) == self._cached_num_eligible_ops + ): + return + self._cached_global_limits = global_limits + self._cached_num_eligible_ops = len(eligible_ops) + + self._op_reserved.clear() + self._reserved_for_op_outputs.clear() + self._reserved_min_resources.clear() + self._total_shared = global_limits.copy() + + if len(eligible_ops) == 0: + return + + # Reserve `reservation_ratio * global_limits / num_ops` resources for each + # operator. + default_reserved = global_limits.scale( + self._reservation_ratio / (len(eligible_ops)) + ) + for op in eligible_ops: + # Reserve at least half of the default reserved resources for the outputs. + # This makes sure that we will have enough budget to pull blocks from the + # op. + self._reserved_for_op_outputs[op] = max( + default_reserved.object_store_memory / 2, 1.0 + ) + # Calculate the minimum amount of resources to reserve. + # 1. Make sure the reserved resources are at least to allow one task. + min_reserved = op.incremental_resource_usage().copy() + # 2. To ensure that all GPUs are utilized, reserve enough resource budget + # to launch one task for each worker. 
+ if op.base_resource_usage().gpu > 0: + min_workers = sum( + pool.min_size() for pool in op.get_autoscaling_actor_pools() + ) + min_reserved.object_store_memory *= min_workers + # Also include `reserved_for_op_outputs`. + min_reserved.object_store_memory += self._reserved_for_op_outputs[op] + # Total resources we want to reserve for this operator. + op_total_reserved = default_reserved.max(min_reserved) + if op_total_reserved.satisfies_limit(self._total_shared): + # If the remaining resources are enough to reserve `op_total_reserved`, + # subtract it from `self._total_shared` and reserve it for this op. + self._reserved_min_resources[op] = True + self._total_shared = self._total_shared.subtract(op_total_reserved) + self._op_reserved[op] = op_total_reserved + self._op_reserved[ + op + ].object_store_memory -= self._reserved_for_op_outputs[op] + else: + # If the remaining resources are not enough to reserve the minimum + # resources for this operator, we'll only reserve the minimum object + # store memory, but not the CPU and GPU resources. + # Because Ray Core doesn't allow CPU/GPU resources to be oversubscribed. + # Note, we reserve minimum resources first for the upstream + # ops. Downstream ops need to wait for upstream ops to finish + # and release resources. 
+ self._reserved_min_resources[op] = False + self._op_reserved[op] = ExecutionResources( + 0, + 0, + min_reserved.object_store_memory + - self._reserved_for_op_outputs[op], + ) + self._total_shared = self._total_shared.subtract( + ExecutionResources(0, 0, min_reserved.object_store_memory) + ) + + self._total_shared = self._total_shared.max(ExecutionResources.zero()) + + def can_submit_new_task(self, op: PhysicalOperator) -> bool: + if op not in self._op_budgets: + return True + budget = self._op_budgets[op] + res = op.incremental_resource_usage().satisfies_limit(budget) + return res + + def get_budget(self, op: PhysicalOperator) -> ExecutionResources: + return self._op_budgets[op] + + def _should_unblock_streaming_output_backpressure( + self, op: PhysicalOperator + ) -> bool: + # In some edge cases, the downstream operators may have no enough resources to + # launch tasks. Then we should temporarily unblock the streaming output + # backpressure by allowing reading at least 1 block. So the current operator + # can finish at least one task and yield resources to the downstream operators. + for next_op in self._get_downstream_eligible_ops(op): + if not self._reserved_min_resources[next_op]: + # Case 1: the downstream operator hasn't reserved the minimum resources + # to run at least one task. + return True + # Case 2: the downstream operator has reserved the minimum resources, but + # the resources are preempted by non-Data tasks or actors. + # We don't have a good way to detect this case, so we'll unblock + # backpressure when the downstream operator has been idle for a while. + if self._idle_detector.detect_idle(next_op): + return True + return False + + def _get_op_outputs_usage_with_downstream(self, op: PhysicalOperator) -> float: + """Get the outputs memory usage of the given operator, including the downstream + ineligible operators. + """ + # Outputs usage of the current operator. 
+ op_outputs_usage = self._resource_manager._mem_op_outputs[op] + # Also account the downstream ineligible operators' memory usage. + op_outputs_usage += sum( + self._resource_manager.get_op_usage(next_op).object_store_memory + for next_op in self._get_downstream_ineligible_ops(op) + ) + return op_outputs_usage + + def max_task_output_bytes_to_read(self, op: PhysicalOperator) -> Optional[int]: + if op not in self._op_budgets: + return None + res = self._op_budgets[op].object_store_memory + # Add the remaining of `_reserved_for_op_outputs`. + op_outputs_usage = self._get_op_outputs_usage_with_downstream(op) + res += max(self._reserved_for_op_outputs[op] - op_outputs_usage, 0) + res = int(res) + assert res >= 0 + if res == 0 and self._should_unblock_streaming_output_backpressure(op): + res = 1 + return res + + def _get_downstream_ineligible_ops( + self, op: PhysicalOperator + ) -> Iterable[PhysicalOperator]: + """Get the downstream ineligible operators of the given operator. + + E.g., + - "cur_map->downstream_map" will return an empty list. + - "cur_map->limit1->limit2->downstream_map" will return [limit1, limit2]. + """ + for next_op in op.output_dependencies: + if not self._is_op_eligible(next_op): + yield next_op + yield from self._get_downstream_ineligible_ops(next_op) + + def _get_downstream_eligible_ops( + self, op: PhysicalOperator + ) -> Iterable[PhysicalOperator]: + """Get the downstream eligible operators of the given operator, ignoring + intermediate ineligible operators. + + E.g., + - "cur_map->downstream_map" will return [downstream_map]. + - "cur_map->limit1->limit2->downstream_map" will return [downstream_map]. 
+ """ + for next_op in op.output_dependencies: + if self._is_op_eligible(next_op): + yield next_op + else: + yield from self._get_downstream_eligible_ops(next_op) + + def update_usages(self): + self._update_reservation() + + self._op_budgets.clear() + eligible_ops = self._get_eligible_ops() + if len(eligible_ops) == 0: + return + + # Remaining of shared resources. + remaining_shared = self._total_shared + for op in eligible_ops: + # Calculate the memory usage of the operator. + op_mem_usage = 0 + # Add the memory usage of the operator itself, + # excluding `_reserved_for_op_outputs`. + op_mem_usage += self._resource_manager._mem_op_internal[op] + # Add the portion of op outputs usage that has + # exceeded `_reserved_for_op_outputs`. + op_outputs_usage = self._get_op_outputs_usage_with_downstream(op) + op_mem_usage += max(op_outputs_usage - self._reserved_for_op_outputs[op], 0) + op_usage = self._resource_manager.get_op_usage(op).copy() + op_usage.object_store_memory = op_mem_usage + op_reserved = self._op_reserved[op] + # How much of the reserved resources are remaining. + op_reserved_remaining = op_reserved.subtract(op_usage).max( + ExecutionResources.zero() + ) + self._op_budgets[op] = op_reserved_remaining + # How much of the reserved resources are exceeded. + # If exceeded, we need to subtract from the remaining shared resources. + op_reserved_exceeded = op_usage.subtract(op_reserved).max( + ExecutionResources.zero() + ) + remaining_shared = remaining_shared.subtract(op_reserved_exceeded) + + remaining_shared = remaining_shared.max(ExecutionResources.zero()) + + # Allocate the remaining shared resources to each operator. + for i, op in enumerate(reversed(eligible_ops)): + # By default, divide the remaining shared resources equally. + op_shared = remaining_shared.scale(1.0 / (len(eligible_ops) - i)) + # But if the op's budget is less than `incremental_resource_usage`, + # it will be useless. 
So we'll let the downstream operator + # borrow some resources from the upstream operator, if remaining_shared + # is still enough. + to_borrow = ( + op.incremental_resource_usage() + .subtract(self._op_budgets[op].add(op_shared)) + .max(ExecutionResources.zero()) + ) + if not to_borrow.is_zero() and op_shared.add(to_borrow).satisfies_limit( + remaining_shared + ): + op_shared = op_shared.add(to_borrow) + remaining_shared = remaining_shared.subtract(op_shared) + assert remaining_shared.is_non_negative(), ( + remaining_shared, + op, + op_shared, + to_borrow, + ) + self._op_budgets[op] = self._op_budgets[op].add(op_shared) + # We don't limit GPU resources, as not all operators + # use GPU resources. + self._op_budgets[op].gpu = float("inf") diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/streaming_executor.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/streaming_executor.py new file mode 100644 index 0000000000000000000000000000000000000000..9342bebc098c86c72c9116a3f3d8d17ea4fb7ef3 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/streaming_executor.py @@ -0,0 +1,502 @@ +import logging +import threading +import time +import uuid +from typing import Dict, Iterator, List, Optional + +from ray.data._internal.execution.autoscaler import create_autoscaler +from ray.data._internal.execution.backpressure_policy import ( + BackpressurePolicy, + get_backpressure_policies, +) +from ray.data._internal.execution.execution_callback import get_execution_callbacks +from ray.data._internal.execution.interfaces import ( + ExecutionResources, + Executor, + OutputIterator, + PhysicalOperator, + RefBundle, +) +from ray.data._internal.execution.operators.input_data_buffer import InputDataBuffer +from ray.data._internal.execution.resource_manager import ResourceManager +from ray.data._internal.execution.streaming_executor_state import ( + OpState, + Topology, + build_streaming_topology, + 
process_completed_tasks, + select_operator_to_run, + update_operator_states, +) +from ray.data._internal.logging import get_log_directory +from ray.data._internal.progress_bar import ProgressBar +from ray.data._internal.stats import DatasetStats, StatsManager +from ray.data.context import OK_PREFIX, WARN_PREFIX, DataContext + +logger = logging.getLogger(__name__) + +# Force a progress bar update after this many events processed . This avoids the +# progress bar seeming to stall for very large scale workloads. +PROGRESS_BAR_UPDATE_INTERVAL = 50 + +# Interval for logging execution progress updates and operator metrics. +DEBUG_LOG_INTERVAL_SECONDS = 5 + +# Visible for testing. +_num_shutdown = 0 + + +class StreamingExecutor(Executor, threading.Thread): + """A streaming Dataset executor. + + This implementation executes Dataset DAGs in a fully streamed way. It runs + by setting up the operator topology, and then routing blocks through operators in + a way that maximizes throughput under resource constraints. + """ + + def __init__(self, data_context: DataContext, dataset_tag: str = "unknown_dataset"): + self._data_context = data_context + self._start_time: Optional[float] = None + self._initial_stats: Optional[DatasetStats] = None + self._final_stats: Optional[DatasetStats] = None + self._global_info: Optional[ProgressBar] = None + + self._execution_id = uuid.uuid4().hex + + # The executor can be shutdown while still running. + self._shutdown_lock = threading.RLock() + self._execution_started = False + self._shutdown = False + + # Internal execution state shared across thread boundaries. We run the control + # loop on a separate thread so that it doesn't become stalled between + # generator `yield`s. 
+ self._topology: Optional[Topology] = None + self._output_node: Optional[OpState] = None + self._backpressure_policies: List[BackpressurePolicy] = [] + + self._dataset_tag = dataset_tag + # Stores if an operator is completed, + # used for marking when an op has just completed. + self._has_op_completed: Optional[Dict[PhysicalOperator, bool]] = None + self._max_errored_blocks = self._data_context.max_errored_blocks + self._num_errored_blocks = 0 + + self._last_debug_log_time = 0 + + Executor.__init__(self, self._data_context.execution_options) + thread_name = f"StreamingExecutor-{self._execution_id}" + threading.Thread.__init__(self, daemon=True, name=thread_name) + + def execute( + self, dag: PhysicalOperator, initial_stats: Optional[DatasetStats] = None + ) -> Iterator[RefBundle]: + """Executes the DAG using a streaming execution strategy. + + We take an event-loop approach to scheduling. We block on the next scheduling + event using `ray.wait`, updating operator state and dispatching new tasks. + """ + + self._initial_stats = initial_stats + self._start_time = time.perf_counter() + + if not isinstance(dag, InputDataBuffer): + if self._data_context.print_on_execution_start: + message = "Starting execution of Dataset." + log_path = get_log_directory() + if log_path is not None: + message += f" Full logs are in {log_path}" + logger.info(message) + logger.info(f"Execution plan of Dataset: {dag}") + + logger.debug("Execution config: %s", self._options) + + # Note: DAG must be initialized in order to query num_outputs_total. + # Note: Initialize global progress bar before building the streaming + # topology so bars are created in the same order as they should be + # displayed. This is done to ensure correct ordering within notebooks. 
+ # TODO(zhilong): Implement num_output_rows_total for all + # AllToAllOperators + self._global_info = ProgressBar( + "Running", dag.num_output_rows_total(), unit="row" + ) + + # Setup the streaming DAG topology and start the runner thread. + self._topology, _ = build_streaming_topology(dag, self._options) + self._resource_manager = ResourceManager( + self._topology, + self._options, + lambda: self._autoscaler.get_total_resources(), + self._data_context, + ) + self._backpressure_policies = get_backpressure_policies(self._topology) + self._autoscaler = create_autoscaler( + self._topology, + self._resource_manager, + self._execution_id, + ) + + self._has_op_completed = {op: False for op in self._topology} + + self._output_node: OpState = self._topology[dag] + StatsManager.register_dataset_to_stats_actor( + self._dataset_tag, + self._get_operator_tags(), + ) + for callback in get_execution_callbacks(self._data_context): + callback.before_execution_starts() + + self.start() + self._execution_started = True + + class StreamIterator(OutputIterator): + def __init__(self, outer: Executor): + self._outer = outer + + def get_next(self, output_split_idx: Optional[int] = None) -> RefBundle: + try: + item = self._outer._output_node.get_output_blocking( + output_split_idx + ) + if self._outer._global_info: + self._outer._global_info.update( + item.num_rows(), dag.num_output_rows_total() + ) + return item + # Needs to be BaseException to catch KeyboardInterrupt. Otherwise we + # can leave dangling progress bars by skipping shutdown. 
+ except BaseException as e: + self._outer.shutdown(isinstance(e, StopIteration)) + raise + + def __del__(self): + self._outer.shutdown() + + return StreamIterator(self) + + def __del__(self): + self.shutdown() + + def shutdown(self, execution_completed: bool = True): + global _num_shutdown + + with self._shutdown_lock: + if not self._execution_started or self._shutdown: + return + logger.debug(f"Shutting down {self}.") + _num_shutdown += 1 + self._shutdown = True + # Give the scheduling loop some time to finish processing. + self.join(timeout=2.0) + self._update_stats_metrics( + state="FINISHED" if execution_completed else "FAILED", + force_update=True, + ) + # Once Dataset execution completes, mark it as complete + # and remove last cached execution stats. + StatsManager.clear_last_execution_stats(self._dataset_tag) + # Freeze the stats and save it. + self._final_stats = self._generate_stats() + stats_summary_string = self._final_stats.to_summary().to_string( + include_parent=False + ) + if self._data_context.enable_auto_log_stats: + logger.info(stats_summary_string) + # Close the progress bars from top to bottom to avoid them jumping + # around in the console after completion. + if self._global_info: + # Set the appropriate description that summarizes + # the result of dataset execution. + if execution_completed: + prog_bar_msg = ( + f"{OK_PREFIX} Dataset execution finished in " + f"{self._final_stats.time_total_s:.2f} seconds" + ) + else: + prog_bar_msg = f"{WARN_PREFIX} Dataset execution failed" + self._global_info.set_description(prog_bar_msg) + self._global_info.close() + for op, state in self._topology.items(): + op.shutdown() + state.close_progress_bars() + self._autoscaler.on_executor_shutdown() + + def run(self): + """Run the control loop in a helper thread. + + Results are returned via the output node's outqueue. + """ + try: + # Run scheduling loop until complete. 
+ while True: + t_start = time.process_time() + # use process_time to avoid timing ray.wait in _scheduling_loop_step + continue_sched = self._scheduling_loop_step(self._topology) + if self._initial_stats: + self._initial_stats.streaming_exec_schedule_s.add( + time.process_time() - t_start + ) + if not continue_sched or self._shutdown: + break + for callback in get_execution_callbacks(self._data_context): + callback.after_execution_succeeds() + except Exception as e: + # Propagate it to the result iterator. + self._output_node.mark_finished(e) + for callback in get_execution_callbacks(self._data_context): + callback.after_execution_fails(e) + finally: + # Signal end of results. + self._output_node.mark_finished() + + def get_stats(self): + """Return the stats object for the streaming execution. + + The stats object will be updated as streaming execution progresses. + """ + if self._final_stats: + return self._final_stats + else: + return self._generate_stats() + + def _generate_stats(self) -> DatasetStats: + """Create a new stats object reflecting execution status so far.""" + stats = self._initial_stats or DatasetStats(metadata={}, parent=None) + for op in self._topology: + if isinstance(op, InputDataBuffer): + continue + builder = stats.child_builder(op.name, override_start_time=self._start_time) + stats = builder.build_multioperator(op.get_stats()) + stats.extra_metrics = op.metrics.as_dict() + stats.streaming_exec_schedule_s = ( + self._initial_stats.streaming_exec_schedule_s + if self._initial_stats + else None + ) + return stats + + def _scheduling_loop_step(self, topology: Topology) -> bool: + """Run one step of the scheduling loop. + + This runs a few general phases: + 1. Waiting for the next task completion using `ray.wait()`. + 2. Pulling completed refs into operator outqueues. + 3. Selecting and dispatching new inputs to operators. + + Returns: + True if we should continue running the scheduling loop. 
+ """ + self._resource_manager.update_usages() + # Note: calling process_completed_tasks() is expensive since it incurs + # ray.wait() overhead, so make sure to allow multiple dispatch per call for + # greater parallelism. + num_errored_blocks = process_completed_tasks( + topology, + self._resource_manager, + self._max_errored_blocks, + ) + if self._max_errored_blocks > 0: + self._max_errored_blocks -= num_errored_blocks + self._num_errored_blocks += num_errored_blocks + + self._resource_manager.update_usages() + # Dispatch as many operators as we can for completed tasks. + self._report_current_usage() + op = select_operator_to_run( + topology, + self._resource_manager, + self._backpressure_policies, + self._autoscaler, + ensure_at_least_one_running=self._consumer_idling(), + ) + + i = 0 + while op is not None: + i += 1 + if i % PROGRESS_BAR_UPDATE_INTERVAL == 0: + self._refresh_progress_bars(topology) + topology[op].dispatch_next_task() + self._resource_manager.update_usages() + op = select_operator_to_run( + topology, + self._resource_manager, + self._backpressure_policies, + self._autoscaler, + ensure_at_least_one_running=self._consumer_idling(), + ) + + update_operator_states(topology) + self._refresh_progress_bars(topology) + + self._update_stats_metrics(state="RUNNING") + if time.time() - self._last_debug_log_time >= DEBUG_LOG_INTERVAL_SECONDS: + _log_op_metrics(topology) + _debug_dump_topology(topology, self._resource_manager) + self._last_debug_log_time = time.time() + + # Log metrics of newly completed operators. + for op in topology: + if op.completed() and not self._has_op_completed[op]: + log_str = ( + f"Operator {op} completed. " + f"Operator Metrics:\n{op._metrics.as_dict()}" + ) + logger.debug(log_str) + self._has_op_completed[op] = True + + # Keep going until all operators run to completion. 
+ return not all(op.completed() for op in topology) + + def _refresh_progress_bars(self, topology: Topology): + # Update the progress bar to reflect scheduling decisions. + for op_state in topology.values(): + op_state.refresh_progress_bar(self._resource_manager) + # Refresh the global progress bar to update elapsed time progress. + if self._global_info: + self._global_info.refresh() + + def _consumer_idling(self) -> bool: + """Returns whether the user thread is blocked on topology execution.""" + return len(self._output_node.outqueue) == 0 + + def _report_current_usage(self) -> None: + # running_usage is the amount of resources that have been requested but + # not necessarily available + # TODO(sofian) https://github.com/ray-project/ray/issues/47520 + # We need to split the reported resources into running, pending-scheduling, + # pending-node-assignment. + running_usage = self._resource_manager.get_global_running_usage() + pending_usage = self._resource_manager.get_global_pending_usage() + limits = self._resource_manager.get_global_limits() + resources_status = ( + # TODO(scottjlee): Add dataset name/ID to progress bar output. + "Running Dataset. Active & requested resources: " + f"{running_usage.cpu:.4g}/{limits.cpu:.4g} CPU, " + ) + if running_usage.gpu > 0: + resources_status += f"{running_usage.gpu:.4g}/{limits.gpu:.4g} GPU, " + resources_status += ( + f"{running_usage.object_store_memory_str()}/" + f"{limits.object_store_memory_str()} object store" + ) + + # Only include pending section when there are pending resources. 
+ if pending_usage.cpu or pending_usage.gpu: + if pending_usage.cpu and pending_usage.gpu: + pending_str = ( + f"{pending_usage.cpu:.4g} CPU, {pending_usage.gpu:.4g} GPU" + ) + elif pending_usage.cpu: + pending_str = f"{pending_usage.cpu:.4g} CPU" + else: + pending_str = f"{pending_usage.gpu:.4g} GPU" + resources_status += f" (pending: {pending_str})" + if self._global_info: + self._global_info.set_description(resources_status) + + def _get_operator_tags(self): + """Returns a list of operator tags.""" + return [f"{op.name}{i}" for i, op in enumerate(self._topology)] + + def _get_state_dict(self, state): + last_op, last_state = list(self._topology.items())[-1] + return { + "state": state, + "progress": last_state.num_completed_tasks, + "total": last_op.num_outputs_total(), + "end_time": time.time() if state != "RUNNING" else None, + "operators": { + f"{op.name}{i}": { + "name": op.name, + "progress": op_state.num_completed_tasks, + "total": op.num_outputs_total(), + "state": state, + } + for i, (op, op_state) in enumerate(self._topology.items()) + }, + } + + def _update_stats_metrics(self, state: str, force_update: bool = False): + StatsManager.update_execution_metrics( + self._dataset_tag, + [op.metrics for op in self._topology], + self._get_operator_tags(), + self._get_state_dict(state=state), + force_update=force_update, + ) + + +def _validate_dag(dag: PhysicalOperator, limits: ExecutionResources) -> None: + """Raises an exception on invalid DAGs. + + It checks if the the sum of min actor pool sizes are larger than the resource + limit, as well as other unsupported resource configurations. + + This should be called prior to creating the topology from the DAG. + + Args: + dag: The DAG to validate. + limits: The limits to validate against. 
+ """ + + seen = set() + + def walk(op): + seen.add(op) + for parent in op.input_dependencies: + if parent not in seen: + yield from walk(parent) + yield op + + base_usage = ExecutionResources(cpu=1) + for op in walk(dag): + base_usage = base_usage.add(op.base_resource_usage()) + + if not base_usage.satisfies_limit(limits): + error_message = ( + "The current cluster doesn't have the required resources to execute your " + "Dataset pipeline:\n" + ) + if base_usage.cpu > limits.cpu: + error_message += ( + f"- Your application needs {base_usage.cpu} CPU(s), but your cluster " + f"only has {limits.cpu}.\n" + ) + if base_usage.gpu > limits.gpu: + error_message += ( + f"- Your application needs {base_usage.gpu} GPU(s), but your cluster " + f"only has {limits.gpu}.\n" + ) + if base_usage.object_store_memory > limits.object_store_memory: + error_message += ( + f"- Your application needs {base_usage.object_store_memory}B object " + f"store memory, but your cluster only has " + f"{limits.object_store_memory}B.\n" + ) + raise ValueError(error_message.strip()) + + +def _debug_dump_topology(topology: Topology, resource_manager: ResourceManager) -> None: + """Log current execution state for the topology for debugging. + + Args: + topology: The topology to debug. + resource_manager: The resource manager for this topology. + """ + logger.debug("Execution Progress:") + for i, (op, state) in enumerate(topology.items()): + logger.debug( + f"{i}: {state.summary_str(resource_manager)}, " + f"Blocks Outputted: {state.num_completed_tasks}/{op.num_outputs_total()}" + ) + + +def _log_op_metrics(topology: Topology) -> None: + """Logs the metrics of each operator. + + Args: + topology: The topology to debug. 
+ """ + log_str = "Operator Metrics:\n" + for op in topology: + log_str += f"{op.name}: {op.metrics.as_dict()}\n" + logger.debug(log_str) diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/streaming_executor_state.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/streaming_executor_state.py new file mode 100644 index 0000000000000000000000000000000000000000..f91c4f536da6a5a62d5d2d4ab3c5feab2ac8edd6 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/streaming_executor_state.py @@ -0,0 +1,681 @@ +"""Contains classes that encapsulate streaming executor state. + +This is split out from streaming_executor.py to facilitate better unit testing. +""" + +import logging +import math +import threading +import time +from collections import defaultdict +from dataclasses import dataclass +from typing import Dict, List, Optional, Tuple + +import ray +from ray.data._internal.execution.autoscaler import Autoscaler +from ray.data._internal.execution.backpressure_policy import BackpressurePolicy +from ray.data._internal.execution.bundle_queue import create_bundle_queue +from ray.data._internal.execution.interfaces import ( + ExecutionOptions, + ExecutionResources, + PhysicalOperator, + RefBundle, +) +from ray.data._internal.execution.interfaces.physical_operator import ( + DataOpTask, + MetadataOpTask, + OpTask, + Waitable, +) +from ray.data._internal.execution.operators.base_physical_operator import ( + AllToAllOperator, +) +from ray.data._internal.execution.operators.input_data_buffer import InputDataBuffer +from ray.data._internal.execution.resource_manager import ResourceManager +from ray.data._internal.progress_bar import ProgressBar +from ray.data.context import DataContext + +logger = logging.getLogger(__name__) + +# Holds the full execution state of the streaming topology. It's a dict mapping each +# operator to tracked streaming exec state. 
+Topology = Dict[PhysicalOperator, "OpState"] + + +class OpBufferQueue: + """A FIFO queue to buffer RefBundles between upstream and downstream operators. + This class is thread-safe. + """ + + def __init__(self): + self._num_blocks = 0 + self._queue = create_bundle_queue() + self._num_per_split = defaultdict(int) + self._lock = threading.Lock() + # Used to buffer output RefBundles indexed by output splits. + self._outputs_by_split = defaultdict(create_bundle_queue) + super().__init__() + + @property + def memory_usage(self) -> int: + """The total memory usage of the queue in bytes.""" + with self._lock: + # The split queues contain bundles popped from the main queue. So, a bundle + # will either be in the main queue or in one of the split queues, and we + # don't need to worry about double counting. + return self._queue.estimate_size_bytes() + sum( + split_queue.estimate_size_bytes() + for split_queue in self._outputs_by_split.values() + ) + + @property + def num_blocks(self) -> int: + """The total number of blocks in the queue.""" + with self._lock: + return self._num_blocks + + def __len__(self): + with self._lock: + return len(self._queue) + + def has_next(self, output_split_idx: Optional[int] = None) -> bool: + """Whether next RefBundle is available. + + Args: + output_split_idx: If specified, only check ref bundles with the + given output split. + """ + if output_split_idx is None: + with self._lock: + return len(self._queue) > 0 + else: + with self._lock: + return self._num_per_split[output_split_idx] > 0 + + def append(self, ref: RefBundle): + """Append a RefBundle to the queue.""" + with self._lock: + self._queue.add(ref) + self._num_blocks += len(ref.blocks) + if ref.output_split_idx is not None: + self._num_per_split[ref.output_split_idx] += 1 + + def pop(self, output_split_idx: Optional[int] = None) -> Optional[RefBundle]: + """Pop a RefBundle from the queue. + Args: + output_split_idx: If specified, only pop a RefBundle + with the given output split. 
+ Returns: + A RefBundle if available, otherwise None. + """ + ret = None + if output_split_idx is None: + try: + with self._lock: + ret = self._queue.pop() + except IndexError: + pass + else: + with self._lock: + split_queue = self._outputs_by_split[output_split_idx] + if len(split_queue) == 0: + # Move all ref bundles to their indexed queues + # Note, the reason why we do indexing here instead of in the append + # is because only the last `OpBufferQueue` in the DAG, which will call + # pop with output_split_idx, needs indexing. + # If we also index the `OpBufferQueue`s in the middle, we cannot + # preserve the order of ref bundles with different output splits. + with self._lock: + while len(self._queue) > 0: + ref = self._queue.pop() + self._outputs_by_split[ref.output_split_idx].add(ref) + try: + ret = split_queue.pop() + except IndexError: + pass + if ret is None: + return None + with self._lock: + self._num_blocks -= len(ret.blocks) + if ret.output_split_idx is not None: + self._num_per_split[ret.output_split_idx] -= 1 + return ret + + def clear(self): + with self._lock: + self._queue.clear() + self._num_blocks = 0 + self._num_per_split.clear() + + +@dataclass +class OpSchedulingStatus: + """The scheduling status of an operator. + + This will be updated each time when StreamingExecutor makes + a scheduling decision, i.e., in each `select_operator_to_run` + call. + """ + + # Whether the op was selected to run in the last scheduling + # decision. + selected: bool = False + # Whether the op was considered runnable in the last scheduling + # decision. + runnable: bool = False + # Whether the resources were sufficient for the operator to run + # in the last scheduling decision. + under_resource_limits: bool = False + + +class OpState: + """The execution state tracked for each PhysicalOperator. + + This tracks state to manage input and output buffering for StreamingExecutor and + progress bars, which is separate from execution state internal to the operators. 
+ + Note: we use the `deque` data structure here because it is thread-safe, enabling + operator queues to be shared across threads. + """ + + def __init__(self, op: PhysicalOperator, inqueues: List[OpBufferQueue]): + # Each inqueue is connected to another operator's outqueue. + assert len(inqueues) == len(op.input_dependencies), (op, inqueues) + self.inqueues: List[OpBufferQueue] = inqueues + # The outqueue is connected to another operator's inqueue (they physically + # share the same Python list reference). + # + # Note: this queue is also accessed concurrently from the consumer thread. + # (in addition to the streaming executor thread). Hence, it must be a + # thread-safe type such as `deque`. + self.outqueue: OpBufferQueue = OpBufferQueue() + self.op = op + self.progress_bar = None + self.num_completed_tasks = 0 + self.inputs_done_called = False + # Tracks whether `input_done` is called for each input op. + self.input_done_called = [False] * len(op.input_dependencies) + # Used for StreamingExecutor to signal exception or end of execution + self._finished: bool = False + self._exception: Optional[Exception] = None + self._scheduling_status = OpSchedulingStatus() + + def __repr__(self): + return f"OpState({self.op.name})" + + def initialize_progress_bars(self, index: int, verbose_progress: bool) -> int: + """Create progress bars at the given index (line offset in console). + + For AllToAllOperator, zero or more sub progress bar would be created. + Return the number of enabled progress bars created for this operator. + """ + is_all_to_all = isinstance(self.op, AllToAllOperator) + # Only show 1:1 ops when in verbose progress mode. 
+ ctx = DataContext.get_current() + progress_bar_enabled = ( + ctx.enable_progress_bars + and ctx.enable_operator_progress_bars + and (is_all_to_all or verbose_progress) + ) + self.progress_bar = ProgressBar( + "- " + self.op.name, + self.op.num_output_rows_total(), + unit="row", + position=index, + enabled=progress_bar_enabled, + ) + num_progress_bars = 1 + if is_all_to_all: + # Initialize must be called for sub progress bars, even the + # bars are not enabled via the DataContext. + num_progress_bars += self.op.initialize_sub_progress_bars(index + 1) + return num_progress_bars if progress_bar_enabled else 0 + + def close_progress_bars(self): + """Close all progress bars for this operator.""" + if self.progress_bar: + self.progress_bar.close() + if isinstance(self.op, AllToAllOperator): + self.op.close_sub_progress_bars() + + def num_queued(self) -> int: + """Return the number of queued bundles across all inqueues.""" + return sum(len(q) for q in self.inqueues) + + def num_processing(self): + """Return the number of bundles currently in processing for this operator.""" + return self.op.num_active_tasks() + self.op.internal_queue_size() + + def add_output(self, ref: RefBundle) -> None: + """Move a bundle produced by the operator to its outqueue.""" + self.outqueue.append(ref) + self.num_completed_tasks += 1 + if self.progress_bar: + assert ( + ref.num_rows() is not None + ), "RefBundle must have a valid number of rows" + self.progress_bar.update(ref.num_rows(), self.op.num_output_rows_total()) + + def refresh_progress_bar(self, resource_manager: ResourceManager) -> None: + """Update the console with the latest operator progress.""" + if self.progress_bar: + self.progress_bar.set_description(self.summary_str(resource_manager)) + self.progress_bar.refresh() + + def summary_str(self, resource_manager: ResourceManager) -> str: + # Active tasks + active = self.op.num_active_tasks() + desc = f"- {self.op.name}: Tasks: {active}" + if ( + 
self.op._in_task_submission_backpressure + or self.op._in_task_output_backpressure + ): + desc += " [backpressured]" + + # Actors info + desc += self.op.actor_info_progress_str() + + # Queued blocks + queued = self.num_queued() + self.op.internal_queue_size() + desc += f"; Queued blocks: {queued}" + desc += f"; Resources: {resource_manager.get_op_usage_str(self.op)}" + + # Any additional operator specific information. + suffix = self.op.progress_str() + if suffix: + desc += f"; {suffix}" + + return desc + + def dispatch_next_task(self) -> None: + """Move a bundle from the operator inqueue to the operator itself.""" + for i, inqueue in enumerate(self.inqueues): + ref = inqueue.pop() + if ref is not None: + self.op.add_input(ref, input_index=i) + return + assert False, "Nothing to dispatch" + + def get_output_blocking(self, output_split_idx: Optional[int]) -> RefBundle: + """Get an item from this node's output queue, blocking as needed. + + Returns: + The RefBundle from the output queue, or an error / end of stream indicator. + + Raises: + StopIteration: If all outputs are already consumed. + Exception: If there was an exception raised during execution. + """ + while True: + # Check if StreamingExecutor has caught an exception or is done execution. + if self._exception is not None: + raise self._exception + elif self._finished and not self.outqueue.has_next(output_split_idx): + raise StopIteration() + ref = self.outqueue.pop(output_split_idx) + if ref is not None: + return ref + time.sleep(0.01) + + def inqueue_memory_usage(self) -> int: + """Return the object store memory of this operator's inqueue.""" + total = 0 + for op, inq in zip(self.op.input_dependencies, self.inqueues): + # Exclude existing input data items from dynamic memory usage. 
+ if not isinstance(op, InputDataBuffer): + total += inq.memory_usage + return total + + def outqueue_memory_usage(self) -> int: + """Return the object store memory of this operator's outqueue.""" + return self.outqueue.memory_usage + + def outqueue_num_blocks(self) -> int: + """Return the number of blocks in this operator's outqueue.""" + return self.outqueue.num_blocks + + def mark_finished(self, exception: Optional[Exception] = None): + """Marks this operator as finished. Used for exiting get_output_blocking.""" + if exception is None: + self._finished = True + else: + self._exception = exception + + +def build_streaming_topology( + dag: PhysicalOperator, options: ExecutionOptions +) -> Tuple[Topology, int]: + """Instantiate the streaming operator state topology for the given DAG. + + This involves creating the operator state for each operator in the DAG, + registering it with this class, and wiring up the inqueues/outqueues of + dependent operator states. + + Args: + dag: The operator DAG to instantiate. + options: The execution options to use to start operators. + + Returns: + The topology dict holding the streaming execution state. + The number of progress bars initialized so far. + """ + + topology: Topology = {} + + # DFS walk to wire up operator states. + def setup_state(op: PhysicalOperator) -> OpState: + if op in topology: + raise ValueError("An operator can only be present in a topology once.") + + # Wire up the input outqueues to this op's inqueues. + inqueues = [] + for i, parent in enumerate(op.input_dependencies): + parent_state = setup_state(parent) + inqueues.append(parent_state.outqueue) + + # Create state. + op_state = OpState(op, inqueues) + topology[op] = op_state + op.start(options) + return op_state + + setup_state(dag) + + # Create the progress bars starting from the first operator to run. + # Note that the topology dict is in topological sort order. Index zero is reserved + # for global progress information. 
+ i = 1 + for op_state in list(topology.values()): + if not isinstance(op_state.op, InputDataBuffer): + i += op_state.initialize_progress_bars(i, options.verbose_progress) + + return (topology, i) + + +def process_completed_tasks( + topology: Topology, + resource_manager: ResourceManager, + max_errored_blocks: int, +) -> int: + """Process any newly completed tasks. To update operator + states, call `update_operator_states()` afterwards. + + Args: + topology: The toplogy of operators. + backpressure_policies: The backpressure policies to use. + max_errored_blocks: Max number of errored blocks to allow, + unlimited if negative. + Returns: + The number of errored blocks. + """ + + # All active tasks, keyed by their waitables. + active_tasks: Dict[Waitable, Tuple[OpState, OpTask]] = {} + for op, state in topology.items(): + for task in op.get_active_tasks(): + active_tasks[task.get_waitable()] = (state, task) + + max_bytes_to_read_per_op: Dict[OpState, int] = {} + if resource_manager.op_resource_allocator_enabled(): + for op, state in topology.items(): + max_bytes_to_read = ( + resource_manager.op_resource_allocator.max_task_output_bytes_to_read(op) + ) + op._in_task_output_backpressure = max_bytes_to_read == 0 + if max_bytes_to_read is not None: + max_bytes_to_read_per_op[state] = max_bytes_to_read + + # Process completed Ray tasks and notify operators. + num_errored_blocks = 0 + if active_tasks: + ready, _ = ray.wait( + list(active_tasks.keys()), + num_returns=len(active_tasks), + fetch_local=False, + timeout=0.1, + ) + + # Organize tasks by the operator they belong to, and sort them by task index. + # So that we'll process them in a deterministic order. + # This is because OpResourceAllocator may limit the number of blocks to read + # per operator. In this case, we want to have fewer tasks finish quickly and + # yield resources, instead of having all tasks output blocks together. 
+ ready_tasks_by_op = defaultdict(list) + for ref in ready: + state, task = active_tasks[ref] + ready_tasks_by_op[state].append(task) + + for state, ready_tasks in ready_tasks_by_op.items(): + ready_tasks = sorted(ready_tasks, key=lambda t: t.task_index()) + for task in ready_tasks: + if isinstance(task, DataOpTask): + try: + bytes_read = task.on_data_ready( + max_bytes_to_read_per_op.get(state, None) + ) + if state in max_bytes_to_read_per_op: + max_bytes_to_read_per_op[state] -= bytes_read + except Exception as e: + num_errored_blocks += 1 + should_ignore = ( + max_errored_blocks < 0 + or max_errored_blocks >= num_errored_blocks + ) + error_message = ( + "An exception was raised from a task of " + f'operator "{state.op.name}".' + ) + if should_ignore: + remaining = ( + max_errored_blocks - num_errored_blocks + if max_errored_blocks >= 0 + else "unlimited" + ) + error_message += ( + " Ignoring this exception with remaining" + f" max_errored_blocks={remaining}." + ) + logger.error(error_message, exc_info=e) + else: + error_message += ( + " Dataset execution will now abort." + " To ignore this exception and continue, set" + " DataContext.max_errored_blocks." + ) + logger.error(error_message) + raise e from None + else: + assert isinstance(task, MetadataOpTask) + task.on_task_finished() + + # Pull any operator outputs into the streaming op state. + for op, op_state in topology.items(): + while op.has_next(): + op_state.add_output(op.get_next()) + + return num_errored_blocks + + +def update_operator_states(topology: Topology) -> None: + """Update operator states accordingly for newly completed tasks. + Should be called after `process_completed_tasks()`.""" + + # Call inputs_done() on ops where no more inputs are coming. 
+ for op, op_state in topology.items(): + if op_state.inputs_done_called: + continue + all_inputs_done = True + for idx, dep in enumerate(op.input_dependencies): + if dep.completed() and not topology[dep].outqueue: + if not op_state.input_done_called[idx]: + op.input_done(idx) + op_state.input_done_called[idx] = True + else: + all_inputs_done = False + + if all_inputs_done: + op.all_inputs_done() + op_state.inputs_done_called = True + + # Traverse the topology in reverse topological order. + # For each op, if all of its downstream operators have completed. + # call mark_execution_completed() to also complete this op. + for op, op_state in reversed(list(topology.items())): + if op.completed(): + continue + dependents_completed = len(op.output_dependencies) > 0 and all( + dep.completed() for dep in op.output_dependencies + ) + if dependents_completed: + op.mark_execution_completed() + + +def select_operator_to_run( + topology: Topology, + resource_manager: ResourceManager, + backpressure_policies: List[BackpressurePolicy], + autoscaler: Autoscaler, + ensure_at_least_one_running: bool, +) -> Optional[PhysicalOperator]: + """Select an operator to run, if possible. + + The objective of this function is to maximize the throughput of the overall + pipeline, subject to defined memory and parallelism limits. + + This is currently implemented by applying backpressure on operators that are + producing outputs faster than they are consuming them `len(outqueue)`, as well as + operators with a large number of running tasks `num_processing()`. + + Note that memory limits also apply to the outqueue of the output operator. This + provides backpressure if the consumer is slow. However, once a bundle is returned + to the user, it is no longer tracked. + """ + # Filter to ops that are eligible for execution. 
+ ops = [] + for op, state in topology.items(): + if resource_manager.op_resource_allocator_enabled(): + under_resource_limits = ( + resource_manager.op_resource_allocator.can_submit_new_task(op) + ) + else: + under_resource_limits = _execution_allowed(op, resource_manager) + in_backpressure = not under_resource_limits or any( + not p.can_add_input(op) for p in backpressure_policies + ) + op_runnable = False + if ( + not in_backpressure + and not op.completed() + and state.num_queued() > 0 + and op.should_add_input() + ): + ops.append(op) + op_runnable = True + # Update scheduling status + state._scheduling_status = OpSchedulingStatus( + selected=False, + runnable=op_runnable, + under_resource_limits=under_resource_limits, + ) + + # Signal whether op in backpressure for stats collections + op.notify_in_task_submission_backpressure(in_backpressure) + + # To ensure liveness, allow at least 1 op to run regardless of limits. This is + # gated on `ensure_at_least_one_running`, which is set if the consumer is blocked. + if ( + ensure_at_least_one_running + and not ops + and all(op.num_active_tasks() == 0 for op in topology) + ): + # The topology is entirely idle, so choose from all ready ops ignoring limits. + ops = [ + op + for op, state in topology.items() + if state.num_queued() > 0 and not op.completed() + ] + + selected_op = None + if ops: + # Run metadata-only operators first. After that, choose the operator with the + # least memory usage. + selected_op = min( + ops, + key=lambda op: ( + not op.throttling_disabled(), + resource_manager.get_op_usage(op).object_store_memory, + ), + ) + topology[selected_op]._scheduling_status.selected = True + autoscaler.try_trigger_scaling() + return selected_op + + +def _execution_allowed(op: PhysicalOperator, resource_manager: ResourceManager) -> bool: + """Return whether an operator is allowed to execute given resource usage. + + Operators are throttled globally based on CPU and GPU limits for the stream. 
+ + For an N operator DAG, we only throttle the kth operator (in the source-to-sink + ordering) on object store utilization if the cumulative object store utilization + for the kth operator and every operator downstream from it is greater than + k/N * global_limit; i.e., the N - k operator sub-DAG is using more object store + memory than it's share. + + Args: + op: The operator to check. + resource_manager: The ResourceManager of the current dataset. + + Returns: + Whether the op is allowed to run. + """ + if op.throttling_disabled(): + return True + + global_usage = resource_manager.get_global_usage() + global_limits = resource_manager.get_global_limits() + + # To avoid starvation problems when dealing with fractional resource types, + # convert all quantities to integer (0 or 1) for deciding admissibility. This + # allows operators with non-integral requests to slightly overshoot the limit. + global_floored = ExecutionResources( + cpu=math.floor(global_usage.cpu or 0), + gpu=math.floor(global_usage.gpu or 0), + object_store_memory=global_usage.object_store_memory, + ) + inc = op.incremental_resource_usage() + if inc.cpu and inc.gpu: + raise NotImplementedError( + "Operator incremental resource usage cannot specify both CPU " + "and GPU at the same time, since it may cause deadlock." + ) + + # Ignore the scale of CPU and GPU requests, i.e., treating them as either 1 or 0. + # This ensures operators don't get starved due to the shape of their resource + # requests. + inc_indicator = ExecutionResources( + cpu=1 if inc.cpu else 0, + gpu=1 if inc.gpu else 0, + object_store_memory=0, + ) + + # Under global limits; always allow. + new_usage = global_floored.add(inc_indicator) + if new_usage.satisfies_limit(global_limits): + return True + + # We're over global limits, but execution may still be allowed if memory is the + # only bottleneck and this wouldn't impact downstream memory limits. This avoids + # stalling the execution for memory bottlenecks that occur upstream. 
+ # See for more context: https://github.com/ray-project/ray/pull/32673 + global_limits_sans_memory = ExecutionResources.for_limits( + cpu=global_limits.cpu, gpu=global_limits.gpu + ) + global_ok_sans_memory = new_usage.satisfies_limit(global_limits_sans_memory) + downstream_memory = resource_manager.get_downstream_object_store_memory(op) + downstream_limit = global_limits.scale(resource_manager.get_downstream_fraction(op)) + downstream_memory_ok = ExecutionResources( + object_store_memory=downstream_memory + ).satisfies_limit(downstream_limit) + + return global_ok_sans_memory and downstream_memory_ok diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/util.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/util.py new file mode 100644 index 0000000000000000000000000000000000000000..d3bf3d9f1f54d0f9da6f7b55b2160e1228d43764 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/util.py @@ -0,0 +1,80 @@ +from concurrent.futures import ThreadPoolExecutor +from typing import TYPE_CHECKING, Any, List + +import ray +from ray.data.block import BlockAccessor, CallableClass + +if TYPE_CHECKING: + from ray.data._internal.execution.interfaces import RefBundle + + +def make_ref_bundles(simple_data: List[List[Any]]) -> List["RefBundle"]: + """Create ref bundles from a list of block data. + + One bundle is created for each input block. 
+ """ + import pandas as pd + + from ray.data._internal.execution.interfaces import RefBundle + + output = [] + for block in simple_data: + block = pd.DataFrame({"id": block}) + output.append( + RefBundle( + [ + ( + ray.put(block), + BlockAccessor.for_block(block).get_metadata(), + ) + ], + owns_blocks=True, + ) + ) + return output + + +memory_units = ["B", "KB", "MB", "GB", "TB", "PB"] + + +def memory_string(num_bytes: float) -> str: + """Return a human-readable memory string for the given amount of bytes.""" + k = 0 + while num_bytes >= 1024 and k < len(memory_units) - 1: + num_bytes /= 1024 + k += 1 + return f"{num_bytes:.1f}{memory_units[k]}" + + +def locality_string(locality_hits: int, locality_misses) -> str: + """Return a human-readable string for object locality stats.""" + if not locality_misses: + return "[all objects local]" + return f"[{locality_hits}/{locality_hits + locality_misses} objects local]" + + +def make_callable_class_concurrent(callable_cls: CallableClass) -> CallableClass: + """Returns a thread-safe CallableClass with the same logic as the provided + `callable_cls`. + + This function allows the usage of concurrent actors by safeguarding user logic + behind a separate thread. + + This allows batch slicing and formatting to occur concurrently, to overlap with the + user provided UDF. + """ + + class _Wrapper(callable_cls): + def __init__(self, *args, **kwargs): + self.thread_pool_executor = ThreadPoolExecutor(max_workers=1) + super().__init__(*args, **kwargs) + + def __repr__(self): + return super().__repr__() + + def __call__(self, *args, **kwargs): + # ThreadPoolExecutor will reuse the same thread for every submit call. 
+ future = self.thread_pool_executor.submit(super().__call__, *args, **kwargs) + return future.result() + + return _Wrapper diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/planner/exchange/pull_based_shuffle_task_scheduler.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/planner/exchange/pull_based_shuffle_task_scheduler.py new file mode 100644 index 0000000000000000000000000000000000000000..b2cf448f030d2165b56b14ebfc0539ddeecb778a --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/planner/exchange/pull_based_shuffle_task_scheduler.py @@ -0,0 +1,149 @@ +import logging +from typing import Any, Dict, List, Optional, Tuple + +from ray._private.ray_constants import CALLER_MEMORY_USAGE_PER_OBJECT_REF +from ray.data._internal.execution.interfaces import RefBundle, TaskContext +from ray.data._internal.planner.exchange.interfaces import ( + ExchangeTaskScheduler, + ExchangeTaskSpec, +) +from ray.data._internal.remote_fn import cached_remote_fn +from ray.data._internal.stats import StatsDict +from ray.data._internal.util import convert_bytes_to_human_readable_str + +logger = logging.getLogger(__name__) + + +class PullBasedShuffleTaskScheduler(ExchangeTaskScheduler): + """ + The pull-based map-reduce shuffle scheduler. + + Map tasks are first scheduled to generate map output blocks. After all map output + are generated, then reduce tasks are scheduled to combine map output blocks + together. + + The concept here is similar to + "MapReduce: Simplified Data Processing on Large Clusters" + (https://dl.acm.org/doi/10.1145/1327452.1327492). 
+ """ + + def execute( + self, + refs: List[RefBundle], + output_num_blocks: int, + task_ctx: TaskContext, + map_ray_remote_args: Optional[Dict[str, Any]] = None, + reduce_ray_remote_args: Optional[Dict[str, Any]] = None, + _debug_limit_execution_to_num_blocks: Optional[int] = None, + ) -> Tuple[List[RefBundle], StatsDict]: + + # TODO: eagerly delete the input and map output block references in order to + # eagerly release the blocks' memory. + input_blocks_list = [] + for ref_bundle in refs: + input_blocks_list.extend(ref_bundle.block_refs) + input_num_blocks = len(input_blocks_list) + input_owned = all(b.owns_blocks for b in refs) + + caller_memory_usage = ( + input_num_blocks * output_num_blocks * CALLER_MEMORY_USAGE_PER_OBJECT_REF + ) + self.warn_on_driver_memory_usage( + caller_memory_usage, + "Execution is estimated to use at least " + f"{convert_bytes_to_human_readable_str(caller_memory_usage)} " + "of driver memory. Ensure that the driver machine has at least " + "this much memory to ensure job completion.\n\n" + "To reduce the " + "amount of driver memory needed, enable push-based shuffle using " + "RAY_DATA_PUSH_BASED_SHUFFLE=1 " + "(https://docs.ray.io/en/latest/data/performance-tips.html" + ").", + ) + + if map_ray_remote_args is None: + map_ray_remote_args = {} + if reduce_ray_remote_args is None: + reduce_ray_remote_args = {} + if "scheduling_strategy" not in reduce_ray_remote_args: + reduce_ray_remote_args = reduce_ray_remote_args.copy() + reduce_ray_remote_args["scheduling_strategy"] = "SPREAD" + + shuffle_map = cached_remote_fn(self._exchange_spec.map) + shuffle_reduce = cached_remote_fn(self._exchange_spec.reduce) + + sub_progress_bar_dict = task_ctx.sub_progress_bar_dict + bar_name = ExchangeTaskSpec.MAP_SUB_PROGRESS_BAR_NAME + assert bar_name in sub_progress_bar_dict, sub_progress_bar_dict + map_bar = sub_progress_bar_dict[bar_name] + + if _debug_limit_execution_to_num_blocks is not None: + input_blocks_list = 
input_blocks_list[:_debug_limit_execution_to_num_blocks] + logger.debug(f"Limiting execution to {len(input_blocks_list)} map tasks") + shuffle_map_out = [ + shuffle_map.options( + **map_ray_remote_args, + num_returns=1 + output_num_blocks, + ).remote(i, block, output_num_blocks, *self._exchange_spec._map_args) + for i, block in enumerate(input_blocks_list) + ] + + # The first item returned is the BlockMetadata. + shuffle_map_metadata = [] + for i, refs in enumerate(shuffle_map_out): + shuffle_map_metadata.append(refs[-1]) + shuffle_map_out[i] = refs[:-1] + + if _debug_limit_execution_to_num_blocks is not None: + while len(shuffle_map_out) < output_num_blocks: + # Repeat the first map task's results. + shuffle_map_out.append(shuffle_map_out[0][:]) + + shuffle_map_metadata = map_bar.fetch_until_complete(shuffle_map_metadata) + + self.warn_on_high_local_memory_store_usage() + + bar_name = ExchangeTaskSpec.REDUCE_SUB_PROGRESS_BAR_NAME + assert bar_name in sub_progress_bar_dict, sub_progress_bar_dict + reduce_bar = sub_progress_bar_dict[bar_name] + + if _debug_limit_execution_to_num_blocks is not None: + output_num_blocks = _debug_limit_execution_to_num_blocks + logger.debug(f"Limiting execution to {output_num_blocks} reduce tasks") + shuffle_reduce_out = [ + shuffle_reduce.options(**reduce_ray_remote_args, num_returns=2).remote( + *self._exchange_spec._reduce_args, + *[shuffle_map_out[i][j] for i in range(input_num_blocks)], + ) + for j in range(output_num_blocks) + ] + + # Release map task outputs from the Ray object store. 
+ del shuffle_map_out + + new_blocks, new_metadata = [], [] + if shuffle_reduce_out: + new_blocks, new_metadata = zip(*shuffle_reduce_out) + new_metadata = reduce_bar.fetch_until_complete(list(new_metadata)) + + self.warn_on_high_local_memory_store_usage() + + output = [] + for block, meta in zip(new_blocks, new_metadata): + output.append( + RefBundle( + [ + ( + block, + meta, + ) + ], + owns_blocks=input_owned, + ) + ) + stats = { + "map": shuffle_map_metadata, + "reduce": new_metadata, + } + + return (output, stats) diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/planner/exchange/shuffle_task_spec.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/planner/exchange/shuffle_task_spec.py new file mode 100644 index 0000000000000000000000000000000000000000..51601ebbce04bd4501df5f369e811549e48ada28 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/planner/exchange/shuffle_task_spec.py @@ -0,0 +1,132 @@ +import logging +import math +from typing import Callable, Iterable, List, Optional, Tuple, Union + +import numpy as np + +from ray.data._internal.delegating_block_builder import DelegatingBlockBuilder +from ray.data._internal.planner.exchange.interfaces import ExchangeTaskSpec +from ray.data.block import Block, BlockAccessor, BlockExecStats, BlockMetadata +from ray.data.context import MAX_SAFE_BLOCK_SIZE_FACTOR + +logger = logging.getLogger(__name__) + + +class ShuffleTaskSpec(ExchangeTaskSpec): + """ + The implementation for shuffle tasks. + + This is used by random_shuffle() and repartition(). 
+ """ + + SPLIT_REPARTITION_SUB_PROGRESS_BAR_NAME = "Split Repartition" + + def __init__( + self, + target_shuffle_max_block_size: int, + random_shuffle: bool = False, + random_seed: Optional[int] = None, + upstream_map_fn: Optional[Callable[[Iterable[Block]], Iterable[Block]]] = None, + ): + super().__init__( + map_args=[ + target_shuffle_max_block_size, + upstream_map_fn, + random_shuffle, + random_seed, + ], + reduce_args=[random_shuffle, random_seed], + ) + + @staticmethod + def map( + idx: int, + block: Block, + output_num_blocks: int, + target_shuffle_max_block_size: int, + upstream_map_fn: Optional[Callable[[Iterable[Block]], Iterable[Block]]], + random_shuffle: bool, + random_seed: Optional[int], + ) -> List[Union[BlockMetadata, Block]]: + stats = BlockExecStats.builder() + if upstream_map_fn: + # TODO: Support dynamic block splitting in + # all-to-all ops, to avoid having to re-fuse + # upstream blocks together. + upstream_map_iter = upstream_map_fn([block]) + mapped_block = next(upstream_map_iter) + builder = BlockAccessor.for_block(mapped_block).builder() + builder.add_block(mapped_block) + for mapped_block in upstream_map_iter: + builder.add_block(mapped_block) + # Drop the upstream inputs to reduce memory usage. + del mapped_block + block = builder.build() + block = BlockAccessor.for_block(block) + if ( + block.size_bytes() + > MAX_SAFE_BLOCK_SIZE_FACTOR * target_shuffle_max_block_size + ): + logger.warning( + "Input block to map task has size " + f"{block.size_bytes() // (1024 * 1024)}MiB, which exceeds " + "DataContext.get_current().target_shuffle_max_block_size=" + f"{target_shuffle_max_block_size // (1024 * 1024)}MiB. " + "This can lead to out-of-memory errors and can happen " + "when map tasks are fused to the shuffle operation. " + "To prevent fusion, call Dataset.materialize() on the " + "dataset before shuffling." + ) + + # Randomize the distribution of records to blocks. 
+ if random_shuffle: + seed_i = random_seed + idx if random_seed is not None else None + block = block.random_shuffle(seed_i) + block = BlockAccessor.for_block(block) + + # Build a list of slices to return. It's okay to put the results in a + # list instead of yielding them as a generator because slicing the + # ArrowBlock is zero-copy. + slice_sz = max(1, math.ceil(block.num_rows() / output_num_blocks)) + slices = [] + for i in range(output_num_blocks): + slices.append(block.slice(i * slice_sz, (i + 1) * slice_sz)) + + # Randomize the distribution order of the blocks (this prevents empty + # outputs when input blocks are very small). + if random_shuffle: + random = np.random.RandomState(seed_i) + random.shuffle(slices) + + num_rows = sum(BlockAccessor.for_block(s).num_rows() for s in slices) + assert num_rows == block.num_rows(), (num_rows, block.num_rows()) + metadata = block.get_metadata(input_files=None, exec_stats=stats.build()) + return slices + [metadata] + + @staticmethod + def reduce( + random_shuffle: bool, + random_seed: Optional[int], + *mapper_outputs: List[Block], + partial_reduce: bool = False, + ) -> Tuple[Block, BlockMetadata]: + # TODO: Support fusion with other downstream operators. 
+ stats = BlockExecStats.builder() + builder = DelegatingBlockBuilder() + for block in mapper_outputs: + builder.add_block(block) + new_block = builder.build() + accessor = BlockAccessor.for_block(new_block) + if random_shuffle: + new_block = accessor.random_shuffle( + random_seed if random_seed is not None else None + ) + accessor = BlockAccessor.for_block(new_block) + new_metadata = BlockMetadata( + num_rows=accessor.num_rows(), + size_bytes=accessor.size_bytes(), + schema=accessor.schema(), + input_files=None, + exec_stats=stats.build(), + ) + return new_block, new_metadata diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/planner/exchange/sort_task_spec.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/planner/exchange/sort_task_spec.py new file mode 100644 index 0000000000000000000000000000000000000000..5f79a7885cbf254299774c92d160122625c25a7a --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/planner/exchange/sort_task_spec.py @@ -0,0 +1,230 @@ +from typing import TYPE_CHECKING, List, Optional, Tuple, TypeVar, Union + +import numpy as np + +from ray.data._internal.delegating_block_builder import DelegatingBlockBuilder +from ray.data._internal.planner.exchange.interfaces import ExchangeTaskSpec +from ray.data._internal.progress_bar import ProgressBar +from ray.data._internal.remote_fn import cached_remote_fn +from ray.data._internal.table_block import TableBlockAccessor +from ray.data._internal.util import NULL_SENTINEL +from ray.data.block import Block, BlockAccessor, BlockExecStats, BlockMetadata +from ray.types import ObjectRef + +T = TypeVar("T") + +if TYPE_CHECKING: + import pyarrow + + +class SortKey: + """SortKey class to convert between different sort args formats.""" + + def __init__( + self, + key: Optional[Union[str, List[str]]] = None, + descending: Union[bool, List[bool]] = False, + boundaries: Optional[List[T]] = None, + ): + if key is None: + key = [] + if isinstance(key, str): + key = [key] + if 
not (isinstance(key, list) and all(isinstance(k, str) for k in key)): + raise ValueError( + f"Key must be a string or a list of strings, but got {key}." + ) + if isinstance(descending, bool): + descending = [descending for _ in key] + elif isinstance(descending, list): + if len(descending) != len(key): + raise ValueError( + "Length of `descending` does not match the length of the key." + ) + self._columns = key + self._descending = descending + if boundaries: + for item in boundaries: + if not isinstance(item, (int, float)): + raise ValueError( + "The type of items in boundaries must be int or float." + ) + boundaries = list(set(boundaries)) + boundaries.sort() + self._boundaries = boundaries + + def get_columns(self) -> List[str]: + return self._columns + + def get_descending(self) -> List[bool]: + return self._descending + + def to_arrow_sort_args(self) -> List[Tuple[str, str]]: + return [ + (key, "descending" if desc else "ascending") + for key, desc in zip(self._columns, self._descending) + ] + + def to_pandas_sort_args(self) -> Tuple[List[str], List[bool]]: + return self._columns, [not desc for desc in self._descending] + + def validate_schema(self, schema: Optional[Union[type, "pyarrow.lib.Schema"]]): + """Check the key function is valid on the given schema.""" + if schema is None: + # Dataset is empty/cleared, validation not possible. + return + + if self._columns and len(schema.names) > 0: + schema_names_set = set(schema.names) + for column in self._columns: + if column not in schema_names_set: + raise ValueError( + f"You specified the column '{column}', but there's no such " + "column in the dataset. The dataset has columns: " + f"{schema_names_set}" + ) + + @property + def boundaries(self): + return self._boundaries + + +class SortTaskSpec(ExchangeTaskSpec): + """ + The implementation for distributed sort tasks. + + The algorithm is similar to [External Merge Sort] + (https://en.wikipedia.org/wiki/External_sorting). 
+ Sorting is done in 3 steps: sampling, sorting individual blocks, and + merging sorted blocks. + + Sampling (`sample_boundaries`): we get a number of sample items from each block, + sort them, and use them to compute boundaries that would partition all items into + approximately equal ranges. + + Sorting (`map`): each block is sorted locally, then partitioned into smaller + blocks according to the boundaries. Each partitioned block is passed to a merge + task. + + Merging (`reduce`): a merge task would receive a block from every worker that + consists of items in a certain range. It then merges the sorted blocks into one + sorted block and becomes part of the new, sorted block. + """ + + SORT_SAMPLE_SUB_PROGRESS_BAR_NAME = "Sort Sample" + + def __init__( + self, + boundaries: List[T], + sort_key: SortKey, + batch_format: str, + ): + super().__init__( + map_args=[boundaries, sort_key], + reduce_args=[sort_key, batch_format], + ) + + @staticmethod + def map( + idx: int, + block: Block, + output_num_blocks: int, + boundaries: List[T], + sort_key: SortKey, + ) -> List[Union[BlockMetadata, Block]]: + stats = BlockExecStats.builder() + out = BlockAccessor.for_block(block).sort_and_partition(boundaries, sort_key) + meta = BlockAccessor.for_block(block).get_metadata(exec_stats=stats.build()) + return out + [meta] + + @staticmethod + def reduce( + sort_key: SortKey, + batch_format: str, + *mapper_outputs: List[Block], + partial_reduce: bool = False, + ) -> Tuple[Block, BlockMetadata]: + normalized_blocks = TableBlockAccessor.normalize_block_types( + mapper_outputs, normalize_type=batch_format + ) + return BlockAccessor.for_block(normalized_blocks[0]).merge_sorted_blocks( + normalized_blocks, sort_key + ) + + @staticmethod + def sample_boundaries( + blocks: List[ObjectRef[Block]], + sort_key: SortKey, + num_reducers: int, + sample_bar: Optional[ProgressBar] = None, + ) -> List[T]: + """ + Return (num_reducers - 1) items in ascending order from the blocks that + partition 
the domain into ranges with approximately equally many elements. + Each boundary item is a tuple of a form (col1_value, col2_value, ...). + """ + columns = sort_key.get_columns() + n_samples = int(num_reducers * 10 / len(blocks)) + + sample_block = cached_remote_fn(_sample_block) + + sample_results = [ + sample_block.remote(block, n_samples, sort_key) for block in blocks + ] + if sample_bar is None: + sample_bar = ProgressBar( + SortTaskSpec.SORT_SAMPLE_SUB_PROGRESS_BAR_NAME, + len(blocks) * n_samples, + unit="rows", + ) + # TODO(zhilong): Update sort sample bar before finished. + samples = sample_bar.fetch_until_complete(sample_results) + del sample_results + samples: List[Block] = [s for s in samples if len(s) > 0] + # The dataset is empty + if len(samples) == 0: + return [None] * (num_reducers - 1) + + # Convert samples to a sorted list[tuple[...]] where each tuple represents a + # sample. + # TODO: Once we deprecate pandas blocks, we can avoid this conversion and + # directly sort the samples. + builder = DelegatingBlockBuilder() + for sample in samples: + builder.add_block(sample) + samples_table = builder.build() + samples_dict = BlockAccessor.for_block(samples_table).to_numpy(columns=columns) + # This zip does the transposition from list of column values to list of tuples. + samples_list = list(zip(*samples_dict.values())) + + def is_na(x): + # Check if x is None or NaN. Type casting to np.array first to avoid + # isnan failing on strings and other types. + if x is None: + return True + x = np.asarray(x) + if np.issubdtype(x.dtype, np.number): + return np.isnan(x) + return False + + # To allow multi-directional sort, we utilize Python's stable sort: we + # sort several times with different directions. We do this in reverse, so + # that the last key we sort by is the primary sort key passed by the user. + for i, desc in list(enumerate(sort_key.get_descending()))[::-1]: + # Sort the list, but Nones should be NULL_SENTINEL to ensure safe sorting. 
+ samples_list.sort( + key=lambda sample: NULL_SENTINEL if is_na(sample[i]) else sample[i], + reverse=desc, + ) + + # Each boundary corresponds to a quantile of the data. + quantile_indices = [ + int(q * (len(samples_list) - 1)) + for q in np.linspace(0, 1, num_reducers + 1) + ] + # Exclude the first and last quantiles because they're 0 and 1. + return [samples_list[i] for i in quantile_indices[1:-1]] + + +def _sample_block(block: Block, n_samples: int, sort_key: SortKey) -> Block: + return BlockAccessor.for_block(block).sample(n_samples, sort_key) diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/planner/exchange/split_repartition_task_scheduler.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/planner/exchange/split_repartition_task_scheduler.py new file mode 100644 index 0000000000000000000000000000000000000000..cc21f699667b48f64ef6d08da493b37f1b3d0583 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/planner/exchange/split_repartition_task_scheduler.py @@ -0,0 +1,138 @@ +from typing import Any, Dict, List, Optional, Tuple + +import ray +from ray.data._internal.execution.interfaces import RefBundle, TaskContext +from ray.data._internal.planner.exchange.interfaces import ExchangeTaskScheduler +from ray.data._internal.planner.exchange.shuffle_task_spec import ShuffleTaskSpec +from ray.data._internal.remote_fn import cached_remote_fn +from ray.data._internal.split import _split_at_indices +from ray.data._internal.stats import StatsDict +from ray.data.block import Block, BlockAccessor, BlockMetadata +from ray.types import ObjectRef + + +class SplitRepartitionTaskScheduler(ExchangeTaskScheduler): + """ + The split (non-shuffle) repartition scheduler. + + First, we calculate global splits needed to produce `output_num_blocks` blocks. + After the split blocks are generated accordingly, reduce tasks are scheduled + to combine split blocks together. 
+ """ + + def execute( + self, + refs: List[RefBundle], + output_num_blocks: int, + ctx: TaskContext, + map_ray_remote_args: Optional[Dict[str, Any]] = None, + reduce_ray_remote_args: Optional[Dict[str, Any]] = None, + ) -> Tuple[List[RefBundle], StatsDict]: + input_num_rows = 0 + input_owned_by_consumer = True + for ref_bundle in refs: + block_num_rows = ref_bundle.num_rows() + if block_num_rows is None: + raise ValueError( + "Cannot split partition on blocks with unknown number of rows." + ) + input_num_rows += block_num_rows + if not ref_bundle.owns_blocks: + input_owned_by_consumer = False + + # Compute the (output_num_blocks) indices needed for an equal split of the + # input blocks. When output_num_blocks=1, the total number of + # input rows is used as the end index during the split calculation, + # so that we can combine all input blocks into a single output block. + indices = [] + if output_num_blocks == 1: + indices = [input_num_rows] + else: + cur_idx = 0 + for _ in range(output_num_blocks - 1): + cur_idx += input_num_rows / output_num_blocks + indices.append(int(cur_idx)) + assert len(indices) <= output_num_blocks, (indices, output_num_blocks) + + if map_ray_remote_args is None: + map_ray_remote_args = {} + if reduce_ray_remote_args is None: + reduce_ray_remote_args = {} + if "scheduling_strategy" not in reduce_ray_remote_args: + reduce_ray_remote_args = reduce_ray_remote_args.copy() + reduce_ray_remote_args["scheduling_strategy"] = "SPREAD" + + blocks_with_metadata: List[Tuple[ObjectRef[Block], BlockMetadata]] = [] + for ref_bundle in refs: + blocks_with_metadata.extend(ref_bundle.blocks) + split_return = _split_at_indices( + blocks_with_metadata, indices, input_owned_by_consumer + ) + split_block_refs, split_metadata = [], [] + for b, m in zip(*split_return): + split_block_refs.append(b) + split_metadata.extend(m) + + sub_progress_bar_dict = ctx.sub_progress_bar_dict + bar_name = ShuffleTaskSpec.SPLIT_REPARTITION_SUB_PROGRESS_BAR_NAME + assert 
bar_name in sub_progress_bar_dict, sub_progress_bar_dict + reduce_bar = sub_progress_bar_dict[bar_name] + + reduce_task = cached_remote_fn(self._exchange_spec.reduce) + reduce_return = [ + reduce_task.options(**reduce_ray_remote_args, num_returns=2).remote( + *self._exchange_spec._reduce_args, + *split_block_refs[j], + ) + for j in range(output_num_blocks) + # Only process splits which contain blocks. + if len(split_block_refs[j]) > 0 + ] + + reduce_block_refs, reduce_metadata = zip(*reduce_return) + reduce_metadata = reduce_bar.fetch_until_complete(list(reduce_metadata)) + reduce_block_refs, reduce_metadata = list(reduce_block_refs), list( + reduce_metadata + ) + + # Handle empty blocks. + if len(reduce_block_refs) < output_num_blocks: + import pyarrow as pa + + from ray.data._internal.arrow_block import ArrowBlockBuilder + from ray.data._internal.pandas_block import ( + PandasBlockBuilder, + PandasBlockSchema, + ) + + num_empty_blocks = output_num_blocks - len(reduce_block_refs) + first_block_schema = reduce_metadata[0].schema + if first_block_schema is None: + raise ValueError( + "Cannot split partition on blocks with unknown block format." + ) + elif isinstance(first_block_schema, pa.Schema): + builder = ArrowBlockBuilder() + elif isinstance(first_block_schema, PandasBlockSchema): + builder = PandasBlockBuilder() + empty_block = builder.build() + empty_meta = BlockAccessor.for_block(empty_block).get_metadata( + exec_stats=None + ) # No stats for empty block. 
+ empty_block_refs, empty_metadata = zip( + *[(ray.put(empty_block), empty_meta) for _ in range(num_empty_blocks)] + ) + reduce_block_refs.extend(empty_block_refs) + reduce_metadata.extend(empty_metadata) + + output = [] + for block, meta in zip(reduce_block_refs, reduce_metadata): + output.append( + RefBundle([(block, meta)], owns_blocks=input_owned_by_consumer) + ) + stats = { + "split": split_metadata, + "reduce": reduce_metadata, + } + + return (output, stats) diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/planner/plan_all_to_all_op.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/planner/plan_all_to_all_op.py new file mode 100644 index 0000000000000000000000000000000000000000..7677ab6a0fc00d90aea7cae22a9ff882c093b140 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/planner/plan_all_to_all_op.py @@ -0,0 +1,94 @@ +from typing import List + +from ray.data._internal.execution.interfaces import PhysicalOperator +from ray.data._internal.execution.operators.base_physical_operator import ( + AllToAllOperator, +) +from ray.data._internal.logical.operators.all_to_all_operator import ( + AbstractAllToAll, + Aggregate, + RandomizeBlocks, + RandomShuffle, + Repartition, + Sort, +) +from ray.data._internal.planner.aggregate import generate_aggregate_fn +from ray.data._internal.planner.random_shuffle import generate_random_shuffle_fn +from ray.data._internal.planner.randomize_blocks import generate_randomize_blocks_fn +from ray.data._internal.planner.repartition import generate_repartition_fn +from ray.data._internal.planner.sort import generate_sort_fn +from ray.data.context import DataContext + + +def plan_all_to_all_op( + op: AbstractAllToAll, + physical_children: List[PhysicalOperator], + data_context: DataContext, +) -> AllToAllOperator: + """Get the corresponding physical operators DAG for AbstractAllToAll operators. + + Note this method only converts the given `op`, but not its input dependencies. 
+ See Planner.plan() for more details. + """ + assert len(physical_children) == 1 + input_physical_dag = physical_children[0] + + target_max_block_size = None + if isinstance(op, RandomizeBlocks): + fn = generate_randomize_blocks_fn(op) + # Randomize block order does not actually compute anything, so we + # want to inherit the upstream op's target max block size. + elif isinstance(op, RandomShuffle): + debug_limit_shuffle_execution_to_num_blocks = data_context.get_config( + "debug_limit_shuffle_execution_to_num_blocks", None + ) + fn = generate_random_shuffle_fn( + op._seed, + op._num_outputs, + op._ray_remote_args, + debug_limit_shuffle_execution_to_num_blocks, + ) + target_max_block_size = data_context.target_shuffle_max_block_size + elif isinstance(op, Repartition): + debug_limit_shuffle_execution_to_num_blocks = None + if op._shuffle: + target_max_block_size = data_context.target_shuffle_max_block_size + debug_limit_shuffle_execution_to_num_blocks = data_context.get_config( + "debug_limit_shuffle_execution_to_num_blocks", None + ) + fn = generate_repartition_fn( + op._num_outputs, + op._shuffle, + debug_limit_shuffle_execution_to_num_blocks, + ) + elif isinstance(op, Sort): + debug_limit_shuffle_execution_to_num_blocks = data_context.get_config( + "debug_limit_shuffle_execution_to_num_blocks", None + ) + fn = generate_sort_fn( + op._sort_key, op._batch_format, debug_limit_shuffle_execution_to_num_blocks + ) + target_max_block_size = data_context.target_shuffle_max_block_size + elif isinstance(op, Aggregate): + debug_limit_shuffle_execution_to_num_blocks = data_context.get_config( + "debug_limit_shuffle_execution_to_num_blocks", None + ) + fn = generate_aggregate_fn( + op._key, + op._aggs, + op._batch_format, + debug_limit_shuffle_execution_to_num_blocks, + ) + target_max_block_size = data_context.target_shuffle_max_block_size + else: + raise ValueError(f"Found unknown logical operator during planning: {op}") + + return AllToAllOperator( + fn, + 
input_physical_dag, + data_context, + target_max_block_size=target_max_block_size, + num_outputs=op._num_outputs, + sub_progress_bar_names=op._sub_progress_bar_names, + name=op.name, + ) diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/planner/plan_from_arrow_op.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/planner/plan_from_arrow_op.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/planner/plan_from_numpy_op.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/planner/plan_from_numpy_op.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/planner/plan_udf_map_op.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/planner/plan_udf_map_op.py new file mode 100644 index 0000000000000000000000000000000000000000..feea97b52ab99f604aaa8de34dad1fb667d12f99 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/planner/plan_udf_map_op.py @@ -0,0 +1,648 @@ +import asyncio +import collections +import inspect +import queue +from threading import Thread +from types import GeneratorType +from typing import Any, Callable, Iterable, Iterator, List, Optional + +import numpy as np +import pandas as pd +import pyarrow as pa + +import ray +from ray._private.utils import get_or_create_event_loop +from ray.data._internal.compute import get_compute +from ray.data._internal.execution.interfaces import PhysicalOperator +from ray.data._internal.execution.interfaces.task_context import TaskContext +from ray.data._internal.execution.operators.map_operator import MapOperator +from ray.data._internal.execution.operators.map_transformer import ( + BatchMapTransformFn, + BlockMapTransformFn, + BlocksToBatchesMapTransformFn, + BlocksToRowsMapTransformFn, + 
BuildOutputBlocksMapTransformFn, + MapTransformCallable, + MapTransformer, + Row, + RowMapTransformFn, +) +from ray.data._internal.execution.util import make_callable_class_concurrent +from ray.data._internal.logical.operators.map_operator import ( + AbstractUDFMap, + Filter, + FlatMap, + MapBatches, + MapRows, + Project, +) +from ray.data._internal.numpy_support import is_valid_udf_return +from ray.data._internal.util import _truncated_repr +from ray.data.block import ( + Block, + BlockAccessor, + BlockType, + CallableClass, + DataBatch, + UserDefinedFunction, +) +from ray.data.context import DataContext +from ray.data.exceptions import UserCodeException +from ray.util.rpdb import _is_ray_debugger_post_mortem_enabled + + +class _MapActorContext: + def __init__( + self, + udf_map_cls: UserDefinedFunction, + udf_map_fn: Callable[[Any], Any], + is_async: bool, + ): + self.udf_map_cls = udf_map_cls + self.udf_map_fn = udf_map_fn + self.is_async = is_async + self.udf_map_asyncio_loop = None + self.udf_map_asyncio_thread = None + + if is_async: + self._init_async() + + def _init_async(self): + # Only used for callable class with async generator `__call__` method. + loop = get_or_create_event_loop() + + def run_loop(): + asyncio.set_event_loop(loop) + loop.run_forever() + + thread = Thread(target=run_loop) + thread.start() + self.udf_map_asyncio_loop = loop + self.udf_map_asyncio_thread = thread + + +def plan_project_op( + op: Project, + physical_children: List[PhysicalOperator], + data_context: DataContext, +) -> MapOperator: + assert len(physical_children) == 1 + input_physical_dag = physical_children[0] + + columns = op.cols + columns_rename = op.cols_rename + + def fn(block: Block) -> Block: + try: + if BlockAccessor.for_block(block).block_type() == BlockType.PANDAS: + # TODO (srinathk) PandasBlockAccessor combine method needs to handle + # None types correctly. Until then, convert to Arrow Table. 
+ block = BlockAccessor.for_block(block).to_arrow() + if not BlockAccessor.for_block(block).num_rows(): + return block + if columns: + block = BlockAccessor.for_block(block).select(columns) + if columns_rename: + block = block.rename_columns( + [columns_rename.get(col, col) for col in block.schema.names] + ) + return block + except Exception as e: + _handle_debugger_exception(e) + + compute = get_compute(op._compute) + transform_fn = _generate_transform_fn_for_map_block(fn) + map_transformer = _create_map_transformer_for_block_based_map_op( + transform_fn, + ) + + return MapOperator.create( + map_transformer, + input_physical_dag, + data_context, + name=op.name, + compute_strategy=compute, + ray_remote_args=op._ray_remote_args, + ray_remote_args_fn=op._ray_remote_args_fn, + ) + + +def plan_filter_op( + op: Filter, + physical_children: List[PhysicalOperator], + data_context: DataContext, +) -> MapOperator: + assert len(physical_children) == 1 + input_physical_dag = physical_children[0] + + expression = op._filter_expr + compute = get_compute(op._compute) + if expression is not None: + + def filter_batch_fn(block: "pa.Table") -> "pa.Table": + try: + return block.filter(expression) + except Exception as e: + _handle_debugger_exception(e) + + transform_fn = _generate_transform_fn_for_map_batches(filter_batch_fn) + map_transformer = _create_map_transformer_for_map_batches_op( + transform_fn, + batch_size=None, + batch_format="pyarrow", + zero_copy_batch=True, + ) + else: + filter_fn, init_fn = _parse_op_fn(op) + transform_fn = _generate_transform_fn_for_filter(filter_fn) + map_transformer = _create_map_transformer_for_row_based_map_op( + transform_fn, init_fn + ) + + return MapOperator.create( + map_transformer, + input_physical_dag, + data_context, + name=op.name, + compute_strategy=compute, + ray_remote_args=op._ray_remote_args, + ray_remote_args_fn=op._ray_remote_args_fn, + ) + + +def plan_udf_map_op( + op: AbstractUDFMap, + physical_children: List[PhysicalOperator], 
+ data_context: DataContext, +) -> MapOperator: + """Get the corresponding physical operators DAG for AbstractUDFMap operators. + + Note this method only converts the given `op`, but not its input dependencies. + See Planner.plan() for more details. + """ + assert len(physical_children) == 1 + input_physical_dag = physical_children[0] + + compute = get_compute(op._compute) + fn, init_fn = _parse_op_fn(op) + + if isinstance(op, MapBatches): + transform_fn = _generate_transform_fn_for_map_batches(fn) + map_transformer = _create_map_transformer_for_map_batches_op( + transform_fn, + op._batch_size, + op._batch_format, + op._zero_copy_batch, + init_fn, + ) + else: + if isinstance(op, MapRows): + transform_fn = _generate_transform_fn_for_map_rows(fn) + elif isinstance(op, FlatMap): + transform_fn = _generate_transform_fn_for_flat_map(fn) + else: + raise ValueError(f"Found unknown logical operator during planning: {op}") + + map_transformer = _create_map_transformer_for_row_based_map_op( + transform_fn, init_fn + ) + + return MapOperator.create( + map_transformer, + input_physical_dag, + data_context, + name=op.name, + target_max_block_size=None, + compute_strategy=compute, + min_rows_per_bundle=op._min_rows_per_bundled_input, + ray_remote_args_fn=op._ray_remote_args_fn, + ray_remote_args=op._ray_remote_args, + ) + + +def _parse_op_fn(op: AbstractUDFMap): + # Note, it's important to define these standalone variables. + # So the parsed functions won't need to caputure the entire operator, which may not + # be serializable. 
+ op_fn = op._fn + fn_args = op._fn_args or () + fn_kwargs = op._fn_kwargs or {} + + if isinstance(op._fn, CallableClass): + fn_constructor_args = op._fn_constructor_args or () + fn_constructor_kwargs = op._fn_constructor_kwargs or {} + + is_async_gen = inspect.isasyncgenfunction(op._fn.__call__) + + # TODO(scottjlee): (1) support non-generator async functions + # (2) make the map actor async + if not is_async_gen: + op_fn = make_callable_class_concurrent(op_fn) + + def init_fn(): + if ray.data._map_actor_context is None: + ray.data._map_actor_context = _MapActorContext( + udf_map_cls=op_fn, + udf_map_fn=op_fn( + *fn_constructor_args, + **fn_constructor_kwargs, + ), + is_async=is_async_gen, + ) + + if is_async_gen: + + async def fn(item: Any) -> Any: + assert ray.data._map_actor_context is not None + assert ray.data._map_actor_context.is_async + + try: + return ray.data._map_actor_context.udf_map_fn( + item, + *fn_args, + **fn_kwargs, + ) + except Exception as e: + _handle_debugger_exception(e) + + else: + + def fn(item: Any) -> Any: + assert ray.data._map_actor_context is not None + assert not ray.data._map_actor_context.is_async + try: + return ray.data._map_actor_context.udf_map_fn( + item, + *fn_args, + **fn_kwargs, + ) + except Exception as e: + _handle_debugger_exception(e) + + else: + + def fn(item: Any) -> Any: + try: + return op_fn(item, *fn_args, **fn_kwargs) + except Exception as e: + _handle_debugger_exception(e) + + def init_fn(): + pass + + return fn, init_fn + + +def _handle_debugger_exception(e: Exception): + """If the Ray Debugger is enabled, keep the full stack trace unmodified + so that the debugger can stop at the initial unhandled exception. 
+ Otherwise, clear the stack trace to omit noisy internal code path.""" + ctx = ray.data.DataContext.get_current() + if _is_ray_debugger_post_mortem_enabled() or ctx.raise_original_map_exception: + raise e + else: + raise UserCodeException() from e + + +# Following are util functions for converting UDFs to `MapTransformCallable`s. + + +def _validate_batch_output(batch: Block) -> None: + if not isinstance( + batch, + ( + list, + pa.Table, + np.ndarray, + collections.abc.Mapping, + pd.core.frame.DataFrame, + dict, + ), + ): + raise ValueError( + "The `fn` you passed to `map_batches` returned a value of type " + f"{type(batch)}. This isn't allowed -- `map_batches` expects " + "`fn` to return a `pandas.DataFrame`, `pyarrow.Table`, " + "`numpy.ndarray`, `list`, or `dict[str, numpy.ndarray]`." + ) + + if isinstance(batch, list): + raise ValueError( + f"Error validating {_truncated_repr(batch)}: " + "Returning a list of objects from `map_batches` is not " + "allowed in Ray 2.5. To return Python objects, " + "wrap them in a named dict field, e.g., " + "return `{'results': objects}` instead of just `objects`." + ) + + if isinstance(batch, collections.abc.Mapping): + for key, value in list(batch.items()): + if not is_valid_udf_return(value): + raise ValueError( + f"Error validating {_truncated_repr(batch)}: " + "The `fn` you passed to `map_batches` returned a " + f"`dict`. `map_batches` expects all `dict` values " + f"to be `list` or `np.ndarray` type, but the value " + f"corresponding to key {key!r} is of type " + f"{type(value)}. To fix this issue, convert " + f"the {type(value)} to a `np.ndarray`." + ) + + +def _generate_transform_fn_for_map_batches( + fn: UserDefinedFunction, +) -> MapTransformCallable[DataBatch, DataBatch]: + if inspect.iscoroutinefunction(fn): + # UDF is a callable class with async generator `__call__` method. 
+ transform_fn = _generate_transform_fn_for_async_map_batches(fn) + + else: + + def transform_fn( + batches: Iterable[DataBatch], _: TaskContext + ) -> Iterable[DataBatch]: + for batch in batches: + try: + if ( + not isinstance(batch, collections.abc.Mapping) + and BlockAccessor.for_block(batch).num_rows() == 0 + ): + # For empty input blocks, we directly ouptut them without + # calling the UDF. + # TODO(hchen): This workaround is because some all-to-all + # operators output empty blocks with no schema. + res = [batch] + else: + res = fn(batch) + if not isinstance(res, GeneratorType): + res = [res] + except ValueError as e: + read_only_msgs = [ + "assignment destination is read-only", + "buffer source array is read-only", + ] + err_msg = str(e) + if any(msg in err_msg for msg in read_only_msgs): + raise ValueError( + f"Batch mapper function {fn.__name__} tried to mutate a " + "zero-copy read-only batch. To be able to mutate the " + "batch, pass zero_copy_batch=False to map_batches(); " + "this will create a writable copy of the batch before " + "giving it to fn. To elide this copy, modify your mapper " + "function so it doesn't try to mutate its input." + ) from e + else: + raise e from None + else: + for out_batch in res: + _validate_batch_output(out_batch) + yield out_batch + + return transform_fn + + +def _generate_transform_fn_for_async_map_batches( + fn: UserDefinedFunction, +) -> MapTransformCallable[DataBatch, DataBatch]: + def transform_fn( + input_iterable: Iterable[DataBatch], _: TaskContext + ) -> Iterable[DataBatch]: + # Use a queue to store outputs from async generator calls. + # We will put output batches into this queue from async + # generators, and in the main event loop, yield them from + # the queue as they become available. + output_batch_queue = queue.Queue() + # Sentinel object to signal the end of the async generator. 
+ sentinel = object() + + async def process_batch(batch: DataBatch): + try: + output_batch_iterator = await fn(batch) + # As soon as results become available from the async generator, + # put them into the result queue so they can be yielded. + async for output_batch in output_batch_iterator: + output_batch_queue.put(output_batch) + except Exception as e: + output_batch_queue.put( + e + ) # Put the exception into the queue to signal an error + + async def process_all_batches(): + try: + loop = ray.data._map_actor_context.udf_map_asyncio_loop + tasks = [loop.create_task(process_batch(x)) for x in input_iterable] + + ctx = ray.data.DataContext.get_current() + if ctx.execution_options.preserve_order: + for task in tasks: + await task() + else: + for task in asyncio.as_completed(tasks): + await task + finally: + output_batch_queue.put(sentinel) + + # Use the existing event loop to create and run Tasks to process each batch + loop = ray.data._map_actor_context.udf_map_asyncio_loop + asyncio.run_coroutine_threadsafe(process_all_batches(), loop) + + # Yield results as they become available. + while True: + # Here, `out_batch` is a one-row output batch + # from the async generator, corresponding to a + # single row from the input batch. + out_batch = output_batch_queue.get() + if out_batch is sentinel: + # Break out of the loop when the sentinel is received. + break + if isinstance(out_batch, Exception): + raise out_batch + _validate_batch_output(out_batch) + yield out_batch + + return transform_fn + + +def _validate_row_output(item): + if not isinstance(item, collections.abc.Mapping): + raise ValueError( + f"Error validating {_truncated_repr(item)}: " + "Standalone Python objects are not " + "allowed in Ray 2.5. To return Python objects from map(), " + "wrap them in a dict, e.g., " + "return `{'item': item}` instead of just `item`." 
+ ) + + +def _generate_transform_fn_for_map_rows( + fn: UserDefinedFunction, +) -> MapTransformCallable[Row, Row]: + def transform_fn(rows: Iterable[Row], _: TaskContext) -> Iterable[Row]: + for row in rows: + out_row = fn(row) + _validate_row_output(out_row) + yield out_row + + return transform_fn + + +def _generate_transform_fn_for_flat_map( + fn: UserDefinedFunction, +) -> MapTransformCallable[Row, Row]: + def transform_fn(rows: Iterable[Row], _: TaskContext) -> Iterable[Row]: + for row in rows: + for out_row in fn(row): + _validate_row_output(out_row) + yield out_row + + return transform_fn + + +def _generate_transform_fn_for_filter( + fn: UserDefinedFunction, +) -> MapTransformCallable[Row, Row]: + def transform_fn(rows: Iterable[Row], _: TaskContext) -> Iterable[Row]: + for row in rows: + if fn(row): + yield row + + return transform_fn + + +def _generate_transform_fn_for_map_block( + fn: UserDefinedFunction, +) -> MapTransformCallable[Block, Block]: + def transform_fn(blocks: Iterable[Block], _: TaskContext) -> Iterable[Block]: + for block in blocks: + out_block = fn(block) + yield out_block + + return transform_fn + + +# Following are util functions for creating `MapTransformer`s. + + +def _create_map_transformer_for_map_batches_op( + batch_fn: MapTransformCallable[DataBatch, DataBatch], + batch_size: Optional[int] = None, + batch_format: str = "default", + zero_copy_batch: bool = False, + init_fn: Optional[Callable[[], None]] = None, +) -> MapTransformer: + """Create a MapTransformer for a map_batches operator.""" + transform_fns = [ + # Convert input blocks to batches. + BlocksToBatchesMapTransformFn( + batch_size=batch_size, + batch_format=batch_format, + zero_copy_batch=zero_copy_batch, + ), + # Apply the UDF. + BatchMapTransformFn(batch_fn, is_udf=True), + # Convert output batches to blocks. 
+ BuildOutputBlocksMapTransformFn.for_batches(), + ] + return MapTransformer(transform_fns, init_fn) + + +def _create_map_transformer_for_row_based_map_op( + row_fn: MapTransformCallable[Row, Row], + init_fn: Optional[Callable[[], None]] = None, +) -> MapTransformer: + """Create a MapTransformer for a row-based map operator + (e.g. map, flat_map, filter).""" + transform_fns = [ + # Convert input blocks to rows. + BlocksToRowsMapTransformFn.instance(), + # Apply the UDF. + RowMapTransformFn(row_fn, is_udf=True), + # Convert output rows to blocks. + BuildOutputBlocksMapTransformFn.for_rows(), + ] + return MapTransformer(transform_fns, init_fn=init_fn) + + +def _create_map_transformer_for_block_based_map_op( + block_fn: MapTransformCallable[Block, Block], + init_fn: Optional[Callable[[], None]] = None, +) -> MapTransformer: + """Create a MapTransformer for a block-based map operator.""" + transform_fns = [ + # Apply the UDF. + BlockMapTransformFn(block_fn), + BuildOutputBlocksMapTransformFn.for_blocks(), + ] + return MapTransformer(transform_fns, init_fn=init_fn) + + +# Following are util functions for the legacy code path. + + +def generate_map_rows_fn( + target_max_block_size: int, +) -> Callable[[Iterator[Block], TaskContext, UserDefinedFunction], Iterator[Block]]: + """Generate function to apply the UDF to each record of blocks.""" + + def fn( + blocks: Iterator[Block], + ctx: TaskContext, + row_fn: UserDefinedFunction, + ) -> Iterator[Block]: + transform_fn = _generate_transform_fn_for_map_rows(row_fn) + map_transformer = _create_map_transformer_for_row_based_map_op(transform_fn) + map_transformer.set_target_max_block_size(target_max_block_size) + yield from map_transformer.apply_transform(blocks, ctx) + + return fn + + +def generate_flat_map_fn( + target_max_block_size: int, +) -> Callable[[Iterator[Block], TaskContext, UserDefinedFunction], Iterator[Block]]: + """Generate function to apply the UDF to each record of blocks, + and then flatten results. 
+ """ + + def fn( + blocks: Iterator[Block], + ctx: TaskContext, + row_fn: UserDefinedFunction, + ) -> Iterator[Block]: + transform_fn = _generate_transform_fn_for_flat_map(row_fn) + map_transformer = _create_map_transformer_for_row_based_map_op(transform_fn) + map_transformer.set_target_max_block_size(target_max_block_size) + yield from map_transformer.apply_transform(blocks, ctx) + + return fn + + +def generate_map_batches_fn( + target_max_block_size: int, + batch_size: Optional[int] = None, + batch_format: str = "default", + zero_copy_batch: bool = False, +) -> Callable[[Iterator[Block], TaskContext, UserDefinedFunction], Iterator[Block]]: + """Generate function to apply the batch UDF to blocks.""" + + def fn( + blocks: Iterable[Block], + ctx: TaskContext, + batch_fn: UserDefinedFunction, + *fn_args, + **fn_kwargs, + ) -> Iterator[Block]: + def _batch_fn(batch): + return batch_fn(batch, *fn_args, **fn_kwargs) + + transform_fn = _generate_transform_fn_for_map_batches(_batch_fn) + map_transformer = _create_map_transformer_for_map_batches_op( + transform_fn, + batch_size, + batch_format, + zero_copy_batch, + ) + map_transformer.set_target_max_block_size(target_max_block_size) + yield from map_transformer.apply_transform(blocks, ctx) + + return fn diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/planner/plan_write_op.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/planner/plan_write_op.py new file mode 100644 index 0000000000000000000000000000000000000000..8efeda39973f23f1f46a0d47d3f3e706fcbf9020 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/planner/plan_write_op.py @@ -0,0 +1,110 @@ +import itertools +from typing import Callable, Iterator, List, Union + +from pandas import DataFrame + +from ray.data._internal.compute import TaskPoolStrategy +from ray.data._internal.execution.interfaces import PhysicalOperator +from ray.data._internal.execution.interfaces.task_context import TaskContext +from 
ray.data._internal.execution.operators.map_operator import MapOperator +from ray.data._internal.execution.operators.map_transformer import ( + BlockMapTransformFn, + MapTransformer, +) +from ray.data._internal.logical.operators.write_operator import Write +from ray.data.block import Block, BlockAccessor +from ray.data.context import DataContext +from ray.data.datasource.datasink import Datasink, WriteResult +from ray.data.datasource.datasource import Datasource + + +def gen_datasink_write_result( + write_result_blocks: List[Block], +) -> WriteResult: + assert all( + isinstance(block, DataFrame) and len(block) == 1 + for block in write_result_blocks + ) + total_num_rows = sum(result["num_rows"].sum() for result in write_result_blocks) + total_size_bytes = sum(result["size_bytes"].sum() for result in write_result_blocks) + + write_returns = [result["write_return"][0] for result in write_result_blocks] + return WriteResult(total_num_rows, total_size_bytes, write_returns) + + +def generate_write_fn( + datasink_or_legacy_datasource: Union[Datasink, Datasource], **write_args +) -> Callable[[Iterator[Block], TaskContext], Iterator[Block]]: + def fn(blocks: Iterator[Block], ctx: TaskContext) -> Iterator[Block]: + """Writes the blocks to the given datasink or legacy datasource. + + Outputs the original blocks to be written.""" + # Create a copy of the iterator, so we can return the original blocks. + it1, it2 = itertools.tee(blocks, 2) + if isinstance(datasink_or_legacy_datasource, Datasink): + ctx.kwargs["_datasink_write_return"] = datasink_or_legacy_datasource.write( + it1, ctx + ) + else: + datasink_or_legacy_datasource.write(it1, ctx, **write_args) + + return it2 + + return fn + + +def generate_collect_write_stats_fn() -> ( + Callable[[Iterator[Block], TaskContext], Iterator[Block]] +): + # If the write op succeeds, the resulting Dataset is a list of + # one Block which contain stats/metrics about the write. + # Otherwise, an error will be raised. 
The Datasource can handle + # execution outcomes with `on_write_complete()`` and `on_write_failed()``. + def fn(blocks: Iterator[Block], ctx: TaskContext) -> Iterator[Block]: + """Handles stats collection for block writes.""" + block_accessors = [BlockAccessor.for_block(block) for block in blocks] + total_num_rows = sum(ba.num_rows() for ba in block_accessors) + total_size_bytes = sum(ba.size_bytes() for ba in block_accessors) + + # NOTE: Write tasks can return anything, so we need to wrap it in a valid block + # type. + import pandas as pd + + block = pd.DataFrame( + { + "num_rows": [total_num_rows], + "size_bytes": [total_size_bytes], + "write_return": [ctx.kwargs.get("_datasink_write_return", None)], + } + ) + return iter([block]) + + return fn + + +def plan_write_op( + op: Write, + physical_children: List[PhysicalOperator], + data_context: DataContext, +) -> PhysicalOperator: + assert len(physical_children) == 1 + input_physical_dag = physical_children[0] + + write_fn = generate_write_fn(op._datasink_or_legacy_datasource, **op._write_args) + collect_stats_fn = generate_collect_write_stats_fn() + # Create a MapTransformer for a write operator + transform_fns = [ + BlockMapTransformFn(write_fn), + BlockMapTransformFn(collect_stats_fn), + ] + map_transformer = MapTransformer(transform_fns) + return MapOperator.create( + map_transformer, + input_physical_dag, + data_context, + name="Write", + target_max_block_size=None, + ray_remote_args=op._ray_remote_args, + min_rows_per_bundle=op._min_rows_per_bundled_input, + compute_strategy=TaskPoolStrategy(op._concurrency), + )