koichi12 commited on
Commit
7cace8b
·
verified ·
1 Parent(s): 110275e

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because the commit contains too many changes. See the raw diff for the full change set.
Files changed (50) hide show
  1. .gitattributes +2 -0
  2. .venv/lib/python3.11/site-packages/ray/data/__pycache__/dataset.cpython-311.pyc +3 -0
  3. .venv/lib/python3.11/site-packages/ray/data/__pycache__/read_api.cpython-311.pyc +3 -0
  4. .venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/__init__.py +3 -0
  5. .venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/__pycache__/__init__.cpython-311.pyc +0 -0
  6. .venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/__pycache__/block_batching.cpython-311.pyc +0 -0
  7. .venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/__pycache__/interfaces.cpython-311.pyc +0 -0
  8. .venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/__pycache__/iter_batches.cpython-311.pyc +0 -0
  9. .venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/__pycache__/util.cpython-311.pyc +0 -0
  10. .venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/iter_batches.py +322 -0
  11. .venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/util.py +293 -0
  12. .venv/lib/python3.11/site-packages/ray/data/_internal/execution/__init__.py +0 -0
  13. .venv/lib/python3.11/site-packages/ray/data/_internal/execution/__pycache__/__init__.cpython-311.pyc +0 -0
  14. .venv/lib/python3.11/site-packages/ray/data/_internal/execution/autoscaler/__init__.py +15 -0
  15. .venv/lib/python3.11/site-packages/ray/data/_internal/execution/autoscaler/__pycache__/autoscaling_actor_pool.cpython-311.pyc +0 -0
  16. .venv/lib/python3.11/site-packages/ray/data/_internal/execution/autoscaler/autoscaler.py +44 -0
  17. .venv/lib/python3.11/site-packages/ray/data/_internal/execution/autoscaler/autoscaling_actor_pool.py +94 -0
  18. .venv/lib/python3.11/site-packages/ray/data/_internal/execution/autoscaler/default_autoscaler.py +188 -0
  19. .venv/lib/python3.11/site-packages/ray/data/_internal/execution/autoscaling_requester.py +131 -0
  20. .venv/lib/python3.11/site-packages/ray/data/_internal/execution/backpressure_policy/__init__.py +32 -0
  21. .venv/lib/python3.11/site-packages/ray/data/_internal/execution/backpressure_policy/__pycache__/__init__.cpython-311.pyc +0 -0
  22. .venv/lib/python3.11/site-packages/ray/data/_internal/execution/backpressure_policy/__pycache__/backpressure_policy.cpython-311.pyc +0 -0
  23. .venv/lib/python3.11/site-packages/ray/data/_internal/execution/backpressure_policy/__pycache__/concurrency_cap_backpressure_policy.cpython-311.pyc +0 -0
  24. .venv/lib/python3.11/site-packages/ray/data/_internal/execution/backpressure_policy/backpressure_policy.py +28 -0
  25. .venv/lib/python3.11/site-packages/ray/data/_internal/execution/backpressure_policy/concurrency_cap_backpressure_policy.py +43 -0
  26. .venv/lib/python3.11/site-packages/ray/data/_internal/execution/bundle_queue/__init__.py +9 -0
  27. .venv/lib/python3.11/site-packages/ray/data/_internal/execution/bundle_queue/__pycache__/__init__.cpython-311.pyc +0 -0
  28. .venv/lib/python3.11/site-packages/ray/data/_internal/execution/bundle_queue/__pycache__/bundle_queue.cpython-311.pyc +0 -0
  29. .venv/lib/python3.11/site-packages/ray/data/_internal/execution/bundle_queue/__pycache__/fifo_bundle_queue.cpython-311.pyc +0 -0
  30. .venv/lib/python3.11/site-packages/ray/data/_internal/execution/bundle_queue/bundle_queue.py +62 -0
  31. .venv/lib/python3.11/site-packages/ray/data/_internal/execution/execution_callback.py +40 -0
  32. .venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/__init__.py +19 -0
  33. .venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/common.py +2 -0
  34. .venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/execution_options.py +300 -0
  35. .venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/executor.py +77 -0
  36. .venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/op_runtime_metrics.py +574 -0
  37. .venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/physical_operator.py +535 -0
  38. .venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/ref_bundle.py +136 -0
  39. .venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/task_context.py +44 -0
  40. .venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/transform_fn.py +10 -0
  41. .venv/lib/python3.11/site-packages/ray/data/_internal/execution/legacy_compat.py +181 -0
  42. .venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__init__.py +0 -0
  43. .venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/__init__.cpython-311.pyc +0 -0
  44. .venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/actor_pool_map_operator.cpython-311.pyc +0 -0
  45. .venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/aggregate_num_rows.cpython-311.pyc +0 -0
  46. .venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/base_physical_operator.cpython-311.pyc +0 -0
  47. .venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/input_data_buffer.cpython-311.pyc +0 -0
  48. .venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/limit_operator.cpython-311.pyc +0 -0
  49. .venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/map_operator.cpython-311.pyc +0 -0
  50. .venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/map_transformer.cpython-311.pyc +0 -0
.gitattributes CHANGED
@@ -152,3 +152,5 @@ tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/_
152
  .venv/lib/python3.11/site-packages/nvidia/cufft/lib/libcufft.so.11 filter=lfs diff=lfs merge=lfs -text
153
  .venv/lib/python3.11/site-packages/torchgen/__pycache__/model.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
154
  .venv/lib/python3.11/site-packages/mistral_common/data/tekken_240911.json filter=lfs diff=lfs merge=lfs -text
 
 
 
152
  .venv/lib/python3.11/site-packages/nvidia/cufft/lib/libcufft.so.11 filter=lfs diff=lfs merge=lfs -text
153
  .venv/lib/python3.11/site-packages/torchgen/__pycache__/model.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
154
  .venv/lib/python3.11/site-packages/mistral_common/data/tekken_240911.json filter=lfs diff=lfs merge=lfs -text
155
+ .venv/lib/python3.11/site-packages/ray/data/__pycache__/dataset.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
156
+ .venv/lib/python3.11/site-packages/ray/data/__pycache__/read_api.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
.venv/lib/python3.11/site-packages/ray/data/__pycache__/dataset.cpython-311.pyc ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c0b1a74e1674205ec83807b353da73daa79d781531cd64ecbd818fd5438ec680
3
+ size 255996
.venv/lib/python3.11/site-packages/ray/data/__pycache__/read_api.cpython-311.pyc ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a217bcdb2fd53d64e0014e4fd153627ade902228eadc09fe7df65ee93c07bc05
3
+ size 160644
.venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from ray.data._internal.block_batching.block_batching import batch_blocks
2
+
3
+ __all__ = ["batch_blocks"]
.venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (331 Bytes). View file
 
.venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/__pycache__/block_batching.cpython-311.pyc ADDED
Binary file (3 kB). View file
 
.venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/__pycache__/interfaces.cpython-311.pyc ADDED
Binary file (2.53 kB). View file
 
.venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/__pycache__/iter_batches.cpython-311.pyc ADDED
Binary file (15.1 kB). View file
 
.venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/__pycache__/util.cpython-311.pyc ADDED
Binary file (17.6 kB). View file
 
.venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/iter_batches.py ADDED
@@ -0,0 +1,322 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import collections
2
+ from contextlib import nullcontext
3
+ from typing import Any, Callable, Dict, Iterator, Optional
4
+
5
+ import ray
6
+ from ray.data._internal.block_batching.interfaces import Batch, BlockPrefetcher
7
+ from ray.data._internal.block_batching.util import (
8
+ ActorBlockPrefetcher,
9
+ WaitBlockPrefetcher,
10
+ blocks_to_batches,
11
+ collate,
12
+ extract_data_from_batch,
13
+ finalize_batches,
14
+ format_batches,
15
+ resolve_block_refs,
16
+ )
17
+ from ray.data._internal.execution.interfaces.ref_bundle import RefBundle
18
+ from ray.data._internal.memory_tracing import trace_deallocation
19
+ from ray.data._internal.stats import DatasetStats
20
+ from ray.data._internal.util import make_async_gen
21
+ from ray.data.block import Block, DataBatch
22
+ from ray.data.context import DataContext
23
+ from ray.types import ObjectRef
24
+
25
+
26
def iter_batches(
    ref_bundles: Iterator[RefBundle],
    *,
    stats: Optional[DatasetStats] = None,
    clear_block_after_read: bool = False,
    batch_size: Optional[int] = None,
    batch_format: Optional[str] = "default",
    drop_last: bool = False,
    collate_fn: Optional[Callable[[DataBatch], Any]] = None,
    finalize_fn: Optional[Callable[[Any], Any]] = None,
    shuffle_buffer_min_size: Optional[int] = None,
    shuffle_seed: Optional[int] = None,
    ensure_copy: bool = False,
    prefetch_batches: int = 1,
) -> Iterator[DataBatch]:
    """Create formatted batches of data from an iterator of block object references
    and corresponding metadata.

    This takes a block iterator and creates batch_size batches, slicing,
    unioning, shuffling, prefetching, and formatting blocks as needed.

    The algorithm uses both pipeline parallelism and data parallelism:

    If prefetch_batches=2, these are all the batches in flight:

    [User thread] trains on Batch 0
        - [Fetch thread] Batch 1 finalization + move to output queue
        - [Worker thread 1] Batch 2 formatting + collating
        - [Worker thread 2] Batch 3 formatting + collating
        - [Raylet] Batches 4 + 5 fetched to local object store memory

    At any point in time there are prefetch_batches+1 batches in local heap memory,
    and the next set of prefetch_batches in local object store memory.

    The actual steps are as follows:

    In a single async thread, do the following:
        1. Trigger Ray local prefetching of `prefetch_batches` worth of block
           object references.
        2. Resolve (i.e. call `ray.get()`) on the block references.
        3. Perform the necessary batch slicing to construct full batches, possibly
           shuffling if necessary.
        4. Then, in a threadpool consisting of `prefetch_batches` threads:
            a. Format the batches to the provided batch format.
            b. Apply the collate function.
        5. Finalize each of the collated batches.
        6. Fetch outputs from the threadpool, maintaining order of the batches.

    Args:
        ref_bundles: An iterator over RefBundles.
        stats: DatasetStats object to record timing and other statistics.
        clear_block_after_read: Whether to clear the block from object store
            manually (i.e. without waiting for Python's automatic GC) after it
            is read. Doing so will reclaim memory faster and hence reduce the
            memory footprint. However, the caller has to ensure the safety, i.e.
            the block will never be accessed again.
        batch_size: Record batch size, or None to let the system pick.
        batch_format: The format in which to return each batch.
            Specify "default" to use the current block format (promoting
            Arrow to pandas automatically), "pandas" to
            select ``pandas.DataFrame`` or "pyarrow" to select
            ``pyarrow.Table``, or None to use entire blocks
            as batches. Default is "default".
        drop_last: Whether to drop the last batch if it's incomplete.
        collate_fn: A function to apply to each data batch before returning it.
        finalize_fn: A function to apply to each data batch after it has been
            collated. This function is not run in a threadpool so it can be used
            for memory-intensive operations such as GPU preloading.
        shuffle_buffer_min_size: If non-None, the data will be randomly shuffled
            using a local in-memory shuffle buffer, and this value will serve as
            the minimum number of rows that must be in the local in-memory
            shuffle buffer in order to yield a batch.
        shuffle_seed: The seed to use for the local random shuffle.
        ensure_copy: Whether batches are always copied from the underlying base
            blocks (not zero-copy views).
        prefetch_batches: The number of batches to fetch ahead of the current
            batch to process. If set to greater than 0, a separate thread will
            be used to fetch the specified amount of formatted batches from
            blocks. This improves performance for non-CPU bound UDFs, allowing
            batch fetching compute and formatting to be overlapped with the UDF.
            Defaults to 1.

    Returns:
        An iterator over record batches.
    """
    context = DataContext.get_current()

    # Prefer the actor-based prefetcher when enabled and not running through a
    # Ray client connection; otherwise fall back to the ray.wait-based one.
    if (
        prefetch_batches > 0
        and context.actor_prefetcher_enabled
        and not ray.util.client.ray.is_connected()
    ):
        prefetcher = ActorBlockPrefetcher()
    else:
        prefetcher = WaitBlockPrefetcher()

    # Blocks are eagerly freed only when both the caller requested it and the
    # DataContext allows eager freeing.
    eager_free = clear_block_after_read and DataContext.get_current().eager_free

    def _async_iter_batches(
        ref_bundles: Iterator[RefBundle],
    ) -> Iterator[DataBatch]:
        # Step 1: Prefetch logical batches locally.
        block_iter = prefetch_batches_locally(
            ref_bundles=ref_bundles,
            prefetcher=prefetcher,
            num_batches_to_prefetch=prefetch_batches,
            batch_size=batch_size,
            eager_free=eager_free,
        )

        # Step 2: Resolve the blocks.
        block_iter = resolve_block_refs(block_ref_iter=block_iter, stats=stats)

        # Step 3: Batch and shuffle the resolved blocks.
        batch_iter = blocks_to_batches(
            block_iter=block_iter,
            stats=stats,
            batch_size=batch_size,
            drop_last=drop_last,
            shuffle_buffer_min_size=shuffle_buffer_min_size,
            shuffle_seed=shuffle_seed,
            ensure_copy=ensure_copy,
        )

        # Step 4: Use a threadpool for formatting and collation.
        batch_iter = _format_in_threadpool(
            batch_iter,
            stats=stats,
            batch_format=batch_format,
            collate_fn=collate_fn,
            num_threadpool_workers=prefetch_batches,
        )

        # Step 5: Finalize each batch.
        if finalize_fn is not None:
            batch_iter = finalize_batches(
                batch_iter, finalize_fn=finalize_fn, stats=stats
            )

        # Step 6: Restore original order. The threadpool in step 4 may emit
        # batches out of order, so re-sequence them by batch index here.
        batch_iter: Iterator[Batch] = restore_original_order(batch_iter)

        yield from extract_data_from_batch(batch_iter)

    # Run everything in a separate thread to not block the main thread when waiting
    # for streaming results.
    async_batch_iter = make_async_gen(
        ref_bundles, fn=_async_iter_batches, num_workers=1
    )

    while True:
        # Time spent blocked waiting on the pipeline vs. time spent in user
        # code is recorded separately for stats reporting.
        with stats.iter_total_blocked_s.timer() if stats else nullcontext():
            try:
                next_batch = next(async_batch_iter)
            except StopIteration:
                break
        with stats.iter_user_s.timer() if stats else nullcontext():
            yield next_batch
183
+
184
+
185
def _format_in_threadpool(
    batch_iter: Iterator[Batch],
    stats: DatasetStats,
    batch_format: Optional[str],
    collate_fn: Optional[Callable[[DataBatch], Any]],
    num_threadpool_workers: int,
) -> Iterator[Batch]:
    """Executes the batching, formatting, and collation logic in a threadpool.

    Args:
        batch_iter: An iterator over logical batches.
        stats: DatasetStats object to record timing and other statistics.
        batch_format: The format in which to return each batch.
            Specify "default" to use the current block format (promoting
            Arrow to pandas automatically), "pandas" to
            select ``pandas.DataFrame`` or "pyarrow" to select
            ``pyarrow.Table``, or None to use entire blocks
            as batches.
        collate_fn: A function to apply to each data batch before returning it.
        num_threadpool_workers: The number of threads to use in the threadpool.
            If 0 or less, formatting/collation runs inline on the calling
            thread instead of in a threadpool.
    """

    def threadpool_computations_format_collate(
        batch_iter: Iterator[Batch],
    ) -> Iterator[Batch]:
        # Step 4a: Format the batches.
        formatted_batch_iter = format_batches(
            batch_iter, batch_format=batch_format, stats=stats
        )

        # Step 4b: Apply the collate function if applicable.
        if collate_fn is not None:
            formatted_batch_iter = collate(
                formatted_batch_iter, collate_fn=collate_fn, stats=stats
            )
        yield from formatted_batch_iter

    if num_threadpool_workers > 0:
        # Fan the formatting/collation work out over the worker threads.
        collated_iter = make_async_gen(
            base_iterator=batch_iter,
            fn=threadpool_computations_format_collate,
            num_workers=num_threadpool_workers,
        )
    else:
        # No workers requested: run the same computation synchronously.
        collated_iter = threadpool_computations_format_collate(batch_iter)
    return collated_iter
231
+
232
+
233
def prefetch_batches_locally(
    ref_bundles: Iterator[RefBundle],
    prefetcher: BlockPrefetcher,
    num_batches_to_prefetch: int,
    batch_size: Optional[int],
    eager_free: bool = False,
) -> Iterator[ObjectRef[Block]]:
    """Given an iterator of batched RefBundles, returns an iterator over the
    corresponding block references while prefetching `num_batches_to_prefetch`
    batches in advance.

    Args:
        ref_bundles: An iterator over batched RefBundles.
        prefetcher: The prefetcher to use.
        num_batches_to_prefetch: The number of batches to prefetch ahead of the
            current batch during the scan.
        batch_size: User specified batch size, or None to let the system pick.
        eager_free: Whether to eagerly free the object reference from the object
            store after it is yielded.
    """

    # Window of (block_ref, metadata) pairs currently being prefetched.
    sliding_window = collections.deque()
    # Total number of rows covered by the current window.
    current_window_size = 0

    if num_batches_to_prefetch <= 0:
        # Prefetching disabled: pass block refs straight through.
        for ref_bundle in ref_bundles:
            for block_ref in ref_bundle.block_refs:
                yield block_ref
        return

    if batch_size is not None:
        # Window is sized in rows when the batch size is known...
        num_rows_to_prefetch = num_batches_to_prefetch * batch_size
    else:
        # ...otherwise it is sized in number of blocks.
        num_rows_to_prefetch = None

    # Create and fetch the initial window.
    # Stop adding if the number of rows in this window is greater than requested
    # batch size, or if the batch size is None and the number of blocks in this
    # window is greater than requested batches to prefetch.
    while (batch_size is not None and current_window_size < num_rows_to_prefetch) or (
        batch_size is None and len(sliding_window) < num_batches_to_prefetch
    ):
        try:
            next_ref_bundle = next(ref_bundles)
            sliding_window.extend(next_ref_bundle.blocks)
            current_window_size += next_ref_bundle.num_rows()
        except StopIteration:
            break

    prefetcher.prefetch_blocks([block_ref for block_ref, _ in list(sliding_window)])

    while sliding_window:
        block_ref, metadata = sliding_window.popleft()
        current_window_size -= metadata.num_rows
        # Refill the window (and re-trigger prefetching) before yielding, so
        # fetching of upcoming blocks overlaps with consumption of this one.
        if batch_size is None or current_window_size < num_rows_to_prefetch:
            try:
                next_ref_bundle = next(ref_bundles)
                for block_ref_and_md in next_ref_bundle.blocks:
                    sliding_window.append(block_ref_and_md)
                    current_window_size += block_ref_and_md[1].num_rows
                prefetcher.prefetch_blocks(
                    [block_ref for block_ref, _ in list(sliding_window)]
                )
            except StopIteration:
                # Input exhausted; keep draining the remaining window.
                pass
        yield block_ref
        # Record (and optionally eagerly free) the block after it is consumed.
        trace_deallocation(block_ref, loc="iter_batches", free=eager_free)
    prefetcher.stop()
300
+
301
+
302
def restore_original_order(batch_iter: Iterator[Batch]) -> Iterator[Batch]:
    """Re-sequence `batch_iter` into ascending `batch_idx` order.

    Batches arriving out of order are parked in a dict keyed by their index and
    released as soon as the next expected index becomes available. All indexes
    are expected to be unique, and no index from 0 upward may be missing.
    """
    pending: Dict[int, Batch] = {}
    expected_idx = 0

    def _drain_ready() -> Iterator[Batch]:
        # Emit consecutively-indexed batches starting at expected_idx.
        nonlocal expected_idx
        while expected_idx in pending:
            yield pending.pop(expected_idx)
            expected_idx += 1

    for incoming in batch_iter:
        assert incoming.batch_idx not in pending
        pending[incoming.batch_idx] = incoming
        yield from _drain_ready()

    # Flush anything still buffered once the input is exhausted.
    yield from _drain_ready()
.venv/lib/python3.11/site-packages/ray/data/_internal/block_batching/util.py ADDED
@@ -0,0 +1,293 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import threading
3
+ from contextlib import nullcontext
4
+ from typing import Any, Callable, Iterator, List, Optional, Tuple
5
+
6
+ import ray
7
+ from ray.actor import ActorHandle
8
+ from ray.data._internal.batcher import Batcher, ShufflingBatcher
9
+ from ray.data._internal.block_batching.interfaces import (
10
+ Batch,
11
+ BlockPrefetcher,
12
+ CollatedBatch,
13
+ )
14
+ from ray.data._internal.stats import DatasetStats
15
+ from ray.data.block import Block, BlockAccessor, DataBatch
16
+ from ray.types import ObjectRef
17
+ from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
def _calculate_ref_hits(refs: List[ObjectRef[Any]]) -> Tuple[int, int, int]:
    """Classify object refs by locality relative to the current node.

    Returns a ``(hits, misses, unknowns)`` tuple: refs already on this node,
    refs that must be fetched from another node, and refs with no known
    location. When `DataContext.get_current().enable_get_object_locations_for_metrics`
    is False, location lookups are disabled and ``(-1, -1, -1)`` is returned.
    """
    local_node_id = ray.get_runtime_context().get_node_id()

    data_ctx = ray.data.context.DataContext.get_current()
    if not data_ctx.enable_get_object_locations_for_metrics:
        # Location queries are disabled; report sentinel values.
        return -1, -1, -1

    locations = ray.experimental.get_object_locations(refs)
    node_lists: List[List[str]] = [entry["node_ids"] for entry in locations.values()]
    hit_count = sum(local_node_id in ids for ids in node_lists)
    unknown_count = sum(1 for ids in node_lists if not ids)
    miss_count = len(node_lists) - hit_count - unknown_count
    return hit_count, miss_count, unknown_count
39
+
40
+
41
def resolve_block_refs(
    block_ref_iter: Iterator[ObjectRef[Block]],
    stats: Optional[DatasetStats] = None,
) -> Iterator[Block]:
    """Resolve each block reference into a concrete block via ``ray.get``.

    Args:
        block_ref_iter: An iterator over block object references.
        stats: An optional stats object to record block hits and misses.
    """
    num_local = 0
    num_remote = 0
    num_unknown = 0

    for ref in block_ref_iter:
        ref_hits, ref_misses, ref_unknowns = _calculate_ref_hits([ref])
        num_local += ref_hits
        num_remote += ref_misses
        num_unknown += ref_unknowns

        # TODO(amogkam): Optimized further by batching multiple references in a
        # single `ray.get()` call.
        timer_ctx = stats.iter_get_s.timer() if stats else nullcontext()
        with timer_ctx:
            resolved_block = ray.get(ref)
        yield resolved_block

    # Publish the locality counters once the input is exhausted.
    if stats:
        stats.iter_blocks_local = num_local
        stats.iter_blocks_remote = num_remote
        stats.iter_unknown_location = num_unknown
71
+
72
+
73
def blocks_to_batches(
    block_iter: Iterator[Block],
    stats: Optional[DatasetStats] = None,
    batch_size: Optional[int] = None,
    drop_last: bool = False,
    shuffle_buffer_min_size: Optional[int] = None,
    shuffle_seed: Optional[int] = None,
    ensure_copy: bool = False,
) -> Iterator[Batch]:
    """Given an iterator over blocks, returns an iterator over blocks
    of the appropriate batch size.

    If the shuffling configurations are specified, then the
    output blocks contain shuffled data.

    Args:
        block_iter: An iterator over blocks.
        stats: Dataset stats object used to store block batching time.
        batch_size: Record batch size, or None to let the system pick.
        drop_last: Whether to drop the last batch if it's incomplete.
        shuffle_buffer_min_size: If non-None, the data will be randomly shuffled
            using a local in-memory shuffle buffer, and this value will serve as
            the minimum number of rows that must be in the local in-memory
            shuffle buffer in order to yield a batch.
        shuffle_seed: The seed to use for the local random shuffle.
        ensure_copy: Whether batches are always copied from the underlying base
            blocks (not zero-copy views).

    Returns:
        An iterator over blocks of the given size that are potentially shuffled.
    """
    if shuffle_buffer_min_size is not None:
        batcher = ShufflingBatcher(
            batch_size=batch_size,
            shuffle_buffer_min_size=shuffle_buffer_min_size,
            shuffle_seed=shuffle_seed,
        )
    else:
        batcher = Batcher(batch_size=batch_size, ensure_copy=ensure_copy)

    def get_iter_next_batch_s_timer():
        return stats.iter_next_batch_s.timer() if stats else nullcontext()

    # Monotonically increasing index assigned to each emitted Batch; downstream
    # code uses it to restore ordering after parallel processing.
    global_counter = 0

    for block in block_iter:
        batcher.add(block)
        while batcher.has_batch():
            with get_iter_next_batch_s_timer():
                batch = batcher.next_batch()
            yield Batch(global_counter, batch)
            global_counter += 1

    # Signal to the batcher that there are no more blocks to add.
    batcher.done_adding()

    # Get any leftover batches in ShufflingBatcher.
    while batcher.has_batch():
        with get_iter_next_batch_s_timer():
            batch = batcher.next_batch()
        yield Batch(global_counter, batch)
        global_counter += 1

    # Get any remaining data (a final, possibly incomplete batch).
    if not drop_last and batcher.has_any():
        with get_iter_next_batch_s_timer():
            batch = batcher.next_batch()
        yield Batch(global_counter, batch)
        global_counter += 1
142
+
143
+
144
def format_batches(
    block_iter: Iterator[Batch],
    batch_format: Optional[str],
    stats: Optional[DatasetStats] = None,
) -> Iterator[Batch]:
    """Convert each incoming batch to the requested batch format.

    Args:
        block_iter: An iterator over blocks.
        batch_format: The batch format to use.
        stats: An optional stats object to record formatting times.

    Returns:
        An iterator over batches carrying the formatted data, with the original
        batch index preserved.
    """
    for pending in block_iter:
        timer = stats.iter_format_batch_s.timer() if stats else nullcontext()
        with timer:
            accessor = BlockAccessor.for_block(pending.data)
            converted = accessor.to_batch_format(batch_format)
        yield Batch(pending.batch_idx, converted)
165
+
166
+
167
def collate(
    batch_iter: Iterator[Batch],
    collate_fn: Optional[Callable[[DataBatch], Any]],
    stats: Optional[DatasetStats] = None,
) -> Iterator[CollatedBatch]:
    """Apply ``collate_fn`` to the data of every batch in ``batch_iter``.

    Args:
        batch_iter: An iterator over formatted batches.
        collate_fn: A function to apply to each batch.
        stats: An optional stats object to record collation times.
    """
    for formatted in batch_iter:
        timer = stats.iter_collate_batch_s.timer() if stats else nullcontext()
        with timer:
            collated = collate_fn(formatted.data)
        yield CollatedBatch(formatted.batch_idx, collated)
184
+
185
+
186
def finalize_batches(
    batch_iter: Iterator[CollatedBatch],
    finalize_fn: Callable[[Any], Any],
    stats: Optional[DatasetStats] = None,
) -> Iterator[CollatedBatch]:
    """Apply ``finalize_fn`` to the data of every collated batch.

    This is the same as `collate` except the input batches can be of type Any.

    Args:
        batch_iter: An iterator over processed batches.
        finalize_fn: A function to apply to each batch.
        stats: An optional stats object to record finalization times.

    Returns:
        An iterator over finalized batches, with the original batch index
        preserved.
    """
    for collated in batch_iter:
        timer = stats.iter_finalize_batch_s.timer() if stats else nullcontext()
        with timer:
            finalized = finalize_fn(collated.data)
        yield CollatedBatch(collated.batch_idx, finalized)
208
+
209
+
210
def extract_data_from_batch(batch_iter: Iterator[Batch]) -> Iterator[Any]:
    """Strip the Batch wrapper, yielding only each batch's data payload."""
    yield from (wrapped.data for wrapped in batch_iter)
213
+
214
+
215
# Ray namespace under which the shared per-node prefetcher actor is registered.
PREFETCHER_ACTOR_NAMESPACE = "ray.dataset"
216
+
217
+
218
class WaitBlockPrefetcher(BlockPrefetcher):
    """Block prefetcher using ray.wait.

    A background daemon thread repeatedly takes the latest set of block refs
    handed to `prefetch_blocks` and calls ``ray.wait(..., fetch_local=True)``
    on them to pull the blocks to the local node.
    """

    def __init__(self):
        # Pending block refs to wait on; replaced wholesale by prefetch_blocks.
        self._blocks = []
        # Once True, the worker thread exits after draining pending blocks.
        self._stopped = False
        # Guards _blocks/_stopped and wakes the worker thread.
        self._condition = threading.Condition()
        self._thread = threading.Thread(
            target=self._run,
            name="Prefetcher",
            daemon=True,
        )
        self._thread.start()

    def _run(self):
        """Worker loop: wait for new refs, then trigger a local fetch."""
        while True:
            try:
                blocks_to_wait = []
                with self._condition:
                    if len(self._blocks) > 0:
                        # Take ownership of the pending refs and clear the slot.
                        blocks_to_wait, self._blocks = self._blocks[:], []
                    else:
                        if self._stopped:
                            return
                        blocks_to_wait = []
                        # Sleep until prefetch_blocks() or stop() notifies.
                        self._condition.wait()
                # Wait outside the lock so producers are not blocked.
                if len(blocks_to_wait) > 0:
                    ray.wait(blocks_to_wait, num_returns=1, fetch_local=True)
            except Exception:
                # Prefetching is best-effort; log and keep the thread alive.
                logger.exception("Error in prefetcher thread.")

    def prefetch_blocks(self, blocks: List[ObjectRef[Block]]):
        """Replace the pending prefetch set with `blocks` and wake the worker.

        Raises:
            RuntimeError: If the prefetcher has already been stopped.
        """
        with self._condition:
            if self._stopped:
                raise RuntimeError("Prefetcher is stopped.")
            # NOTE: this overwrites (does not append to) any refs the worker
            # has not yet picked up.
            self._blocks = blocks
            self._condition.notify()

    def stop(self):
        """Signal the worker thread to exit. Idempotent."""
        with self._condition:
            if self._stopped:
                return
            self._stopped = True
            self._condition.notify()

    def __del__(self):
        # Best-effort cleanup if the prefetcher is garbage collected.
        self.stop()
265
+
266
+
267
class ActorBlockPrefetcher(BlockPrefetcher):
    """Block prefetcher using a local actor.

    Delegates prefetching to a named, node-pinned helper actor that is shared
    by all prefetchers on the same node.
    """

    def __init__(self):
        self.prefetch_actor = self._get_or_create_actor_prefetcher()

    @staticmethod
    def _get_or_create_actor_prefetcher() -> "ActorHandle":
        """Return the per-node prefetcher actor, creating it if needed.

        The actor name embeds the node id and `get_if_exists=True` is used, so
        repeated calls on the same node reuse a single actor. The hard
        (soft=False) node affinity pins it to this node.
        """
        node_id = ray.get_runtime_context().get_node_id()
        actor_name = f"dataset-block-prefetcher-{node_id}"
        return _BlockPretcher.options(
            scheduling_strategy=NodeAffinitySchedulingStrategy(node_id, soft=False),
            name=actor_name,
            namespace=PREFETCHER_ACTOR_NAMESPACE,
            get_if_exists=True,
        ).remote()

    def prefetch_blocks(self, blocks: List[ObjectRef[Block]]):
        # Fire-and-forget: the remote call's return value is intentionally
        # ignored; passing the refs is what triggers the fetch.
        self.prefetch_actor.prefetch.remote(*blocks)
286
+
287
+
288
@ray.remote(num_cpus=0)
class _BlockPretcher:
    """Helper actor that prefetches blocks asynchronously.

    NOTE(review): the class name is misspelled ("Pretcher" vs. "Prefetcher")
    but is kept as-is because ActorBlockPrefetcher references it by this name.
    """

    def prefetch(self, *blocks) -> None:
        # Intentionally empty: receiving the block refs as task arguments
        # presumably causes Ray to fetch them to this actor's node — confirm
        # against Ray's object-argument resolution semantics.
        pass
.venv/lib/python3.11/site-packages/ray/data/_internal/execution/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/ray/data/_internal/execution/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (201 Bytes). View file
 
.venv/lib/python3.11/site-packages/ray/data/_internal/execution/autoscaler/__init__.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .autoscaler import Autoscaler
2
+ from .autoscaling_actor_pool import AutoscalingActorPool
3
+ from .default_autoscaler import DefaultAutoscaler
4
+
5
+
6
def create_autoscaler(topology, resource_manager, execution_id):
    """Build the default `Autoscaler` implementation for an execution."""
    return DefaultAutoscaler(
        topology,
        resource_manager,
        execution_id,
    )
8
+
9
+
10
+ __all__ = [
11
+ "Autoscaler",
12
+ "DefaultAutoscaler",
13
+ "create_autoscaler",
14
+ "AutoscalingActorPool",
15
+ ]
.venv/lib/python3.11/site-packages/ray/data/_internal/execution/autoscaler/__pycache__/autoscaling_actor_pool.cpython-311.pyc ADDED
Binary file (4.8 kB). View file
 
.venv/lib/python3.11/site-packages/ray/data/_internal/execution/autoscaler/autoscaler.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from abc import ABC, abstractmethod
2
+ from typing import TYPE_CHECKING
3
+
4
+ from ray.data._internal.execution.interfaces.execution_options import ExecutionResources
5
+ from ray.util.annotations import DeveloperAPI
6
+
7
+ if TYPE_CHECKING:
8
+ from ray.data._internal.execution.resource_manager import ResourceManager
9
+ from ray.data._internal.execution.streaming_executor_state import Topology
10
+
11
+
12
@DeveloperAPI
class Autoscaler(ABC):
    """Abstract interface for Ray Data autoscaler."""

    def __init__(
        self,
        topology: "Topology",
        resource_manager: "ResourceManager",
        execution_id: str,
    ):
        # Dataflow topology of the execution being autoscaled.
        self._topology = topology
        # Tracks resource usage and limits across operators.
        self._resource_manager = resource_manager
        # Unique id of the owning executor run.
        self._execution_id = execution_id

    @abstractmethod
    def try_trigger_scaling(self):
        """Attempt one round of autoscaling.

        Invoked by the StreamingExecutor each time it makes a scheduling
        decision; implementations should handle scaling of both the cluster
        and any `AutoscalingActorPool`s here.
        """
        ...

    @abstractmethod
    def on_executor_shutdown(self):
        """Callback invoked when the StreamingExecutor is shutting down."""
        ...

    @abstractmethod
    def get_total_resources(self) -> ExecutionResources:
        """Return the total resources available to this data execution."""
        ...
.venv/lib/python3.11/site-packages/ray/data/_internal/execution/autoscaler/autoscaling_actor_pool.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from abc import ABC, abstractmethod
2
+
3
+ from ray.data._internal.execution.interfaces.execution_options import ExecutionResources
4
+ from ray.util.annotations import DeveloperAPI
5
+
6
+
7
@DeveloperAPI
class AutoscalingActorPool(ABC):
    """Abstract interface of an autoscaling actor pool.

    A `PhysicalOperator` can manage one or more `AutoscalingActorPool`s.
    `Autoscaler` is responsible for deciding autoscaling of these actor
    pools.
    """

    @abstractmethod
    def min_size(self) -> int:
        """Min size of the actor pool."""
        ...

    @abstractmethod
    def max_size(self) -> int:
        """Max size of the actor pool."""
        ...

    @abstractmethod
    def current_size(self) -> int:
        """Current size of the actor pool."""
        ...

    @abstractmethod
    def num_running_actors(self) -> int:
        """Number of running actors."""
        ...

    @abstractmethod
    def num_active_actors(self) -> int:
        """Number of actors with at least one active task."""
        ...

    @abstractmethod
    def num_pending_actors(self) -> int:
        """Number of actors pending creation."""
        ...

    @abstractmethod
    def max_tasks_in_flight_per_actor(self) -> int:
        """Max number of in-flight tasks per actor."""
        ...

    @abstractmethod
    def current_in_flight_tasks(self) -> int:
        """Number of current in-flight tasks."""
        ...

    def num_total_task_slots(self) -> int:
        """Total number of task slots across all current actors."""
        return self.max_tasks_in_flight_per_actor() * self.current_size()

    def num_free_task_slots(self) -> int:
        """Number of free slots to run tasks.

        Delegates to `num_total_task_slots()` rather than duplicating its
        formula, so the two methods cannot drift apart.
        """
        return self.num_total_task_slots() - self.current_in_flight_tasks()

    @abstractmethod
    def scale_up(self, num_actors: int) -> int:
        """Request the actor pool to scale up by the given number of actors.

        The number of actually added actors may be less than the requested
        number.

        Returns:
            The number of actors actually added.
        """
        ...

    @abstractmethod
    def scale_down(self, num_actors: int) -> int:
        """Request actor pool to scale down by the given number of actors.

        The number of actually removed actors may be less than the requested
        number.

        Returns:
            The number of actors actually removed.
        """
        ...

    @abstractmethod
    def per_actor_resource_usage(self) -> ExecutionResources:
        """Per actor resource usage."""
        ...
.venv/lib/python3.11/site-packages/ray/data/_internal/execution/autoscaler/default_autoscaler.py ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import time
3
+ from typing import TYPE_CHECKING, Dict
4
+
5
+ import ray
6
+ from .autoscaler import Autoscaler
7
+ from .autoscaling_actor_pool import AutoscalingActorPool
8
+ from ray.data._internal.execution.autoscaling_requester import (
9
+ get_or_create_autoscaling_requester_actor,
10
+ )
11
+ from ray.data._internal.execution.interfaces.execution_options import ExecutionResources
12
+
13
+ if TYPE_CHECKING:
14
+ from ray.data._internal.execution.interfaces import PhysicalOperator
15
+ from ray.data._internal.execution.resource_manager import ResourceManager
16
+ from ray.data._internal.execution.streaming_executor_state import OpState, Topology
17
+
18
+
19
class DefaultAutoscaler(Autoscaler):
    """Default autoscaler: resizes actor pools based on their utilization and
    asks Ray's cluster autoscaler for more resources when execution stalls."""

    # Default threshold of actor pool utilization to trigger scaling up.
    DEFAULT_ACTOR_POOL_SCALING_UP_THRESHOLD: float = 0.8
    # Default threshold of actor pool utilization to trigger scaling down.
    DEFAULT_ACTOR_POOL_SCALING_DOWN_THRESHOLD: float = 0.5

    # Min number of seconds between two autoscaling requests.
    MIN_GAP_BETWEEN_AUTOSCALING_REQUESTS = 20

    def __init__(
        self,
        topology: "Topology",
        resource_manager: "ResourceManager",
        execution_id: str,
        actor_pool_scaling_up_threshold: float = DEFAULT_ACTOR_POOL_SCALING_UP_THRESHOLD,  # noqa: E501
        actor_pool_scaling_down_threshold: float = DEFAULT_ACTOR_POOL_SCALING_DOWN_THRESHOLD,  # noqa: E501
    ):
        self._actor_pool_scaling_up_threshold = actor_pool_scaling_up_threshold
        self._actor_pool_scaling_down_threshold = actor_pool_scaling_down_threshold
        # Last time when a request was sent to Ray's autoscaler.
        self._last_request_time = 0
        super().__init__(topology, resource_manager, execution_id)

    def try_trigger_scaling(self):
        """Run one autoscaling round: cluster first, then actor pools."""
        self._try_scale_up_cluster()
        self._try_scale_up_or_down_actor_pool()

    def _calculate_actor_pool_util(self, actor_pool: AutoscalingActorPool):
        """Utilization = active actors / pool size (0 for an empty pool)."""
        pool_size = actor_pool.current_size()
        if pool_size == 0:
            return 0
        return actor_pool.num_active_actors() / pool_size

    def _actor_pool_should_scale_up(
        self,
        actor_pool: AutoscalingActorPool,
        op: "PhysicalOperator",
        op_state: "OpState",
    ):
        # A finished op, or one that has drained all of its remaining input,
        # never needs more actors.
        if op.completed() or (op._inputs_complete and op.internal_queue_size() == 0):
            return False
        # Always grow back up to the configured minimum size.
        if actor_pool.current_size() < actor_pool.min_size():
            return True
        # Never grow beyond the configured maximum size.
        if actor_pool.current_size() >= actor_pool.max_size():
            return False
        # Do not grow when the op is out of resource budget.
        if not op_state._scheduling_status.under_resource_limits:
            return False
        # Do not grow when existing free slots already cover the queued work.
        if op_state.num_queued() <= actor_pool.num_free_task_slots():
            return False
        # Otherwise, grow only if the pool is busy enough.
        return (
            self._calculate_actor_pool_util(actor_pool)
            > self._actor_pool_scaling_up_threshold
        )

    def _actor_pool_should_scale_down(
        self,
        actor_pool: AutoscalingActorPool,
        op: "PhysicalOperator",
    ):
        # Shrink if the op is finished or has drained all remaining input.
        if op.completed() or (op._inputs_complete and op.internal_queue_size() == 0):
            return True
        # Shrink whenever the pool exceeds its configured maximum.
        if actor_pool.current_size() > actor_pool.max_size():
            return True
        # Never shrink below the configured minimum.
        if actor_pool.current_size() <= actor_pool.min_size():
            return False
        # Otherwise, shrink only if the pool is idle enough.
        return (
            self._calculate_actor_pool_util(actor_pool)
            < self._actor_pool_scaling_down_threshold
        )

    def _try_scale_up_or_down_actor_pool(self):
        for op, state in self._topology.items():
            for actor_pool in op.get_autoscaling_actor_pools():
                # Apply single-actor adjustments repeatedly until the pool
                # settles: either no clear direction, or the pool refuses
                # to resize further.
                while True:
                    wants_up = self._actor_pool_should_scale_up(
                        actor_pool,
                        op,
                        state,
                    )
                    wants_down = self._actor_pool_should_scale_down(actor_pool, op)
                    if wants_up and not wants_down:
                        if actor_pool.scale_up(1) == 0:
                            break
                    elif wants_down and not wants_up:
                        if actor_pool.scale_down(1) == 0:
                            break
                    else:
                        break

    def _try_scale_up_cluster(self):
        """Try to scale up the cluster to accomodate the provided in-progress workload.

        This makes a resource request to Ray's autoscaler consisting of the current,
        aggregate usage of all operators in the DAG + the incremental usage of all
        operators that are ready for dispatch (i.e. that have inputs queued). If the
        autoscaler were to grant this resource request, it would allow us to dispatch
        one task for every ready operator.

        Note that this resource request does not take the global resource limits or the
        liveness policy into account; it only tries to make the existing resource usage
        + one more task per ready operator feasible in the cluster.
        """
        # Limit the frequency of autoscaling requests.
        now = time.time()
        if now - self._last_request_time < self.MIN_GAP_BETWEEN_AUTOSCALING_REQUESTS:
            return

        # Only ask for more cluster resources when execution is stalled:
        # no op can run, yet input queues still hold data.
        states = [op_state for _, op_state in self._topology.items()]
        nothing_runnable = all(
            op_state._scheduling_status.runnable is False for op_state in states
        )
        inputs_pending = any(op_state.num_queued() > 0 for op_state in states)
        if not (nothing_runnable and inputs_pending):
            return

        self._last_request_time = now

        # Build one bundle per active task, plus one extra bundle for each op
        # with queued input (so it could dispatch one more task).
        resource_request = []

        def to_bundle(resource: ExecutionResources) -> Dict:
            req = {}
            if resource.cpu:
                req["CPU"] = math.ceil(resource.cpu)
            if resource.gpu:
                req["GPU"] = math.ceil(resource.gpu)
            return req

        for op, state in self._topology.items():
            task_bundle = to_bundle(op.incremental_resource_usage())
            resource_request.extend([task_bundle] * op.num_active_tasks())
            # Only include incremental resource usage for ops that are ready
            # for dispatch.
            if state.num_queued() > 0:
                # TODO(Clark): Scale up more aggressively by adding incremental resource
                # usage for more than one bundle in the queue for this op?
                resource_request.append(task_bundle)

        self._send_resource_request(resource_request)

    def _send_resource_request(self, resource_request):
        # Forward the aggregated bundles to the autoscaling requester actor.
        actor = get_or_create_autoscaling_requester_actor()
        actor.request_resources.remote(resource_request, self._execution_id)

    def on_executor_shutdown(self):
        # Make request for zero resources to autoscaler for this execution.
        actor = get_or_create_autoscaling_requester_actor()
        actor.request_resources.remote({}, self._execution_id)

    def get_total_resources(self) -> ExecutionResources:
        return ExecutionResources.from_resource_dict(ray.cluster_resources())
.venv/lib/python3.11/site-packages/ray/data/_internal/execution/autoscaling_requester.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import threading
3
+ import time
4
+ from typing import Dict, List
5
+
6
+ import ray
7
+ from ray.data.context import DataContext
8
+ from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy
9
+
10
+ # Resource requests are considered stale after this number of seconds, and
11
+ # will be purged.
12
+ RESOURCE_REQUEST_TIMEOUT = 60
13
+ PURGE_INTERVAL = RESOURCE_REQUEST_TIMEOUT * 2
14
+
15
+ # When the autoscaling is driven by memory pressure and there are abundant
16
+ # CPUs to support incremental CPUs needed to launch more tasks, we'll translate
17
+ # memory pressure into an artificial request of CPUs. The amount of CPUs we'll
18
+ # request is ARTIFICIAL_CPU_SCALING_FACTOR * ray.cluster_resources()["CPU"].
19
+ ARTIFICIAL_CPU_SCALING_FACTOR = 1.2
20
+
21
+
22
@ray.remote(num_cpus=0, max_restarts=-1, max_task_retries=-1)
class AutoscalingRequester:
    """Actor to make resource requests to autoscaler for the datasets.

    The resource requests are set to timeout after RESOURCE_REQUEST_TIMEOUT seconds.
    For those live requests, we keep track of the last request made for each execution,
    which overrides all previous requests it made; then sum the requested amounts
    across all executions as the final request to the autoscaler.
    """

    def __init__(self):
        # execution_id -> (List[Dict], expiration timestamp)
        self._resource_requests = {}
        # TTL for requests.
        self._timeout = RESOURCE_REQUEST_TIMEOUT

        self._self_handle = ray.get_runtime_context().current_actor

        # Background daemon thread that periodically re-submits the purge as
        # an actor task to ourselves, so all state mutation stays
        # single-threaded inside actor tasks.
        def purge_thread_run():
            while True:
                time.sleep(PURGE_INTERVAL)
                ray.get(self._self_handle.purge_expired_requests.remote())

        self._purge_thread = threading.Thread(target=purge_thread_run, daemon=True)
        self._purge_thread.start()

    def purge_expired_requests(self):
        """Drop stale requests, then refresh the aggregate autoscaler request."""
        self._purge()
        ray.autoscaler.sdk.request_resources(bundles=self._aggregate_requests())

    def request_resources(self, req: List[Dict], execution_id: str):
        """Record `req` for `execution_id` and forward the aggregate request."""
        # Purge expired requests before making request to autoscaler.
        self._purge()
        # The newest request from an execution replaces any earlier one; the
        # entry carries its own expiration timestamp.
        self._resource_requests[execution_id] = (
            req,
            time.time() + self._timeout,
        )
        # Send the sum over all executions to Ray's autoscaler.
        ray.autoscaler.sdk.request_resources(bundles=self._aggregate_requests())

    def _purge(self):
        # Drop every request whose TTL has elapsed.
        now = time.time()
        for exec_id, (_, expiry) in list(self._resource_requests.items()):
            if expiry < now:
                self._resource_requests.pop(exec_id)

    def _aggregate_requests(self) -> List[Dict]:
        req = []
        for bundles, _ in self._resource_requests.values():
            req.extend(bundles)

        # Round up CPUs to exceed total cluster CPUs so it can actually upscale.
        # This is to handle the issue where the autoscaling is driven by memory
        # pressure (rather than CPUs) from streaming executor. In such case, simply
        # asking for incremental CPUs (e.g. 1 CPU for each ready operator) may not
        # actually be able to trigger autoscaling if existing CPUs in cluster can
        # already satisfy the incremental CPUs request.
        num_cpus = sum(r["CPU"] for r in req if "CPU" in r)
        if num_cpus > 0:
            total = ray.cluster_resources()
            if "CPU" in total and num_cpus <= total["CPU"]:
                delta = (
                    math.ceil(ARTIFICIAL_CPU_SCALING_FACTOR * total["CPU"]) - num_cpus
                )
                req.extend([{"CPU": 1}] * delta)

        return req

    def _test_set_timeout(self, ttl):
        """Set the timeout. This is for test only"""
        self._timeout = ttl
+
108
+
109
+ # Creating/getting an actor from multiple threads is not safe.
110
+ # https://github.com/ray-project/ray/issues/41324
111
+ _autoscaling_requester_lock: threading.RLock = threading.RLock()
112
+
113
+
114
def get_or_create_autoscaling_requester_actor():
    """Get (or lazily create) the singleton `AutoscalingRequester` actor.

    The actor is detached and pinned to the current node so it fate-shares
    with the driver. Note: for Ray Client, ray.get_runtime_context()
    .get_node_id() should point to the head node.
    """
    # NOTE: `DataContext.get_current()` may lazily initialize the context, so
    # the call is kept for that side effect. Its scheduling strategy used to
    # be read here but was unconditionally overwritten below (dead store), so
    # it is no longer fetched.
    DataContext.get_current()
    scheduling_strategy = NodeAffinitySchedulingStrategy(
        ray.get_runtime_context().get_node_id(),
        soft=False,
    )
    # Creating/getting an actor from multiple threads is not safe, hence the
    # lock (see https://github.com/ray-project/ray/issues/41324).
    with _autoscaling_requester_lock:
        return AutoscalingRequester.options(
            name="AutoscalingRequester",
            namespace="AutoscalingRequester",
            get_if_exists=True,
            lifetime="detached",
            scheduling_strategy=scheduling_strategy,
        ).remote()
.venv/lib/python3.11/site-packages/ray/data/_internal/execution/backpressure_policy/__init__.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import TYPE_CHECKING
2
+
3
+ import ray
4
+ from .backpressure_policy import BackpressurePolicy
5
+ from .concurrency_cap_backpressure_policy import ConcurrencyCapBackpressurePolicy
6
+
7
+ if TYPE_CHECKING:
8
+ from ray.data._internal.execution.streaming_executor_state import Topology
9
+
10
+ # Default enabled backpressure policies and its config key.
11
+ # Use `DataContext.set_config` to config it.
12
+ ENABLED_BACKPRESSURE_POLICIES = [
13
+ ConcurrencyCapBackpressurePolicy,
14
+ ]
15
+ ENABLED_BACKPRESSURE_POLICIES_CONFIG_KEY = "backpressure_policies.enabled"
16
+
17
+
18
def get_backpressure_policies(topology: "Topology"):
    """Instantiate every enabled backpressure policy for the given topology."""
    ctx = ray.data.DataContext.get_current()
    enabled = ctx.get_config(
        ENABLED_BACKPRESSURE_POLICIES_CONFIG_KEY, ENABLED_BACKPRESSURE_POLICIES
    )
    return [policy_cls(topology) for policy_cls in enabled]
25
+
26
+
27
+ __all__ = [
28
+ "BackpressurePolicy",
29
+ "ConcurrencyCapBackpressurePolicy",
30
+ "ENABLED_BACKPRESSURE_POLICIES_CONFIG_KEY",
31
+ "get_backpressure_policies",
32
+ ]
.venv/lib/python3.11/site-packages/ray/data/_internal/execution/backpressure_policy/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (1.5 kB). View file
 
.venv/lib/python3.11/site-packages/ray/data/_internal/execution/backpressure_policy/__pycache__/backpressure_policy.cpython-311.pyc ADDED
Binary file (1.81 kB). View file
 
.venv/lib/python3.11/site-packages/ray/data/_internal/execution/backpressure_policy/__pycache__/concurrency_cap_backpressure_policy.cpython-311.pyc ADDED
Binary file (2.68 kB). View file
 
.venv/lib/python3.11/site-packages/ray/data/_internal/execution/backpressure_policy/backpressure_policy.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from abc import ABC, abstractmethod
2
+ from typing import TYPE_CHECKING
3
+
4
+ if TYPE_CHECKING:
5
+ from ray.data._internal.execution.interfaces.physical_operator import (
6
+ PhysicalOperator,
7
+ )
8
+ from ray.data._internal.execution.streaming_executor_state import Topology
9
+
10
+
11
class BackpressurePolicy(ABC):
    """Interface for back pressure policies."""

    @abstractmethod
    def __init__(self, topology: "Topology"):
        ...

    def can_add_input(self, op: "PhysicalOperator") -> bool:
        """Determine if we can add a new input to the operator.

        Returning False backpressures the operator: it will not be able to
        run new tasks. Used in
        `streaming_executor_state.py::select_operator_to_run()`.

        Note: if multiple backpressure policies are enabled, the operator is
        backpressured when ANY of them returns False.

        Returns: True if we can add a new input to the operator, False otherwise.
        """
        return True
.venv/lib/python3.11/site-packages/ray/data/_internal/execution/backpressure_policy/concurrency_cap_backpressure_policy.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from typing import TYPE_CHECKING
3
+
4
+ from .backpressure_policy import BackpressurePolicy
5
+ from ray.data._internal.execution.operators.task_pool_map_operator import (
6
+ TaskPoolMapOperator,
7
+ )
8
+
9
+ if TYPE_CHECKING:
10
+ from ray.data._internal.execution.interfaces.physical_operator import (
11
+ PhysicalOperator,
12
+ )
13
+ from ray.data._internal.execution.streaming_executor_state import Topology
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
class ConcurrencyCapBackpressurePolicy(BackpressurePolicy):
    """A backpressure policy that caps the concurrency of each operator.

    The policy will limit the number of concurrently running tasks based on its
    concurrency cap parameter.

    NOTE: Only support setting concurrency cap for `TaskPoolMapOperator` for now.
    TODO(chengsu): Consolidate with actor scaling logic of `ActorPoolMapOperator`.
    """

    def __init__(self, topology: "Topology"):
        self._concurrency_caps: dict["PhysicalOperator", float] = {}

        for op, _ in topology.items():
            # Ops without an explicit concurrency setting are uncapped.
            cap = float("inf")
            if isinstance(op, TaskPoolMapOperator) and op.get_concurrency() is not None:
                cap = op.get_concurrency()
            self._concurrency_caps[op] = cap

        logger.debug(
            "ConcurrencyCapBackpressurePolicy initialized with: "
            f"{self._concurrency_caps}"
        )

    def can_add_input(self, op: "PhysicalOperator") -> bool:
        """Allow a new input only while the op is below its concurrency cap."""
        return op.metrics.num_tasks_running < self._concurrency_caps[op]
.venv/lib/python3.11/site-packages/ray/data/_internal/execution/bundle_queue/__init__.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ from .bundle_queue import BundleQueue
2
+ from .fifo_bundle_queue import FIFOBundleQueue
3
+
4
+
5
def create_bundle_queue() -> BundleQueue:
    """Return a new bundle queue with the default (FIFO) ordering."""
    return FIFOBundleQueue()
7
+
8
+
9
+ __all__ = ["BundleQueue", "create_bundle_queue"]
.venv/lib/python3.11/site-packages/ray/data/_internal/execution/bundle_queue/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (572 Bytes). View file
 
.venv/lib/python3.11/site-packages/ray/data/_internal/execution/bundle_queue/__pycache__/bundle_queue.cpython-311.pyc ADDED
Binary file (3.06 kB). View file
 
.venv/lib/python3.11/site-packages/ray/data/_internal/execution/bundle_queue/__pycache__/fifo_bundle_queue.cpython-311.pyc ADDED
Binary file (5.87 kB). View file
 
.venv/lib/python3.11/site-packages/ray/data/_internal/execution/bundle_queue/bundle_queue.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import abc
2
+ from typing import TYPE_CHECKING, Optional
3
+
4
+ if TYPE_CHECKING:
5
+ from ray.data._internal.execution.interfaces import RefBundle
6
+
7
+
8
class BundleQueue(abc.ABC):
    """Abstract queue of `RefBundle`s used by the streaming executor."""

    @abc.abstractmethod
    def __len__(self) -> int:
        """Return the number of bundles in the queue."""
        ...

    @abc.abstractmethod
    def __contains__(self, bundle: "RefBundle") -> bool:
        """Return whether the bundle is in the queue."""
        ...

    @abc.abstractmethod
    def add(self, bundle: "RefBundle") -> None:
        """Add a bundle to the queue."""
        ...

    @abc.abstractmethod
    def pop(self) -> "RefBundle":
        """Remove and return the head of the queue.

        Raises:
            IndexError: If the queue is empty.
        """
        ...

    @abc.abstractmethod
    def peek(self) -> Optional["RefBundle"]:
        """Return the head of the queue without removing it.

        If the queue is empty, return `None`.
        """
        ...

    @abc.abstractmethod
    def remove(self, bundle: "RefBundle"):
        """Remove a bundle from the queue."""
        ...

    @abc.abstractmethod
    def clear(self):
        """Remove all bundles from the queue."""
        ...

    @abc.abstractmethod
    def estimate_size_bytes(self) -> int:
        """Return an estimate of the total size of objects in the queue."""
        ...

    @abc.abstractmethod
    def is_empty(self):
        """Return whether this queue and all of its internal data structures
        are empty.

        This method is used for testing.
        """
        ...
.venv/lib/python3.11/site-packages/ray/data/_internal/execution/execution_callback.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List
2
+
3
+ from ray.data.context import DataContext
4
+
5
+ EXECUTION_CALLBACKS_CONFIG_KEY = "execution_callbacks"
6
+
7
+
8
class ExecutionCallback:
    """Callback interface for execution events.

    Every hook is an optional no-op; subclasses override only the events
    they care about.
    """

    def before_execution_starts(self):
        """Called before the Dataset execution starts."""

    def after_execution_succeeds(self):
        """Called after the Dataset execution succeeds."""

    def after_execution_fails(self, error: Exception):
        """Called after the Dataset execution fails."""
22
+
23
+
24
def get_execution_callbacks(context: DataContext) -> List[ExecutionCallback]:
    """Get all ExecutionCallbacks registered on the DataContext."""
    callbacks = context.get_config(EXECUTION_CALLBACKS_CONFIG_KEY, [])
    return callbacks
27
+
28
+
29
def add_execution_callback(callback: ExecutionCallback, context: DataContext):
    """Register an ExecutionCallback on the DataContext."""
    callbacks = context.get_config(EXECUTION_CALLBACKS_CONFIG_KEY, [])
    callbacks.append(callback)
    # Write the updated list back so later readers observe the addition.
    context.set_config(EXECUTION_CALLBACKS_CONFIG_KEY, callbacks)
34
+
35
+
36
def remove_execution_callback(callback: ExecutionCallback, context: DataContext):
    """Unregister an ExecutionCallback from the DataContext.

    Raises ValueError (from list.remove) if the callback was never added.
    """
    callbacks = context.get_config(EXECUTION_CALLBACKS_CONFIG_KEY, [])
    callbacks.remove(callback)
    context.set_config(EXECUTION_CALLBACKS_CONFIG_KEY, callbacks)
.venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/__init__.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .common import NodeIdStr
2
+ from .execution_options import ExecutionOptions, ExecutionResources
3
+ from .executor import Executor, OutputIterator
4
+ from .physical_operator import PhysicalOperator
5
+ from .ref_bundle import RefBundle
6
+ from .task_context import TaskContext
7
+ from .transform_fn import AllToAllTransformFn
8
+
9
+ __all__ = [
10
+ "AllToAllTransformFn",
11
+ "ExecutionOptions",
12
+ "ExecutionResources",
13
+ "Executor",
14
+ "NodeIdStr",
15
+ "OutputIterator",
16
+ "PhysicalOperator",
17
+ "RefBundle",
18
+ "TaskContext",
19
+ ]
.venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/common.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
# Type alias for a node id string, as returned by
# `ray.get_runtime_context().get_node_id()`.
NodeIdStr = str
.venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/execution_options.py ADDED
@@ -0,0 +1,300 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import Dict, List, Optional, Union
3
+
4
+ from .common import NodeIdStr
5
+ from ray.data._internal.execution.util import memory_string
6
+ from ray.util.annotations import DeveloperAPI
7
+
8
+
9
+ class ExecutionResources:
10
+ """Specifies resources usage or resource limits for execution.
11
+
12
+ By default this class represents resource usage. Use `for_limits` or
13
+ set `default_to_inf` to True to create an object that represents resource limits.
14
+ """
15
+
16
+ def __init__(
17
+ self,
18
+ cpu: Optional[float] = None,
19
+ gpu: Optional[float] = None,
20
+ object_store_memory: Optional[float] = None,
21
+ default_to_inf: bool = False,
22
+ ):
23
+ """Initializes ExecutionResources.
24
+ Args:
25
+ cpu: Amount of logical CPU slots.
26
+ gpu: Amount of logical GPU slots.
27
+ object_store_memory: Amount of object store memory.
28
+ default_to_inf: When the object represents resource usage, this flag
29
+ should be set to False. And missing values will default to 0.
30
+ When the object represents resource limits, this flag should be
31
+ set to True. And missing values will default to infinity.
32
+ """
33
+ self._cpu = cpu
34
+ self._gpu = gpu
35
+ self._object_store_memory = object_store_memory
36
+ self._default_to_inf = default_to_inf
37
+
38
+ @classmethod
39
+ def from_resource_dict(
40
+ cls,
41
+ resource_dict: Dict[str, float],
42
+ default_to_inf: bool = False,
43
+ ):
44
+ """Create an ExecutionResources object from a resource dict."""
45
+ return ExecutionResources(
46
+ cpu=resource_dict.get("CPU", None),
47
+ gpu=resource_dict.get("GPU", None),
48
+ object_store_memory=resource_dict.get("object_store_memory", None),
49
+ default_to_inf=default_to_inf,
50
+ )
51
+
52
+ @classmethod
53
+ def for_limits(
54
+ cls,
55
+ cpu: Optional[float] = None,
56
+ gpu: Optional[float] = None,
57
+ object_store_memory: Optional[float] = None,
58
+ ) -> "ExecutionResources":
59
+ """Create an ExecutionResources object that represents resource limits.
60
+ Args:
61
+ cpu: Amount of logical CPU slots.
62
+ gpu: Amount of logical GPU slots.
63
+ object_store_memory: Amount of object store memory.
64
+ """
65
+ return ExecutionResources(
66
+ cpu=cpu,
67
+ gpu=gpu,
68
+ object_store_memory=object_store_memory,
69
+ default_to_inf=True,
70
+ )
71
+
72
+ @property
73
+ def cpu(self) -> float:
74
+ if self._cpu is not None:
75
+ return self._cpu
76
+ return 0.0 if not self._default_to_inf else float("inf")
77
+
78
+ @cpu.setter
79
+ def cpu(self, value: float):
80
+ self._cpu = value
81
+
82
+ @property
83
+ def gpu(self) -> float:
84
+ if self._gpu is not None:
85
+ return self._gpu
86
+ return 0.0 if not self._default_to_inf else float("inf")
87
+
88
+ @gpu.setter
89
+ def gpu(self, value: float):
90
+ self._gpu = value
91
+
92
+ @property
93
+ def object_store_memory(self) -> float:
94
+ if self._object_store_memory is not None:
95
+ return self._object_store_memory
96
+ return 0.0 if not self._default_to_inf else float("inf")
97
+
98
+ @object_store_memory.setter
99
+ def object_store_memory(self, value: float):
100
+ self._object_store_memory = value
101
+
102
+ def __repr__(self):
103
+ return (
104
+ f"ExecutionResources(cpu={self.cpu:.1f}, gpu={self.gpu:.1f}, "
105
+ f"object_store_memory={self.object_store_memory_str()})"
106
+ )
107
+
108
+ def __eq__(self, other: "ExecutionResources") -> bool:
109
+ return (
110
+ self.cpu == other.cpu
111
+ and self.gpu == other.gpu
112
+ and self.object_store_memory == other.object_store_memory
113
+ )
114
+
115
+ @classmethod
116
+ def zero(cls) -> "ExecutionResources":
117
+ """Returns an ExecutionResources object with zero resources."""
118
+ return ExecutionResources(0.0, 0.0, 0.0)
119
+
120
+ def is_zero(self) -> bool:
121
+ """Returns True if all resources are zero."""
122
+ return self.cpu == 0.0 and self.gpu == 0.0 and self.object_store_memory == 0.0
123
+
124
+ def is_non_negative(self) -> bool:
125
+ """Returns True if all resources are non-negative."""
126
+ return self.cpu >= 0 and self.gpu >= 0 and self.object_store_memory >= 0
127
+
128
+ def object_store_memory_str(self) -> str:
129
+ """Returns a human-readable string for the object store memory field."""
130
+ if self.object_store_memory == float("inf"):
131
+ return "inf"
132
+ return memory_string(self.object_store_memory)
133
+
134
+ def copy(self) -> "ExecutionResources":
135
+ """Returns a copy of this ExecutionResources object."""
136
+ return ExecutionResources(
137
+ self._cpu, self._gpu, self._object_store_memory, self._default_to_inf
138
+ )
139
+
140
+ def add(self, other: "ExecutionResources") -> "ExecutionResources":
141
+ """Adds execution resources.
142
+
143
+ Returns:
144
+ A new ExecutionResource object with summed resources.
145
+ """
146
+ return ExecutionResources(
147
+ self.cpu + other.cpu,
148
+ self.gpu + other.gpu,
149
+ self.object_store_memory + other.object_store_memory,
150
+ )
151
+
152
+ def subtract(self, other: "ExecutionResources") -> "ExecutionResources":
153
+ """Subtracts execution resources.
154
+
155
+ Returns:
156
+ A new ExecutionResource object with subtracted resources.
157
+ """
158
+ return ExecutionResources(
159
+ self.cpu - other.cpu,
160
+ self.gpu - other.gpu,
161
+ self.object_store_memory - other.object_store_memory,
162
+ )
163
+
164
+ def max(self, other: "ExecutionResources") -> "ExecutionResources":
165
+ """Returns the maximum for each resource type."""
166
+ return ExecutionResources(
167
+ cpu=max(self.cpu, other.cpu),
168
+ gpu=max(self.gpu, other.gpu),
169
+ object_store_memory=max(
170
+ self.object_store_memory, other.object_store_memory
171
+ ),
172
+ )
173
+
174
+ def min(self, other: "ExecutionResources") -> "ExecutionResources":
175
+ """Returns the minimum for each resource type."""
176
+ return ExecutionResources(
177
+ cpu=min(self.cpu, other.cpu),
178
+ gpu=min(self.gpu, other.gpu),
179
+ object_store_memory=min(
180
+ self.object_store_memory, other.object_store_memory
181
+ ),
182
+ )
183
+
184
+ def satisfies_limit(self, limit: "ExecutionResources") -> bool:
185
+ """Return if this resource struct meets the specified limits.
186
+
187
+ Note that None for a field means no limit.
188
+ """
189
+ return (
190
+ self.cpu <= limit.cpu
191
+ and self.gpu <= limit.gpu
192
+ and self.object_store_memory <= limit.object_store_memory
193
+ )
194
+
195
+ def scale(self, f: float) -> "ExecutionResources":
196
+ """Return copy with all set values scaled by `f`."""
197
+ if f < 0:
198
+ raise ValueError("Scaling factor must be non-negative.")
199
+ if f == 0:
200
+ # Explicitly handle the zero case, because `0 * inf` is undefined.
201
+ return ExecutionResources.zero()
202
+ return ExecutionResources(
203
+ cpu=self.cpu * f,
204
+ gpu=self.gpu * f,
205
+ object_store_memory=self.object_store_memory * f,
206
+ )
207
+
208
+
209
+ @DeveloperAPI
210
+ class ExecutionOptions:
211
+ """Common options for execution.
212
+
213
+ Some options may not be supported on all executors (e.g., resource limits).
214
+
215
+ Attributes:
216
+ resource_limits: Set a soft limit on the resource usage during execution.
217
+ Autodetected by default.
218
+ exclude_resources: Amount of resources to exclude from Ray Data.
219
+ Set this if you have other workloads running on the same cluster.
220
+ Note,
221
+ - If using Ray Data with Ray Train, training resources will be
222
+ automatically excluded.
223
+ - For each resource type, resource_limits and exclude_resources can
224
+ not be both set.
225
+ locality_with_output: Set this to prefer running tasks on the same node as the
226
+ output node (node driving the execution). It can also be set to a list of
227
+ node ids to spread the outputs across those nodes. Off by default.
228
+ preserve_order: Set this to preserve the ordering between blocks processed by
229
+ operators. Off by default.
230
+ actor_locality_enabled: Whether to enable locality-aware task dispatch to
231
+ actors (off by default). This parameter applies to both stateful map and
232
+ streaming_split operations.
233
+ verbose_progress: Whether to report progress individually per operator. By
234
+ default, only AllToAll operators and global progress is reported. This
235
+ option is useful for performance debugging. On by default.
236
+ """
237
+
238
+ def __init__(
239
+ self,
240
+ resource_limits: Optional[ExecutionResources] = None,
241
+ exclude_resources: Optional[ExecutionResources] = None,
242
+ locality_with_output: Union[bool, List[NodeIdStr]] = False,
243
+ preserve_order: bool = False,
244
+ # TODO(hchen): Re-enable `actor_locality_enabled` by default after fixing
245
+ # https://github.com/ray-project/ray/issues/43466
246
+ actor_locality_enabled: bool = False,
247
+ verbose_progress: Optional[bool] = None,
248
+ ):
249
+ if resource_limits is None:
250
+ resource_limits = ExecutionResources.for_limits()
251
+ self.resource_limits = resource_limits
252
+ if exclude_resources is None:
253
+ exclude_resources = ExecutionResources.zero()
254
+ self.exclude_resources = exclude_resources
255
+ self.locality_with_output = locality_with_output
256
+ self.preserve_order = preserve_order
257
+ self.actor_locality_enabled = actor_locality_enabled
258
+ if verbose_progress is None:
259
+ verbose_progress = bool(
260
+ int(os.environ.get("RAY_DATA_VERBOSE_PROGRESS", "1"))
261
+ )
262
+ self.verbose_progress = verbose_progress
263
+
264
+ def __repr__(self) -> str:
265
+ return (
266
+ f"ExecutionOptions(resource_limits={self.resource_limits}, "
267
+ f"exclude_resources={self.exclude_resources}, "
268
+ f"locality_with_output={self.locality_with_output}, "
269
+ f"preserve_order={self.preserve_order}, "
270
+ f"actor_locality_enabled={self.actor_locality_enabled}, "
271
+ f"verbose_progress={self.verbose_progress})"
272
+ )
273
+
274
+ @property
275
+ def resource_limits(self) -> ExecutionResources:
276
+ return self._resource_limits
277
+
278
+ @resource_limits.setter
279
+ def resource_limits(self, value: ExecutionResources) -> None:
280
+ self._resource_limits = ExecutionResources.for_limits(
281
+ cpu=value._cpu,
282
+ gpu=value._gpu,
283
+ object_store_memory=value._object_store_memory,
284
+ )
285
+
286
+ def is_resource_limits_default(self):
287
+ """Returns True if resource_limits is the default value."""
288
+ return self._resource_limits == ExecutionResources.for_limits()
289
+
290
+ def validate(self) -> None:
291
+ """Validate the options."""
292
+ for attr in ["cpu", "gpu", "object_store_memory"]:
293
+ if (
294
+ getattr(self.resource_limits, attr) != float("inf")
295
+ and getattr(self.exclude_resources, attr, 0) > 0
296
+ ):
297
+ raise ValueError(
298
+ "resource_limits and exclude_resources cannot "
299
+ f" both be set for {attr} resource."
300
+ )
.venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/executor.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Iterable, Iterator, Optional
2
+
3
+ from .execution_options import ExecutionOptions
4
+ from .physical_operator import PhysicalOperator
5
+ from .ref_bundle import RefBundle
6
+ from ray.data._internal.stats import DatasetStats
7
+
8
+
9
+ class OutputIterator(Iterator[RefBundle]):
10
+ """Iterator used to access the output of an Executor execution.
11
+
12
+ This is a blocking iterator. Datasets guarantees that all its iterators are
13
+ thread-safe (i.e., multiple threads can block on them at the same time).
14
+ """
15
+
16
+ def __init__(self, base: Iterable[RefBundle]):
17
+ self._it = iter(base)
18
+
19
+ def get_next(self, output_split_idx: Optional[int] = None) -> RefBundle:
20
+ """Can be used to pull outputs by a specified output index.
21
+
22
+ This is used to support the streaming_split() API, where the output of a
23
+ streaming execution is to be consumed by multiple processes.
24
+
25
+ Args:
26
+ output_split_idx: The output split index to get results for. This arg is
27
+ only allowed for iterators created by `Dataset.streaming_split()`.
28
+
29
+ Raises:
30
+ StopIteration if there are no more outputs to return.
31
+ """
32
+ if output_split_idx is not None:
33
+ raise NotImplementedError()
34
+ return next(self._it)
35
+
36
+ def __next__(self) -> RefBundle:
37
+ return self.get_next()
38
+
39
+
40
+ class Executor:
41
+ """Abstract class for executors, which implement physical operator execution.
42
+
43
+ Subclasses:
44
+ StreamingExecutor
45
+ """
46
+
47
+ def __init__(self, options: ExecutionOptions):
48
+ """Create the executor."""
49
+ options.validate()
50
+ self._options = options
51
+
52
+ def execute(
53
+ self, dag: PhysicalOperator, initial_stats: Optional[DatasetStats] = None
54
+ ) -> OutputIterator:
55
+ """Start execution.
56
+
57
+ Args:
58
+ dag: The operator graph to execute.
59
+ initial_stats: The DatasetStats to prepend to the stats returned by the
60
+ executor. These stats represent actions done to compute inputs.
61
+ """
62
+ raise NotImplementedError
63
+
64
+ def shutdown(self):
65
+ """Shutdown an executor, which may still be running.
66
+
67
+ This should interrupt execution and clean up any used resources.
68
+ """
69
+ pass
70
+
71
+ def get_stats(self) -> DatasetStats:
72
+ """Return stats for the execution so far.
73
+
74
+ This is generally called after `execute` has completed, but may be called
75
+ while iterating over `execute` results for streaming execution.
76
+ """
77
+ raise NotImplementedError
.venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/op_runtime_metrics.py ADDED
@@ -0,0 +1,574 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ from dataclasses import Field, dataclass, field
3
+ from enum import Enum
4
+ from typing import TYPE_CHECKING, Any, Dict, List, Optional
5
+
6
+ import ray
7
+ from ray.data._internal.execution.bundle_queue import create_bundle_queue
8
+ from ray.data._internal.execution.interfaces.ref_bundle import RefBundle
9
+ from ray.data._internal.memory_tracing import trace_allocation
10
+
11
+ if TYPE_CHECKING:
12
+ from ray.data._internal.execution.interfaces.physical_operator import (
13
+ PhysicalOperator,
14
+ )
15
+
16
+
17
+ # A metadata key used to mark a dataclass field as a metric.
18
+ _IS_FIELD_METRIC_KEY = "__is_metric"
19
+ # Metadata keys used to store information about a metric.
20
+ _METRIC_FIELD_DESCRIPTION_KEY = "__metric_description"
21
+ _METRIC_FIELD_METRICS_GROUP_KEY = "__metric_metrics_group"
22
+ _METRIC_FIELD_IS_MAP_ONLY_KEY = "__metric_is_map_only"
23
+
24
+ _METRICS: List["MetricDefinition"] = []
25
+
26
+
27
+ class MetricsGroup(Enum):
28
+ INPUTS = "inputs"
29
+ OUTPUTS = "outputs"
30
+ TASKS = "tasks"
31
+ OBJECT_STORE_MEMORY = "object_store_memory"
32
+ MISC = "misc"
33
+
34
+
35
+ @dataclass(frozen=True)
36
+ class MetricDefinition:
37
+ """Metadata for a metric.
38
+
39
+ Args:
40
+ name: The name of the metric.
41
+ description: A human-readable description of the metric, also used as the chart
42
+ description on the Ray Data dashboard.
43
+ metrics_group: The group of the metric, used to organize metrics into groups in
44
+ 'StatsActor' and on the Ray Data dashboard.
45
+ map_only: Whether the metric is only measured for 'MapOperators'.
46
+ """
47
+
48
+ name: str
49
+ description: str
50
+ metrics_group: str
51
+ # TODO: Let's refactor this parameter so it isn't tightly coupled with a specific
52
+ # operator type (MapOperator).
53
+ map_only: bool = False
54
+
55
+
56
+ def metric_field(
57
+ *,
58
+ description: str,
59
+ metrics_group: str,
60
+ map_only: bool = False,
61
+ **field_kwargs,
62
+ ):
63
+ """A dataclass field that represents a metric."""
64
+ metadata = field_kwargs.get("metadata", {})
65
+
66
+ metadata[_IS_FIELD_METRIC_KEY] = True
67
+
68
+ metadata[_METRIC_FIELD_DESCRIPTION_KEY] = description
69
+ metadata[_METRIC_FIELD_METRICS_GROUP_KEY] = metrics_group
70
+ metadata[_METRIC_FIELD_IS_MAP_ONLY_KEY] = map_only
71
+
72
+ return field(metadata=metadata, **field_kwargs)
73
+
74
+
75
+ def metric_property(
76
+ *,
77
+ description: str,
78
+ metrics_group: str,
79
+ map_only: bool = False,
80
+ ):
81
+ """A property that represents a metric."""
82
+
83
+ def wrap(func):
84
+ metric = MetricDefinition(
85
+ name=func.__name__,
86
+ description=description,
87
+ metrics_group=metrics_group,
88
+ map_only=map_only,
89
+ )
90
+
91
+ _METRICS.append(metric)
92
+
93
+ return property(func)
94
+
95
+ return wrap
96
+
97
+
98
+ @dataclass
99
+ class RunningTaskInfo:
100
+ inputs: RefBundle
101
+ num_outputs: int
102
+ bytes_outputs: int
103
+
104
+
105
+ class OpRuntimesMetricsMeta(type):
106
+ def __init__(cls, name, bases, dict):
107
+ # NOTE: `Field.name` isn't set until the dataclass is created, so we can't
108
+ # create the metrics in `metric_field` directly.
109
+ super().__init__(name, bases, dict)
110
+
111
+ # Iterate over the attributes and methods of 'OpRuntimeMetrics'.
112
+ for name, value in dict.items():
113
+ # If an attribute is a dataclass field and has _IS_FIELD_METRIC_KEY in its
114
+ # metadata, then create a metric from the field metadata and add it to the
115
+ # list of metrics. See also the 'metric_field' function.
116
+ if isinstance(value, Field) and value.metadata.get(_IS_FIELD_METRIC_KEY):
117
+ metric = MetricDefinition(
118
+ name=name,
119
+ description=value.metadata[_METRIC_FIELD_DESCRIPTION_KEY],
120
+ metrics_group=value.metadata[_METRIC_FIELD_METRICS_GROUP_KEY],
121
+ map_only=value.metadata[_METRIC_FIELD_IS_MAP_ONLY_KEY],
122
+ )
123
+ _METRICS.append(metric)
124
+
125
+
126
+ @dataclass
127
+ class OpRuntimeMetrics(metaclass=OpRuntimesMetricsMeta):
128
+ """Runtime metrics for a 'PhysicalOperator'.
129
+
130
+ Metrics are updated dynamically during the execution of the Dataset.
131
+ This class can be used for either observablity or scheduling purposes.
132
+
133
+ DO NOT modify the fields of this class directly. Instead, use the provided
134
+ callback methods.
135
+ """
136
+
137
+ # TODO(hchen): Fields tagged with "map_only" currently only work for MapOperator.
138
+ # We should make them work for all operators by unifying the task execution code.
139
+
140
+ # === Inputs-related metrics ===
141
+ num_inputs_received: int = metric_field(
142
+ default=0,
143
+ description="Number of input blocks received by operator.",
144
+ metrics_group=MetricsGroup.INPUTS,
145
+ )
146
+ bytes_inputs_received: int = metric_field(
147
+ default=0,
148
+ description="Byte size of input blocks received by operator.",
149
+ metrics_group=MetricsGroup.INPUTS,
150
+ )
151
+ num_task_inputs_processed: int = metric_field(
152
+ default=0,
153
+ description=(
154
+ "Number of input blocks that operator's tasks have finished processing."
155
+ ),
156
+ metrics_group=MetricsGroup.INPUTS,
157
+ map_only=True,
158
+ )
159
+ bytes_task_inputs_processed: int = metric_field(
160
+ default=0,
161
+ description=(
162
+ "Byte size of input blocks that operator's tasks have finished processing."
163
+ ),
164
+ metrics_group=MetricsGroup.INPUTS,
165
+ map_only=True,
166
+ )
167
+ bytes_inputs_of_submitted_tasks: int = metric_field(
168
+ default=0,
169
+ description="Byte size of input blocks passed to submitted tasks.",
170
+ metrics_group=MetricsGroup.INPUTS,
171
+ map_only=True,
172
+ )
173
+
174
+ # === Outputs-related metrics ===
175
+ num_task_outputs_generated: int = metric_field(
176
+ default=0,
177
+ description="Number of output blocks generated by tasks.",
178
+ metrics_group=MetricsGroup.OUTPUTS,
179
+ map_only=True,
180
+ )
181
+ bytes_task_outputs_generated: int = metric_field(
182
+ default=0,
183
+ description="Byte size of output blocks generated by tasks.",
184
+ metrics_group=MetricsGroup.OUTPUTS,
185
+ map_only=True,
186
+ )
187
+ rows_task_outputs_generated: int = metric_field(
188
+ default=0,
189
+ description="Number of output rows generated by tasks.",
190
+ metrics_group=MetricsGroup.OUTPUTS,
191
+ map_only=True,
192
+ )
193
+ num_outputs_taken: int = metric_field(
194
+ default=0,
195
+ description=(
196
+ "Number of output blocks that are already taken by downstream operators."
197
+ ),
198
+ metrics_group=MetricsGroup.OUTPUTS,
199
+ )
200
+ bytes_outputs_taken: int = metric_field(
201
+ default=0,
202
+ description=(
203
+ "Byte size of output blocks that are already taken by downstream operators."
204
+ ),
205
+ metrics_group=MetricsGroup.OUTPUTS,
206
+ )
207
+ num_outputs_of_finished_tasks: int = metric_field(
208
+ default=0,
209
+ description="Number of generated output blocks that are from finished tasks.",
210
+ metrics_group=MetricsGroup.OUTPUTS,
211
+ map_only=True,
212
+ )
213
+ bytes_outputs_of_finished_tasks: int = metric_field(
214
+ default=0,
215
+ description=(
216
+ "Byte size of generated output blocks that are from finished tasks."
217
+ ),
218
+ metrics_group=MetricsGroup.OUTPUTS,
219
+ map_only=True,
220
+ )
221
+
222
+ # === Tasks-related metrics ===
223
+ num_tasks_submitted: int = metric_field(
224
+ default=0,
225
+ description="Number of submitted tasks.",
226
+ metrics_group=MetricsGroup.TASKS,
227
+ map_only=True,
228
+ )
229
+ num_tasks_running: int = metric_field(
230
+ default=0,
231
+ description="Number of running tasks.",
232
+ metrics_group=MetricsGroup.TASKS,
233
+ map_only=True,
234
+ )
235
+ num_tasks_have_outputs: int = metric_field(
236
+ default=0,
237
+ description="Number of tasks that already have output.",
238
+ metrics_group=MetricsGroup.TASKS,
239
+ map_only=True,
240
+ )
241
+ num_tasks_finished: int = metric_field(
242
+ default=0,
243
+ description="Number of finished tasks.",
244
+ metrics_group=MetricsGroup.TASKS,
245
+ map_only=True,
246
+ )
247
+ num_tasks_failed: int = metric_field(
248
+ default=0,
249
+ description="Number of failed tasks.",
250
+ metrics_group=MetricsGroup.TASKS,
251
+ map_only=True,
252
+ )
253
+ block_generation_time: float = metric_field(
254
+ default=0,
255
+ description="Time spent generating blocks in tasks.",
256
+ metrics_group=MetricsGroup.TASKS,
257
+ map_only=True,
258
+ )
259
+ task_submission_backpressure_time: float = metric_field(
260
+ default=0,
261
+ description="Time spent in task submission backpressure.",
262
+ metrics_group=MetricsGroup.TASKS,
263
+ )
264
+
265
+ # === Object store memory metrics ===
266
+ obj_store_mem_internal_inqueue_blocks: int = metric_field(
267
+ default=0,
268
+ description="Number of blocks in operator's internal input queue.",
269
+ metrics_group=MetricsGroup.OBJECT_STORE_MEMORY,
270
+ )
271
+ obj_store_mem_internal_outqueue_blocks: int = metric_field(
272
+ default=0,
273
+ description="Number of blocks in the operator's internal output queue.",
274
+ metrics_group=MetricsGroup.OBJECT_STORE_MEMORY,
275
+ )
276
+ obj_store_mem_freed: int = metric_field(
277
+ default=0,
278
+ description="Byte size of freed memory in object store.",
279
+ metrics_group=MetricsGroup.OBJECT_STORE_MEMORY,
280
+ map_only=True,
281
+ )
282
+ obj_store_mem_spilled: int = metric_field(
283
+ default=0,
284
+ description="Byte size of spilled memory in object store.",
285
+ metrics_group=MetricsGroup.OBJECT_STORE_MEMORY,
286
+ map_only=True,
287
+ )
288
+ obj_store_mem_used: int = metric_field(
289
+ default=0,
290
+ description="Byte size of used memory in object store.",
291
+ metrics_group=MetricsGroup.OBJECT_STORE_MEMORY,
292
+ )
293
+
294
+ # === Miscellaneous metrics ===
295
+ # Use "metrics_group: "misc" in the metadata for new metrics in this section.
296
+
297
+ def __init__(self, op: "PhysicalOperator"):
298
+ from ray.data._internal.execution.operators.map_operator import MapOperator
299
+
300
+ self._op = op
301
+ self._is_map = isinstance(op, MapOperator)
302
+ self._running_tasks: Dict[int, RunningTaskInfo] = {}
303
+ self._extra_metrics: Dict[str, Any] = {}
304
+ # Start time of current pause due to task submission backpressure
305
+ self._task_submission_backpressure_start_time = -1
306
+
307
+ self._internal_inqueue = create_bundle_queue()
308
+ self._internal_outqueue = create_bundle_queue()
309
+ self._pending_task_inputs = create_bundle_queue()
310
+
311
+ @property
312
+ def extra_metrics(self) -> Dict[str, Any]:
313
+ """Return a dict of extra metrics."""
314
+ return self._extra_metrics
315
+
316
+ @classmethod
317
+ def get_metrics(self) -> List[MetricDefinition]:
318
+ return list(_METRICS)
319
+
320
+ def as_dict(self):
321
+ """Return a dict representation of the metrics."""
322
+ result = []
323
+ for metric in self.get_metrics():
324
+ if not self._is_map and metric.map_only:
325
+ continue
326
+ value = getattr(self, metric.name)
327
+ result.append((metric.name, value))
328
+
329
+ # TODO: record resource usage in OpRuntimeMetrics,
330
+ # avoid calling self._op.current_processor_usage()
331
+ resource_usage = self._op.current_processor_usage()
332
+ result.extend(
333
+ [
334
+ ("cpu_usage", resource_usage.cpu or 0),
335
+ ("gpu_usage", resource_usage.gpu or 0),
336
+ ]
337
+ )
338
+ result.extend(self._extra_metrics.items())
339
+ return dict(result)
340
+
341
+ @metric_property(
342
+ description="Average number of blocks generated per task.",
343
+ metrics_group=MetricsGroup.OUTPUTS,
344
+ map_only=True,
345
+ )
346
+ def average_num_outputs_per_task(self) -> Optional[float]:
347
+ """Average number of output blocks per task, or None if no task has finished."""
348
+ if self.num_tasks_finished == 0:
349
+ return None
350
+ else:
351
+ return self.num_outputs_of_finished_tasks / self.num_tasks_finished
352
+
353
+ @metric_property(
354
+ description="Average size of task output in bytes.",
355
+ metrics_group=MetricsGroup.OUTPUTS,
356
+ map_only=True,
357
+ )
358
+ def average_bytes_per_output(self) -> Optional[float]:
359
+ """Average size in bytes of output blocks."""
360
+ if self.num_task_outputs_generated == 0:
361
+ return None
362
+ else:
363
+ return self.bytes_task_outputs_generated / self.num_task_outputs_generated
364
+
365
+ @metric_property(
366
+ description="Byte size of input blocks in the operator's internal input queue.",
367
+ metrics_group=MetricsGroup.OBJECT_STORE_MEMORY,
368
+ )
369
+ def obj_store_mem_internal_inqueue(self) -> int:
370
+ return self._internal_inqueue.estimate_size_bytes()
371
+
372
+ @metric_property(
373
+ description=(
374
+ "Byte size of output blocks in the operator's internal output queue."
375
+ ),
376
+ metrics_group=MetricsGroup.OBJECT_STORE_MEMORY,
377
+ )
378
+ def obj_store_mem_internal_outqueue(self) -> int:
379
+ return self._internal_outqueue.estimate_size_bytes()
380
+
381
+ @metric_property(
382
+ description="Byte size of input blocks used by pending tasks.",
383
+ metrics_group=MetricsGroup.OBJECT_STORE_MEMORY,
384
+ map_only=True,
385
+ )
386
+ def obj_store_mem_pending_task_inputs(self) -> int:
387
+ return self._pending_task_inputs.estimate_size_bytes()
388
+
389
+ @property
390
+ def obj_store_mem_pending_task_outputs(self) -> Optional[float]:
391
+ """Estimated size in bytes of output blocks in Ray generator buffers.
392
+
393
+ If an estimate isn't available, this property returns ``None``.
394
+ """
395
+ per_task_output = self.obj_store_mem_max_pending_output_per_task
396
+ if per_task_output is None:
397
+ return None
398
+
399
+ # Ray Data launches multiple tasks per actor, but only one task runs at a
400
+ # time per actor. So, the number of actually running tasks is capped by the
401
+ # number of active actors.
402
+ from ray.data._internal.execution.operators.actor_pool_map_operator import (
403
+ ActorPoolMapOperator,
404
+ )
405
+
406
+ num_tasks_running = self.num_tasks_running
407
+ if isinstance(self._op, ActorPoolMapOperator):
408
+ num_tasks_running = min(
409
+ num_tasks_running, self._op._actor_pool.num_active_actors()
410
+ )
411
+
412
+ return num_tasks_running * per_task_output
413
+
414
+ @property
415
+ def obj_store_mem_max_pending_output_per_task(self) -> Optional[float]:
416
+ """Estimated size in bytes of output blocks in a task's generator buffer."""
417
+ context = self._op.data_context
418
+ if context._max_num_blocks_in_streaming_gen_buffer is None:
419
+ return None
420
+
421
+ bytes_per_output = self.average_bytes_per_output
422
+ if bytes_per_output is None:
423
+ bytes_per_output = context.target_max_block_size
424
+
425
+ num_pending_outputs = context._max_num_blocks_in_streaming_gen_buffer
426
+ if self.average_num_outputs_per_task is not None:
427
+ num_pending_outputs = min(
428
+ num_pending_outputs, self.average_num_outputs_per_task
429
+ )
430
+ return bytes_per_output * num_pending_outputs
431
+
432
+ @metric_property(
433
+ description="Average size of task inputs in bytes.",
434
+ metrics_group=MetricsGroup.INPUTS,
435
+ map_only=True,
436
+ )
437
+ def average_bytes_inputs_per_task(self) -> Optional[float]:
438
+ """Average size in bytes of ref bundles passed to tasks, or ``None`` if no
439
+ tasks have been submitted."""
440
+ if self.num_tasks_submitted == 0:
441
+ return None
442
+ else:
443
+ return self.bytes_inputs_of_submitted_tasks / self.num_tasks_submitted
444
+
445
+ @metric_property(
446
+ description="Average total output size of task in bytes.",
447
+ metrics_group=MetricsGroup.OUTPUTS,
448
+ map_only=True,
449
+ )
450
+ def average_bytes_outputs_per_task(self) -> Optional[float]:
451
+ """Average size in bytes of output blocks per task,
452
+ or None if no task has finished."""
453
+ if self.num_tasks_finished == 0:
454
+ return None
455
+ else:
456
+ return self.bytes_outputs_of_finished_tasks / self.num_tasks_finished
457
+
458
+ def on_input_received(self, input: RefBundle):
459
+ """Callback when the operator receives a new input."""
460
+ self.num_inputs_received += 1
461
+ self.bytes_inputs_received += input.size_bytes()
462
+
463
+ def on_input_queued(self, input: RefBundle):
464
+ """Callback when the operator queues an input."""
465
+ self.obj_store_mem_internal_inqueue_blocks += len(input.blocks)
466
+ self._internal_inqueue.add(input)
467
+
468
+ def on_input_dequeued(self, input: RefBundle):
469
+ """Callback when the operator dequeues an input."""
470
+ self.obj_store_mem_internal_inqueue_blocks -= len(input.blocks)
471
+ input_size = input.size_bytes()
472
+ self._internal_inqueue.remove(input)
473
+ assert self.obj_store_mem_internal_inqueue >= 0, (
474
+ self._op,
475
+ self.obj_store_mem_internal_inqueue,
476
+ input_size,
477
+ )
478
+
479
+ def on_output_queued(self, output: RefBundle):
480
+ """Callback when an output is queued by the operator."""
481
+ self.obj_store_mem_internal_outqueue_blocks += len(output.blocks)
482
+ self._internal_outqueue.add(output)
483
+
484
+ def on_output_dequeued(self, output: RefBundle):
485
+ """Callback when an output is dequeued by the operator."""
486
+ self.obj_store_mem_internal_outqueue_blocks -= len(output.blocks)
487
+ output_size = output.size_bytes()
488
+ self._internal_outqueue.remove(output)
489
+ assert self.obj_store_mem_internal_outqueue >= 0, (
490
+ self._op,
491
+ self.obj_store_mem_internal_outqueue,
492
+ output_size,
493
+ )
494
+
495
+ def on_toggle_task_submission_backpressure(self, in_backpressure):
496
+ if in_backpressure and self._task_submission_backpressure_start_time == -1:
497
+ # backpressure starting, start timer
498
+ self._task_submission_backpressure_start_time = time.perf_counter()
499
+ elif self._task_submission_backpressure_start_time != -1:
500
+ # backpressure stopping, stop timer
501
+ self.task_submission_backpressure_time += (
502
+ time.perf_counter() - self._task_submission_backpressure_start_time
503
+ )
504
+ self._task_submission_backpressure_start_time = -1
505
+
506
+ def on_output_taken(self, output: RefBundle):
507
+ """Callback when an output is taken from the operator."""
508
+ self.num_outputs_taken += 1
509
+ self.bytes_outputs_taken += output.size_bytes()
510
+
511
+ def on_task_submitted(self, task_index: int, inputs: RefBundle):
512
+ """Callback when the operator submits a task."""
513
+ self.num_tasks_submitted += 1
514
+ self.num_tasks_running += 1
515
+ self.bytes_inputs_of_submitted_tasks += inputs.size_bytes()
516
+ self._pending_task_inputs.add(inputs)
517
+ self._running_tasks[task_index] = RunningTaskInfo(inputs, 0, 0)
518
+
519
+ def on_task_output_generated(self, task_index: int, output: RefBundle):
520
+ """Callback when a new task generates an output."""
521
+ num_outputs = len(output)
522
+ output_bytes = output.size_bytes()
523
+
524
+ self.num_task_outputs_generated += num_outputs
525
+ self.bytes_task_outputs_generated += output_bytes
526
+
527
+ task_info = self._running_tasks[task_index]
528
+ if task_info.num_outputs == 0:
529
+ self.num_tasks_have_outputs += 1
530
+ task_info.num_outputs += num_outputs
531
+ task_info.bytes_outputs += output_bytes
532
+
533
+ for block_ref, meta in output.blocks:
534
+ assert meta.exec_stats and meta.exec_stats.wall_time_s
535
+ self.block_generation_time += meta.exec_stats.wall_time_s
536
+ assert meta.num_rows is not None
537
+ self.rows_task_outputs_generated += meta.num_rows
538
+ trace_allocation(block_ref, "operator_output")
539
+
540
+ def on_task_finished(self, task_index: int, exception: Optional[Exception]):
541
+ """Callback when a task is finished."""
542
+ self.num_tasks_running -= 1
543
+ self.num_tasks_finished += 1
544
+ if exception is not None:
545
+ self.num_tasks_failed += 1
546
+
547
+ task_info = self._running_tasks[task_index]
548
+ self.num_outputs_of_finished_tasks += task_info.num_outputs
549
+ self.bytes_outputs_of_finished_tasks += task_info.bytes_outputs
550
+
551
+ inputs = self._running_tasks[task_index].inputs
552
+ self.num_task_inputs_processed += len(inputs)
553
+ total_input_size = inputs.size_bytes()
554
+ self.bytes_task_inputs_processed += total_input_size
555
+ input_size = inputs.size_bytes()
556
+ self._pending_task_inputs.remove(inputs)
557
+ assert self.obj_store_mem_pending_task_inputs >= 0, (
558
+ self._op,
559
+ self.obj_store_mem_pending_task_inputs,
560
+ input_size,
561
+ )
562
+
563
+ ctx = self._op.data_context
564
+ if ctx.enable_get_object_locations_for_metrics:
565
+ locations = ray.experimental.get_object_locations(inputs.block_refs)
566
+ for block, meta in inputs.blocks:
567
+ if locations[block].get("did_spill", False):
568
+ assert meta.size_bytes is not None
569
+ self.obj_store_mem_spilled += meta.size_bytes
570
+
571
+ self.obj_store_mem_freed += total_input_size
572
+
573
+ inputs.destroy_if_owned()
574
+ del self._running_tasks[task_index]
.venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/physical_operator.py ADDED
@@ -0,0 +1,535 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from abc import ABC, abstractmethod
2
+ from typing import Any, Callable, Dict, Iterator, List, Optional, Union
3
+
4
+ import ray
5
+ from .ref_bundle import RefBundle
6
+ from ray._raylet import ObjectRefGenerator
7
+ from ray.data._internal.execution.autoscaler.autoscaling_actor_pool import (
8
+ AutoscalingActorPool,
9
+ )
10
+ from ray.data._internal.execution.interfaces.execution_options import (
11
+ ExecutionOptions,
12
+ ExecutionResources,
13
+ )
14
+ from ray.data._internal.execution.interfaces.op_runtime_metrics import OpRuntimeMetrics
15
+ from ray.data._internal.logical.interfaces import LogicalOperator, Operator
16
+ from ray.data._internal.stats import StatsDict
17
+ from ray.data.context import DataContext
18
+
19
+ # TODO(hchen): Ray Core should have a common interface for these two types.
20
+ Waitable = Union[ray.ObjectRef, ObjectRefGenerator]
21
+
22
+
23
class OpTask(ABC):
    """Base class for a unit of work launched by a PhysicalOperator.

    Concrete subclasses wrap either a regular Ray task or an actor task.
    """

    def __init__(self, task_index: int):
        # Index uniquely identifying this task within its operator.
        self._index = task_index

    def task_index(self) -> int:
        """Return the unique index of this task within its operator."""
        return self._index

    @abstractmethod
    def get_waitable(self) -> Waitable:
        """Return the ObjectRef or ObjectRefGenerator the executor waits on."""
        ...
40
+
41
+
42
class DataOpTask(OpTask):
    """Represents an OpTask that handles Block data.

    Wraps a Ray streaming generator that alternately yields a block reference
    and its BlockMetadata; each pair is forwarded downstream as a RefBundle
    via `output_ready_callback`.
    """

    def __init__(
        self,
        task_index: int,
        streaming_gen: ObjectRefGenerator,
        output_ready_callback: Callable[[RefBundle], None],
        task_done_callback: Callable[[Optional[Exception]], None],
    ):
        """
        Args:
            task_index: Index of this task within its operator.
            streaming_gen: The streaming generator of this task. It should yield
                blocks.
            output_ready_callback: The callback to call when a new RefBundle is
                output from the generator.
            task_done_callback: The callback to call when the task is done.
                Receives the task's exception on failure, or None on success.
        """
        super().__init__(task_index)
        # TODO(hchen): Right now, the streaming generator is required to yield a Block
        # and a BlockMetadata each time. We should unify task submission with an unified
        # interface. So each individual operator don't need to take care of the
        # BlockMetadata.
        self._streaming_gen = streaming_gen
        self._output_ready_callback = output_ready_callback
        self._task_done_callback = task_done_callback

    def get_waitable(self) -> ObjectRefGenerator:
        """Return the streaming generator the executor waits on."""
        return self._streaming_gen

    def on_data_ready(self, max_bytes_to_read: Optional[int]) -> int:
        """Callback when data is ready to be read from the streaming generator.

        Args:
            max_bytes_to_read: Max bytes of blocks to read. If None, all available
                will be read.
        Returns: The number of bytes read (sum of block metadata sizes).
        """
        bytes_read = 0
        while max_bytes_to_read is None or bytes_read < max_bytes_to_read:
            try:
                # Non-blocking poll (timeout=0) for the next block reference.
                block_ref = self._streaming_gen._next_sync(0)
                if block_ref.is_nil():
                    # The generator currently doesn't have new output.
                    # And it's not stopped yet.
                    break
            except StopIteration:
                # Generator exhausted: the task completed successfully.
                self._task_done_callback(None)
                break

            try:
                meta = ray.get(next(self._streaming_gen))
            except StopIteration:
                # The generator should always yield 2 values (block and metadata)
                # each time. If we get a StopIteration here, it means an error
                # happened in the task.
                # And in this case, the block_ref is the exception object.
                # TODO(hchen): Ray Core should have a better interface for
                # detecting and obtaining the exception.
                try:
                    ray.get(block_ref)
                    assert False, "Above ray.get should raise an exception."
                except Exception as ex:
                    self._task_done_callback(ex)
                    raise ex from None
            # Only reached on the success path; the except branch above re-raises.
            self._output_ready_callback(
                RefBundle([(block_ref, meta)], owns_blocks=True)
            )
            bytes_read += meta.size_bytes
        return bytes_read
111
+
112
+
113
class MetadataOpTask(OpTask):
    """An OpTask that produces only metadata, not Block data.

    Wraps a plain Ray ObjectRef rather than a streaming generator.
    """

    def __init__(
        self,
        task_index: int,
        object_ref: ray.ObjectRef,
        task_done_callback: Callable[[], None],
    ):
        """
        Args:
            task_index: Index of this task within its operator.
            object_ref: The ObjectRef of the task.
            task_done_callback: Invoked once when the task completes.
        """
        super().__init__(task_index)
        self._ref = object_ref
        self._on_done = task_done_callback

    def get_waitable(self) -> ray.ObjectRef:
        """Return the task's ObjectRef for the executor to wait on."""
        return self._ref

    def on_task_finished(self):
        """Notify the registered callback that the task has completed."""
        self._on_done()
137
+
138
+
139
class PhysicalOperator(Operator):
    """Abstract class for physical operators.

    An operator transforms one or more input streams of RefBundles into a single
    output stream of RefBundles.

    Physical operators are stateful and non-serializable; they live on the driver side
    of the Dataset only.

    Here's a simple example of implementing a basic "Map" operator:

        class MapOperator(PhysicalOperator):
            def __init__(self):
                self.active_tasks = []

            def add_input(self, refs, _):
                self.active_tasks.append(map_task.remote(refs))

            def has_next(self):
                ready, _ = ray.wait(self.active_tasks, timeout=0)
                return len(ready) > 0

            def get_next(self):
                ready, remaining = ray.wait(self.active_tasks, num_returns=1)
                self.active_tasks = remaining
                return ready[0]

    Note that the above operator fully supports both bulk and streaming execution,
    since `add_input` and `get_next` can be called in any order. In bulk execution
    (now deprecated), all inputs would be added up-front, but in streaming
    execution (now the default execution mode) the calls could be interleaved.
    """

    def __init__(
        self,
        name: str,
        input_dependencies: List["PhysicalOperator"],
        data_context: DataContext,
        target_max_block_size: Optional[int],
    ):
        super().__init__(name, input_dependencies)

        for x in input_dependencies:
            assert isinstance(x, PhysicalOperator), x
        # True once every upstream operator has finished producing input.
        # An operator with no inputs starts out "complete".
        self._inputs_complete = not input_dependencies
        self._target_max_block_size = target_max_block_size
        self._started = False
        self._in_task_submission_backpressure = False
        self._in_task_output_backpressure = False
        self._metrics = OpRuntimeMetrics(self)
        self._estimated_num_output_bundles = None
        self._estimated_output_num_rows = None
        self._execution_completed = False
        # The LogicalOperator(s) which were translated to create this PhysicalOperator.
        # Set via `PhysicalOperator.set_logical_operators()`.
        self._logical_operators: List[LogicalOperator] = []
        self._data_context = data_context

    def __reduce__(self):
        # Physical operators hold driver-side state (metrics, task handles)
        # and must never be pickled.
        raise ValueError("Operator is not serializable.")

    @property
    def data_context(self) -> DataContext:
        return self._data_context

    # Override the following 3 methods to correct type hints.

    @property
    def input_dependencies(self) -> List["PhysicalOperator"]:
        return super().input_dependencies  # type: ignore

    @property
    def output_dependencies(self) -> List["PhysicalOperator"]:
        return super().output_dependencies  # type: ignore

    def post_order_iter(self) -> Iterator["PhysicalOperator"]:
        return super().post_order_iter()  # type: ignore

    def set_logical_operators(
        self,
        *logical_ops: LogicalOperator,
    ):
        """Record the logical operator(s) this physical operator was planned from."""
        self._logical_operators = list(logical_ops)

    @property
    def target_max_block_size(self) -> Optional[int]:
        """
        Target max block size output by this operator. If this returns None,
        then the default from DataContext should be used.
        """
        return self._target_max_block_size

    @property
    def actual_target_max_block_size(self) -> int:
        """
        The actual target max block size output by this operator, falling back
        to the DataContext default when no per-operator value was set.
        """
        target_max_block_size = self._target_max_block_size
        if target_max_block_size is None:
            target_max_block_size = self.data_context.target_max_block_size
        return target_max_block_size

    def set_target_max_block_size(self, target_max_block_size: Optional[int]):
        self._target_max_block_size = target_max_block_size

    def mark_execution_completed(self):
        """Manually mark this operator has completed execution."""
        self._execution_completed = True

    def completed(self) -> bool:
        """Return True when this operator is completed.

        An operator is completed when it has stopped execution and all
        outputs are taken.
        """
        if not self._execution_completed:
            if self._inputs_complete and self.num_active_tasks() == 0:
                # If all inputs are complete and there are no active tasks,
                # then the operator has completed execution.
                self._execution_completed = True
        return self._execution_completed and not self.has_next()

    def get_stats(self) -> StatsDict:
        """Return recorded execution stats for use with DatasetStats."""
        raise NotImplementedError

    @property
    def metrics(self) -> OpRuntimeMetrics:
        """Returns the runtime metrics of this operator."""
        # Refresh subclass-specific metrics on every access.
        self._metrics._extra_metrics = self._extra_metrics()
        return self._metrics

    def _extra_metrics(self) -> Dict[str, Any]:
        """Subclasses should override this method to report extra metrics
        that are specific to them."""
        return {}

    def progress_str(self) -> str:
        """Return any extra status to be displayed in the operator progress bar.

        For example, `<N> actors` to show current number of actors in an actor pool.
        """
        return ""

    def num_outputs_total(self) -> Optional[int]:
        """Returns the total number of output bundles of this operator,
        or ``None`` if unable to provide a reasonable estimate (for example,
        if no tasks have finished yet).

        The value returned may be an estimate based off the consumption so far.
        This is useful for reporting progress.

        Subclasses should either override this method, or update
        ``self._estimated_num_output_bundles`` appropriately.
        """
        return self._estimated_num_output_bundles

    def num_output_rows_total(self) -> Optional[int]:
        """Returns the total number of output rows of this operator,
        or ``None`` if unable to provide a reasonable estimate (for example,
        if no tasks have finished yet).

        The value returned may be an estimate based off the consumption so far.
        This is useful for reporting progress.

        Subclasses should either override this method, or update
        ``self._estimated_output_num_rows`` appropriately.
        """
        return self._estimated_output_num_rows

    def start(self, options: ExecutionOptions) -> None:
        """Called by the executor when execution starts for an operator.

        Args:
            options: The global options used for the overall execution.
        """
        self._started = True

    def should_add_input(self) -> bool:
        """Return whether it is desirable to add input to this operator right now.

        Operators can customize the implementation of this method to apply additional
        backpressure (e.g., waiting for internal actors to be created).
        """
        return True

    def add_input(self, refs: RefBundle, input_index: int) -> None:
        """Called when an upstream result is available.

        Inputs may be added in any order, and calls to `add_input` may be interleaved
        with calls to `get_next` / `has_next` to implement streaming execution.

        Subclasses should override `_add_input_inner` instead of this method.

        Args:
            refs: The ref bundle that should be added as input.
            input_index: The index identifying the input dependency producing the
                input. For most operators, this is always `0` since there is only
                one upstream input operator.
        """
        self._metrics.on_input_received(refs)
        self._add_input_inner(refs, input_index)

    def _add_input_inner(self, refs: RefBundle, input_index: int) -> None:
        """Subclasses should override this method to implement `add_input`."""
        raise NotImplementedError

    def input_done(self, input_index: int) -> None:
        """Called when the upstream operator at index `input_index` has completed().

        After this is called, the executor guarantees that no more inputs will be added
        via `add_input` for the given input index.
        """
        pass

    def all_inputs_done(self) -> None:
        """Called when all upstream operators have completed().

        After this is called, the executor guarantees that no more inputs will be added
        via `add_input` for any input index.
        """
        self._inputs_complete = True

    def has_next(self) -> bool:
        """Returns whether a downstream output is available.

        When this returns true, it is safe to call `get_next()`.
        """
        raise NotImplementedError

    def get_next(self) -> RefBundle:
        """Get the next downstream output.

        It is only allowed to call this if `has_next()` has returned True.

        Subclasses should override `_get_next_inner` instead of this method.
        """
        output = self._get_next_inner()
        self._metrics.on_output_taken(output)
        return output

    def _get_next_inner(self) -> RefBundle:
        """Subclasses should override this method to implement `get_next`."""
        raise NotImplementedError

    def get_active_tasks(self) -> List[OpTask]:
        """Get a list of the active tasks of this operator.

        Subclasses should return *all* running normal/actor tasks. The
        StreamingExecutor will wait on these tasks and trigger callbacks.
        """
        return []

    def num_active_tasks(self) -> int:
        """Return the number of active tasks.

        This method is used for 2 purposes:
        * Determine if this operator is completed.
        * Displaying active task info in the progress bar.
        Thus, the return value can be less than `len(get_active_tasks())`,
        if some tasks are not needed for the above purposes. E.g., for the
        actor pool map operator, readiness checking tasks can be excluded
        from `num_active_tasks`, but they should be included in
        `get_active_tasks`.

        Subclasses can override this as a performance optimization.
        """
        return len(self.get_active_tasks())

    def throttling_disabled(self) -> bool:
        """Whether to disable resource throttling for this operator.

        This should return True for operators that only manipulate bundle metadata
        (e.g., the OutputSplitter operator). This hints to the execution engine that
        these operators should not be throttled based on resource usage.
        """
        return False

    def internal_queue_size(self) -> int:
        """If the operator has an internal input queue, return its size.

        This is used to report tasks pending submission to actor pools.
        """
        return 0

    def shutdown(self) -> None:
        """Abort execution and release all resources used by this operator.

        This releases any Ray resources acquired by this operator such as active
        tasks, actors, and objects.
        """
        if not self._started:
            raise ValueError("Operator must be started before being shutdown.")

    def current_processor_usage(self) -> ExecutionResources:
        """Returns the current estimated CPU and GPU usage of this operator, excluding
        object store memory.

        This method is called by the executor to decide how to allocate processors
        between different operators.
        """
        return ExecutionResources(0, 0, 0)

    def running_processor_usage(self) -> ExecutionResources:
        """Returns the estimated running CPU and GPU usage of this operator, excluding
        object store memory.

        This method is called by the resource manager and the streaming
        executor to display the number of currently running CPUs and GPUs in the
        progress bar.

        Note, this method returns `current_processor_usage() -
        pending_processor_usage()` by default. Subclasses should only override
        `pending_processor_usage()` if needed.
        """
        usage = self.current_processor_usage()
        usage = usage.subtract(self.pending_processor_usage())
        return usage

    def pending_processor_usage(self) -> ExecutionResources:
        """Returns the estimated pending CPU and GPU usage of this operator, excluding
        object store memory.

        This method is called by the resource manager and the streaming
        executor to display the number of currently pending actors in the
        progress bar.
        """
        return ExecutionResources(0, 0, 0)

    def base_resource_usage(self) -> ExecutionResources:
        """Returns the minimum amount of resources required for execution.

        For example, an operator that creates an actor pool requiring 8 GPUs could
        return ExecutionResources(gpu=8) as its base usage.
        """
        return ExecutionResources()

    def incremental_resource_usage(self) -> ExecutionResources:
        """Returns the incremental resources required for processing another input.

        For example, an operator that launches a task per input could return
        ExecutionResources(cpu=1) as its incremental usage.
        """
        return ExecutionResources()

    def notify_in_task_submission_backpressure(self, in_backpressure: bool) -> None:
        """Called periodically from the executor to update internal in backpressure
        status for stats collection purposes.

        Args:
            in_backpressure: Value this operator's in_backpressure should be set to.
        """
        # only update on change to in_backpressure
        if self._in_task_submission_backpressure != in_backpressure:
            self._metrics.on_toggle_task_submission_backpressure(in_backpressure)
            self._in_task_submission_backpressure = in_backpressure

    def get_autoscaling_actor_pools(self) -> List[AutoscalingActorPool]:
        """Return a list of `AutoscalingActorPool`s managed by this operator."""
        return []

    def implements_accurate_memory_accounting(self) -> bool:
        """Return whether this operator implements accurate memory accounting.

        An operator that implements accurate memory accounting should properly
        report its memory usage via the following APIs:
        - `self._metrics.on_input_queued`.
        - `self._metrics.on_input_dequeued`.
        - `self._metrics.on_output_queued`.
        - `self._metrics.on_output_dequeued`.
        """
        # TODO(hchen): Currently we only enable `ReservationOpResourceAllocator` when
        # all operators in the dataset have implemented accurate memory accounting.
        # Eventually all operators should implement accurate memory accounting.
        return False

    def supports_fusion(self) -> bool:
        """Returns ```True``` if this operator can be fused with other operators."""
        return False

    def update_resource_usage(self) -> None:
        """Updates resource usage of this operator at runtime.

        This method will be called at runtime in each StreamingExecutor iteration.
        Subclasses can override it to account for dynamic resource usage updates due to
        restarting actors, retrying tasks, lost objects, etc.
        """
        pass

    def actor_info_progress_str(self) -> str:
        """Returns Actor progress strings for Alive, Restarting and Pending Actors.

        This method will be called in summary_str API in OpState. Subclasses can
        override it to return Actor progress strings for Alive, Restarting and Pending
        Actors.
        """
        return ""
.venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/ref_bundle.py ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ from typing import Iterator, List, Optional, Tuple
3
+
4
+ import ray
5
+ from .common import NodeIdStr
6
+ from ray.data._internal.memory_tracing import trace_deallocation
7
+ from ray.data.block import Block, BlockMetadata
8
+ from ray.data.context import DataContext
9
+ from ray.types import ObjectRef
10
+
11
+
12
@dataclass
class RefBundle:
    """A group of data block references and their metadata.

    Operators take in and produce streams of RefBundles.

    Most commonly a RefBundle consists of a single block object reference.
    In some cases, e.g., due to block splitting, or for a reduce task, there may
    be more than one block.

    Block bundles have ownership semantics, i.e., shared ownership (similar to C++
    shared_ptr, multiple operators share the same block bundle), or unique ownership
    (similar to C++ unique_ptr, only one operator owns the block bundle). This
    allows operators to know whether they can destroy blocks when they don't need
    them. Destroying blocks eagerly is more efficient than waiting for Python GC /
    Ray reference counting to kick in.
    """

    # The size_bytes must be known in the metadata, num_rows is optional.
    blocks: Tuple[Tuple[ObjectRef[Block], BlockMetadata]]

    # Whether we own the blocks (can safely destroy them).
    owns_blocks: bool

    # This attribute is used by the split() operator to assign bundles to logical
    # output splits. It is otherwise None.
    output_split_idx: Optional[int] = None

    # Cached location, used for get_cached_location(). "" means "resolved but
    # unknown"; None means "not resolved yet".
    _cached_location: Optional[NodeIdStr] = None

    def __post_init__(self):
        # Normalize a list of block pairs to a tuple, then validate each entry.
        if not isinstance(self.blocks, tuple):
            object.__setattr__(self, "blocks", tuple(self.blocks))
        for b in self.blocks:
            assert isinstance(b, tuple), b
            assert len(b) == 2, b
            assert isinstance(b[0], ray.ObjectRef), b
            assert isinstance(b[1], BlockMetadata), b
            if b[1].size_bytes is None:
                raise ValueError(
                    "The size in bytes of the block must be known: {}".format(b)
                )

    def __setattr__(self, key, value):
        # `blocks` and `owns_blocks` are immutable after construction;
        # `hasattr` lets __init__ perform the initial assignment.
        if hasattr(self, key) and key in ["blocks", "owns_blocks"]:
            raise ValueError(f"The `{key}` field of RefBundle cannot be updated.")
        object.__setattr__(self, key, value)

    @property
    def block_refs(self) -> List[ObjectRef[Block]]:
        """List of block references in this bundle."""
        return [block_ref for block_ref, _ in self.blocks]

    @property
    def metadata(self) -> List[BlockMetadata]:
        """List of block metadata in this bundle."""
        return [metadata for _, metadata in self.blocks]

    def num_rows(self) -> Optional[int]:
        """Number of rows present in this bundle, if known.

        Returns None if any block's row count is unknown.
        """
        total = 0
        for m in self.metadata:
            if m.num_rows is None:
                return None
            else:
                total += m.num_rows
        return total

    def size_bytes(self) -> int:
        """Size of the blocks of this bundle in bytes."""
        return sum(m.size_bytes for m in self.metadata)

    def destroy_if_owned(self) -> int:
        """Clears the object store memory for these blocks if owned.

        Blocks are only freed when this bundle owns them AND eager freeing is
        enabled in the current DataContext.

        Returns:
            The number of bytes freed.
        """
        should_free = self.owns_blocks and DataContext.get_current().eager_free
        for block_ref in self.block_refs:
            trace_deallocation(
                block_ref, "RefBundle.destroy_if_owned", free=should_free
            )
        return self.size_bytes() if should_free else 0

    def get_cached_location(self) -> Optional[NodeIdStr]:
        """Return a location for this bundle's data, if possible.

        Caches the resolved location so multiple calls to this are efficient.
        """
        if self._cached_location is None:
            # Only consider the first block in the bundle for now. TODO(ekl) consider
            # taking into account other blocks.
            ref = self.block_refs[0]
            # This call is pretty fast for owned objects (~5k/s), so we don't need to
            # batch it for now.
            locs = ray.experimental.get_object_locations([ref])
            nodes = locs[ref]["node_ids"]
            if nodes:
                self._cached_location = nodes[0]
            else:
                self._cached_location = ""
        if self._cached_location:
            return self._cached_location
        else:
            return None  # Return None if cached location is "".

    def __eq__(self, other) -> bool:
        # Bundles compare (and hash, below) by identity, not by contents.
        return self is other

    def __hash__(self) -> int:
        return id(self)

    def __len__(self) -> int:
        return len(self.blocks)
128
+
129
+
130
def _ref_bundles_iterator_to_block_refs_list(
    ref_bundles: Iterator[RefBundle],
) -> List[ObjectRef[Block]]:
    """Flatten an iterator of RefBundles into a list of block object references."""
    block_refs: List[ObjectRef[Block]] = []
    for bundle in ref_bundles:
        block_refs.extend(bundle.block_refs)
    return block_refs
.venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/task_context.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass, field
2
+ from typing import TYPE_CHECKING, Any, Dict, Optional
3
+
4
+ from ray.data._internal.progress_bar import ProgressBar
5
+
6
+ if TYPE_CHECKING:
7
+ from ray.data._internal.execution.operators.map_transformer import MapTransformer
8
+
9
+
10
@dataclass
class TaskContext:
    """This describes the information of a task running block transform."""

    # The index of task. Each task has a unique task index within the same
    # operator.
    task_idx: int

    # The dictionary of sub progress bars to update. The key is the name of the
    # sub progress bar. Note this is only used on the driver side.
    # TODO(chengsu): clean it up from TaskContext with new optimizer framework.
    sub_progress_bar_dict: Optional[Dict[str, ProgressBar]] = None

    # NOTE(hchen): `upstream_map_transformer` and `upstream_map_ray_remote_args`
    # are only used for `RandomShuffle`. DO NOT use them for other operators.
    # Ideally, they should be handled by the optimizer, and should be transparent
    # to the specific operators.
    # But for `RandomShuffle`, the AllToAllOperator doesn't do the shuffle itself.
    # It uses `ExchangeTaskScheduler` to launch new tasks to do the shuffle.
    # That's why we need to pass them to `ExchangeTaskScheduler`.
    # TODO(hchen): Use a physical operator to do the shuffle directly.

    # The underlying function called in a MapOperator; this is used when fusing
    # an AllToAllOperator with an upstream MapOperator.
    upstream_map_transformer: Optional["MapTransformer"] = None

    # The Ray remote arguments of the fused upstream MapOperator.
    # This should be set if upstream_map_transformer is set.
    upstream_map_ray_remote_args: Optional[Dict[str, Any]] = None

    # The target maximum number of bytes to include in the task's output block.
    target_max_block_size: Optional[int] = None

    # Additional keyword arguments passed to the task.
    kwargs: Dict[str, Any] = field(default_factory=dict)
.venv/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/transform_fn.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Callable, List, Tuple
2
+
3
+ from .ref_bundle import RefBundle
4
+ from .task_context import TaskContext
5
+ from ray.data._internal.stats import StatsDict
6
+
7
# Signature of the block transform function applied by AllToAllOperator:
# takes the operator's input RefBundles plus the TaskContext, and returns the
# transformed output RefBundles together with a StatsDict of execution stats.
AllToAllTransformFn = Callable[
    [List[RefBundle], TaskContext], Tuple[List[RefBundle], StatsDict]
]
.venv/lib/python3.11/site-packages/ray/data/_internal/execution/legacy_compat.py ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """This file contains temporary helper functions for legacy plan/executor interaction.
2
+
3
+ It should be deleted once we fully move to the new executor backend.
4
+ """
5
+
6
+ from typing import Iterator, Optional, Tuple
7
+
8
+ from ray.data._internal.block_list import BlockList
9
+ from ray.data._internal.execution.interfaces import (
10
+ Executor,
11
+ PhysicalOperator,
12
+ RefBundle,
13
+ )
14
+ from ray.data._internal.execution.interfaces.executor import OutputIterator
15
+ from ray.data._internal.logical.optimizers import get_execution_plan
16
+ from ray.data._internal.logical.util import record_operators_usage
17
+ from ray.data._internal.plan import ExecutionPlan
18
+ from ray.data._internal.stats import DatasetStats
19
+ from ray.data._internal.util import unify_block_metadata_schema
20
+ from ray.data.block import BlockMetadata
21
+
22
# Emit a warning for serialized tasks bigger than this many bytes (~100 KB).
TASK_SIZE_WARN_THRESHOLD_BYTES = 100_000
24
+
25
+
26
def execute_to_legacy_bundle_iterator(
    executor: Executor,
    plan: ExecutionPlan,
    dag_rewrite=None,
) -> Iterator[RefBundle]:
    """Execute a plan with the new executor and return a bundle iterator.

    Args:
        executor: The executor to use.
        plan: The legacy plan to execute.
        dag_rewrite: Callback that can be used to mutate the DAG prior to execution.
            This is currently used as a legacy hack to inject the OutputSplit operator
            for `Dataset.streaming_split()`.

    Returns:
        The output as a bundle iterator.
    """
    dag, stats = _get_execution_dag(
        executor,
        plan,
        preserve_order=False,
    )
    # Give the caller a chance to rewrite the DAG before execution starts
    # (e.g. to inject an OutputSplit operator for streaming_split()).
    if dag_rewrite:
        dag = dag_rewrite(dag)

    bundle_iter = executor.execute(dag, initial_stats=stats)

    class CacheMetadataIterator(OutputIterator):
        """Wrapper for `bundle_iterator` above.

        For a given iterator which yields output RefBundles,
        collect the metadata from each output bundle, and yield the
        original RefBundle. Only after the entire iterator is exhausted,
        we cache the resulting metadata to the execution plan."""

        def __init__(self, base_iterator: OutputIterator):
            # Note: the base_iterator should be of type StreamIterator,
            # defined within `StreamingExecutor.execute()`. It must
            # support the `get_next()` method.
            # NOTE(review): `super().__init__()` is not called here — confirm
            # that `OutputIterator.__init__` does not need to run.
            self._base_iterator = base_iterator
            # Running totals accumulated across all yielded bundles; cached
            # on `plan._snapshot_metadata` once iteration finishes.
            self._collected_metadata = BlockMetadata(
                num_rows=0,
                size_bytes=0,
                schema=None,
                input_files=None,
                exec_stats=None,
            )

        def get_next(self, output_split_idx: Optional[int] = None) -> RefBundle:
            try:
                bundle = self._base_iterator.get_next(output_split_idx)
                self._collect_metadata(bundle)
                return bundle
            except StopIteration:
                # Once the iterator is completely exhausted, we are done
                # collecting metadata. We can add this cached metadata to the plan.
                plan._snapshot_metadata = self._collected_metadata
                raise

        def _collect_metadata(self, bundle: RefBundle) -> RefBundle:
            """Collect the metadata from each output bundle and accumulate
            results, so we can access important information, such as
            row count, schema, etc., after iteration completes."""
            # NOTE(review): assumes `bundle.num_rows()` / `bundle.size_bytes()`
            # never return None here — verify for bundles with unknown metadata.
            self._collected_metadata.num_rows += bundle.num_rows()
            self._collected_metadata.size_bytes += bundle.size_bytes()
            self._collected_metadata.schema = unify_block_metadata_schema(
                [self._collected_metadata, *bundle.metadata]
            )
            return bundle

    bundle_iter = CacheMetadataIterator(bundle_iter)
    return bundle_iter
98
+
99
+
100
def execute_to_legacy_block_list(
    executor: Executor,
    plan: ExecutionPlan,
    dataset_uuid: str,
    preserve_order: bool,
) -> BlockList:
    """Execute a plan with the new executor and translate it into a legacy block list.

    Args:
        executor: The executor to use.
        plan: The legacy plan to execute.
        dataset_uuid: UUID of the dataset for this execution.
        preserve_order: Whether to preserve order in execution.

    Returns:
        The output as a legacy block list.
    """
    physical_dag, initial_stats = _get_execution_dag(executor, plan, preserve_order)
    output_bundles = executor.execute(physical_dag, initial_stats=initial_stats)
    result = _bundles_to_block_list(output_bundles)
    # The stats UUID can only be assigned once execution has completed.
    _set_stats_uuid_recursive(executor.get_stats(), dataset_uuid)
    return result
127
+
128
+
129
def _get_execution_dag(
    executor: Executor,
    plan: ExecutionPlan,
    preserve_order: bool,
) -> Tuple[PhysicalOperator, DatasetStats]:
    """Translate ``plan`` into a physical-operator DAG plus its initial stats."""
    # Telemetry: record which logical operators appear in the plan, if any.
    logical_plan = getattr(plan, "_logical_plan", None)
    if logical_plan is not None:
        record_operators_usage(logical_plan.dag)

    # Lower the logical plan to physical operators and pick up input stats.
    physical_dag = get_execution_plan(plan._logical_plan).dag
    initial_stats = _get_initial_stats_from_plan(plan)

    # Order-sensitive operators (e.g. Zip, Sort) force ordered execution
    # even when the caller did not request it.
    if preserve_order or plan.require_preserve_order():
        executor._options.preserve_order = True

    return physical_dag, initial_stats
149
+
150
+
151
def _get_initial_stats_from_plan(plan: ExecutionPlan) -> DatasetStats:
    """Pick the initial stats object appropriate for this plan's origin."""
    # A cached snapshot carries its own stats; reuse them directly.
    if plan._snapshot_bundle is not None:
        return plan._snapshot_stats
    # For Datasets created from "read_xxx", `plan._in_stats` contains useless data.
    # For Datasets created from "from_xxx", we need to use `plan._in_stats` as
    # the initial stats. Because the `FromXxx` logical operators will be translated
    # to "InputDataBuffer" physical operators, which will be ignored when generating
    # stats, see `StreamingExecutor._generate_stats`.
    # TODO(hchen): Unify the logic by saving the initial stats in `InputDataBuffer
    if plan.has_lazy_input():
        return DatasetStats(metadata={}, parent=None)
    return plan._in_stats
164
+
165
+
166
def _bundles_to_block_list(bundles: Iterator[RefBundle]) -> BlockList:
    """Flatten an iterator of RefBundles into a single legacy BlockList.

    The resulting list is marked consumer-owned only if *every* bundle
    owned its blocks.
    """
    block_refs = []
    meta = []
    all_owned = True
    for bundle in bundles:
        all_owned = all_owned and bundle.owns_blocks
        block_refs.extend(bundle.block_refs)
        meta.extend(bundle.metadata)
    return BlockList(block_refs, meta, owned_by_consumer=all_owned)
175
+
176
+
177
+ def _set_stats_uuid_recursive(stats: DatasetStats, dataset_uuid: str) -> None:
178
+ if not stats.dataset_uuid:
179
+ stats.dataset_uuid = dataset_uuid
180
+ for parent in stats.parents or []:
181
+ _set_stats_uuid_recursive(parent, dataset_uuid)
.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (211 Bytes). View file
 
.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/actor_pool_map_operator.cpython-311.pyc ADDED
Binary file (40.5 kB). View file
 
.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/aggregate_num_rows.cpython-311.pyc ADDED
Binary file (3.69 kB). View file
 
.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/base_physical_operator.cpython-311.pyc ADDED
Binary file (10.1 kB). View file
 
.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/input_data_buffer.cpython-311.pyc ADDED
Binary file (4.95 kB). View file
 
.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/limit_operator.cpython-311.pyc ADDED
Binary file (7.58 kB). View file
 
.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/map_operator.cpython-311.pyc ADDED
Binary file (36.8 kB). View file
 
.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/__pycache__/map_transformer.cpython-311.pyc ADDED
Binary file (24.7 kB). View file