koichi12 commited on
Commit
98ca408
·
verified ·
1 Parent(s): 80c179b

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because the commit contains too many changes. See the raw diff for the full change set.
Files changed (50) hide show
  1. .venv/lib/python3.11/site-packages/ray/data/__init__.py +165 -0
  2. .venv/lib/python3.11/site-packages/ray/data/__pycache__/__init__.cpython-311.pyc +0 -0
  3. .venv/lib/python3.11/site-packages/ray/data/__pycache__/aggregate.cpython-311.pyc +0 -0
  4. .venv/lib/python3.11/site-packages/ray/data/__pycache__/block.cpython-311.pyc +0 -0
  5. .venv/lib/python3.11/site-packages/ray/data/__pycache__/context.cpython-311.pyc +0 -0
  6. .venv/lib/python3.11/site-packages/ray/data/__pycache__/exceptions.cpython-311.pyc +0 -0
  7. .venv/lib/python3.11/site-packages/ray/data/__pycache__/grouped_data.cpython-311.pyc +0 -0
  8. .venv/lib/python3.11/site-packages/ray/data/__pycache__/iterator.cpython-311.pyc +0 -0
  9. .venv/lib/python3.11/site-packages/ray/data/__pycache__/preprocessor.cpython-311.pyc +0 -0
  10. .venv/lib/python3.11/site-packages/ray/data/__pycache__/random_access_dataset.cpython-311.pyc +0 -0
  11. .venv/lib/python3.11/site-packages/ray/data/aggregate.py +76 -0
  12. .venv/lib/python3.11/site-packages/ray/data/block.py +561 -0
  13. .venv/lib/python3.11/site-packages/ray/data/context.py +468 -0
  14. .venv/lib/python3.11/site-packages/ray/data/dataset.py +0 -0
  15. .venv/lib/python3.11/site-packages/ray/data/datasource/datasink.py +164 -0
  16. .venv/lib/python3.11/site-packages/ray/data/datasource/datasource.py +243 -0
  17. .venv/lib/python3.11/site-packages/ray/data/datasource/file_based_datasource.py +572 -0
  18. .venv/lib/python3.11/site-packages/ray/data/datasource/file_meta_provider.py +484 -0
  19. .venv/lib/python3.11/site-packages/ray/data/datasource/filename_provider.py +122 -0
  20. .venv/lib/python3.11/site-packages/ray/data/datasource/parquet_meta_provider.py +252 -0
  21. .venv/lib/python3.11/site-packages/ray/data/exceptions.py +91 -0
  22. .venv/lib/python3.11/site-packages/ray/data/grouped_data.py +494 -0
  23. .venv/lib/python3.11/site-packages/ray/data/iterator.py +931 -0
  24. .venv/lib/python3.11/site-packages/ray/data/preprocessor.py +318 -0
  25. .venv/lib/python3.11/site-packages/ray/data/random_access_dataset.py +293 -0
  26. .venv/lib/python3.11/site-packages/ray/data/read_api.py +0 -0
  27. .venv/lib/python3.11/site-packages/ray/includes/__init__.pxd +0 -0
  28. .venv/lib/python3.11/site-packages/ray/includes/common.pxd +749 -0
  29. .venv/lib/python3.11/site-packages/ray/includes/function_descriptor.pxd +80 -0
  30. .venv/lib/python3.11/site-packages/ray/includes/global_state_accessor.pxd +144 -0
  31. .venv/lib/python3.11/site-packages/ray/includes/libcoreworker.pxd +457 -0
  32. .venv/lib/python3.11/site-packages/ray/includes/metric.pxd +45 -0
  33. .venv/lib/python3.11/site-packages/ray/includes/optional.pxd +36 -0
  34. .venv/lib/python3.11/site-packages/ray/includes/ray_config.pxd +98 -0
  35. .venv/lib/python3.11/site-packages/ray/includes/unique_ids.pxd +218 -0
  36. .venv/lib/python3.11/site-packages/ray/runtime_env/__init__.py +8 -0
  37. .venv/lib/python3.11/site-packages/ray/runtime_env/__pycache__/__init__.cpython-311.pyc +0 -0
  38. .venv/lib/python3.11/site-packages/ray/runtime_env/__pycache__/runtime_env.cpython-311.pyc +0 -0
  39. .venv/lib/python3.11/site-packages/ray/runtime_env/runtime_env.py +662 -0
  40. .venv/lib/python3.11/site-packages/ray/widgets/__init__.py +4 -0
  41. .venv/lib/python3.11/site-packages/ray/widgets/__pycache__/__init__.cpython-311.pyc +0 -0
  42. .venv/lib/python3.11/site-packages/ray/widgets/__pycache__/render.cpython-311.pyc +0 -0
  43. .venv/lib/python3.11/site-packages/ray/widgets/__pycache__/util.cpython-311.pyc +0 -0
  44. .venv/lib/python3.11/site-packages/ray/widgets/render.py +39 -0
  45. .venv/lib/python3.11/site-packages/ray/widgets/templates/context.html.j2 +6 -0
  46. .venv/lib/python3.11/site-packages/ray/widgets/templates/context_dashrow.html.j2 +4 -0
  47. .venv/lib/python3.11/site-packages/ray/widgets/templates/context_logo.html.j2 +13 -0
  48. .venv/lib/python3.11/site-packages/ray/widgets/templates/context_table.html.j2 +11 -0
  49. .venv/lib/python3.11/site-packages/ray/widgets/templates/divider.html.j2 +9 -0
  50. .venv/lib/python3.11/site-packages/ray/widgets/templates/rendered_html_common.html.j2 +3 -0
.venv/lib/python3.11/site-packages/ray/data/__init__.py ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Short term workaround for https://github.com/ray-project/ray/issues/32435
2
+ # Dataset has a hard dependency on pandas, so it doesn't need to be delayed.
3
+ import pandas # noqa
4
+ from packaging.version import parse as parse_version
5
+
6
+ from ray._private.utils import _get_pyarrow_version
7
+ from ray.data._internal.compute import ActorPoolStrategy
8
+ from ray.data._internal.datasource.tfrecords_datasource import TFXReadOptions
9
+ from ray.data._internal.execution.interfaces import (
10
+ ExecutionOptions,
11
+ ExecutionResources,
12
+ NodeIdStr,
13
+ )
14
+ from ray.data._internal.logging import configure_logging
15
+ from ray.data.context import DataContext, DatasetContext
16
+ from ray.data.dataset import Dataset, Schema
17
+ from ray.data.datasource import (
18
+ BlockBasedFileDatasink,
19
+ Datasink,
20
+ Datasource,
21
+ FileShuffleConfig,
22
+ ReadTask,
23
+ RowBasedFileDatasink,
24
+ )
25
+ from ray.data.iterator import DataIterator, DatasetIterator
26
+ from ray.data.preprocessor import Preprocessor
27
+ from ray.data.read_api import ( # noqa: F401
28
+ from_arrow,
29
+ from_arrow_refs,
30
+ from_blocks,
31
+ from_dask,
32
+ from_huggingface,
33
+ from_items,
34
+ from_mars,
35
+ from_modin,
36
+ from_numpy,
37
+ from_numpy_refs,
38
+ from_pandas,
39
+ from_pandas_refs,
40
+ from_spark,
41
+ from_tf,
42
+ from_torch,
43
+ range,
44
+ range_tensor,
45
+ read_audio,
46
+ read_avro,
47
+ read_bigquery,
48
+ read_binary_files,
49
+ read_clickhouse,
50
+ read_csv,
51
+ read_databricks_tables,
52
+ read_datasource,
53
+ read_delta_sharing_tables,
54
+ read_hudi,
55
+ read_iceberg,
56
+ read_images,
57
+ read_json,
58
+ read_lance,
59
+ read_mongo,
60
+ read_numpy,
61
+ read_parquet,
62
+ read_parquet_bulk,
63
+ read_sql,
64
+ read_text,
65
+ read_tfrecords,
66
+ read_videos,
67
+ read_webdataset,
68
+ )
# Module-level cached global functions for callable classes. It needs to be defined here
# since it has to be process-global across cloudpickled funcs.
_map_actor_context = None

# Set up Ray Data's logger configuration as a module-import side effect.
configure_logging()

try:
    import pyarrow as pa

    # https://github.com/apache/arrow/pull/38608 deprecated `PyExtensionType`, and
    # disabled its deserialization by default. To ensure that users can load data
    # written with earlier version of Ray Data, we enable auto-loading of serialized
    # tensor extensions.
    pyarrow_version = _get_pyarrow_version()
    if not isinstance(pyarrow_version, str):
        # PyArrow is mocked in documentation builds. In this case, we don't need to do
        # anything.
        pass
    else:
        from ray._private.ray_constants import env_bool

        # Opt-in escape hatch: auto-loading PyExtensionType is off by default.
        RAY_DATA_AUTOLOAD_PYEXTENSIONTYPE = env_bool(
            "RAY_DATA_AUTOLOAD_PYEXTENSIONTYPE", False
        )

        if (
            parse_version(pyarrow_version) >= parse_version("14.0.1")
            and RAY_DATA_AUTOLOAD_PYEXTENSIONTYPE
        ):
            pa.PyExtensionType.set_auto_load(True)
    # Import these arrow extension types to ensure that they are registered.
    from ray.air.util.tensor_extensions.arrow import (  # noqa
        ArrowTensorType,
        ArrowVariableShapedTensorType,
    )
except ModuleNotFoundError:
    # pyarrow is optional at import time; skip extension registration if absent.
    pass


__all__ = [
    "ActorPoolStrategy",
    "BlockBasedFileDatasink",
    "Dataset",
    "DataContext",
    "DatasetContext",  # Backwards compatibility alias.
    "DataIterator",
    "DatasetIterator",  # Backwards compatibility alias.
    "Datasink",
    "Datasource",
    "ExecutionOptions",
    "ExecutionResources",
    "FileShuffleConfig",
    "NodeIdStr",
    "ReadTask",
    "RowBasedFileDatasink",
    "Schema",
    "from_dask",
    "from_items",
    "from_arrow",
    "from_arrow_refs",
    "from_mars",
    "from_modin",
    "from_numpy",
    "from_numpy_refs",
    "from_pandas",
    "from_pandas_refs",
    "from_spark",
    "from_tf",
    "from_torch",
    "from_huggingface",
    "range",
    "range_tensor",
    "read_audio",
    "read_avro",
    "read_text",
    "read_binary_files",
    "read_clickhouse",
    "read_csv",
    "read_datasource",
    "read_delta_sharing_tables",
    "read_hudi",
    "read_iceberg",
    "read_images",
    "read_json",
    "read_lance",
    "read_numpy",
    "read_mongo",
    "read_parquet",
    "read_parquet_bulk",
    "read_sql",
    "read_tfrecords",
    "read_videos",
    "read_webdataset",
    "Preprocessor",
    "TFXReadOptions",
]
.venv/lib/python3.11/site-packages/ray/data/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (3.97 kB). View file
 
.venv/lib/python3.11/site-packages/ray/data/__pycache__/aggregate.cpython-311.pyc ADDED
Binary file (4.36 kB). View file
 
.venv/lib/python3.11/site-packages/ray/data/__pycache__/block.cpython-311.pyc ADDED
Binary file (25.7 kB). View file
 
.venv/lib/python3.11/site-packages/ray/data/__pycache__/context.cpython-311.pyc ADDED
Binary file (19.8 kB). View file
 
.venv/lib/python3.11/site-packages/ray/data/__pycache__/exceptions.cpython-311.pyc ADDED
Binary file (4.6 kB). View file
 
.venv/lib/python3.11/site-packages/ray/data/__pycache__/grouped_data.cpython-311.pyc ADDED
Binary file (24.4 kB). View file
 
.venv/lib/python3.11/site-packages/ray/data/__pycache__/iterator.cpython-311.pyc ADDED
Binary file (46.5 kB). View file
 
.venv/lib/python3.11/site-packages/ray/data/__pycache__/preprocessor.cpython-311.pyc ADDED
Binary file (14.5 kB). View file
 
.venv/lib/python3.11/site-packages/ray/data/__pycache__/random_access_dataset.cpython-311.pyc ADDED
Binary file (19.8 kB). View file
 
.venv/lib/python3.11/site-packages/ray/data/aggregate.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import TYPE_CHECKING, Callable, Optional, Union
2
+
3
+ from ray.data.block import AggType, Block, BlockAccessor, KeyType, T, U
4
+ from ray.util.annotations import PublicAPI
5
+
6
+ if TYPE_CHECKING:
7
+ import pyarrow as pa
8
+
9
+
@PublicAPI
class AggregateFn:
    """Defines an aggregate function in the accumulator style.

    Aggregates a collection of inputs of type T into
    a single output value of type U.
    See https://www.sigops.org/s/conferences/sosp/2009/papers/yu-sosp09.pdf
    for more details about accumulator-based aggregation.

    Args:
        init: This is called once for each group to return the empty accumulator.
            For example, an empty accumulator for a sum would be 0.
        merge: This may be called multiple times, each time to merge
            two accumulators into one.
        name: The name of the aggregation. This will be used as the column name
            in the output Dataset.
        accumulate_row: This is called once per row of the same group.
            This combines the accumulator and the row, returns the updated
            accumulator. Exactly one of accumulate_row and accumulate_block must
            be provided.
        accumulate_block: This is used to calculate the aggregation for a
            single block, and is vectorized alternative to accumulate_row. This will
            be given a base accumulator and the entire block, allowing for
            vectorized accumulation of the block. Exactly one of accumulate_row and
            accumulate_block must be provided.
        finalize: This is called once to compute the final aggregation
            result from the fully merged accumulator.

    Raises:
        ValueError: If neither or both of accumulate_row/accumulate_block are given.
        TypeError: If ``name`` is not a string.
    """

    def __init__(
        self,
        init: Callable[[KeyType], AggType],
        merge: Callable[[AggType, AggType], AggType],
        name: str,
        accumulate_row: Optional[Callable[[AggType, T], AggType]] = None,
        accumulate_block: Optional[Callable[[AggType, Block], AggType]] = None,
        finalize: Optional[Callable[[AggType], U]] = None,
    ):
        # Exactly one accumulation strategy must be supplied (XOR check).
        if (accumulate_row is None) == (accumulate_block is None):
            raise ValueError(
                "Exactly one of accumulate_row or accumulate_block must be provided."
            )

        # Validate `name` before building the wrapper so misuse fails fast with
        # a message that matches the actual check (a string is required, not
        # merely "provided").
        if not isinstance(name, str):
            raise TypeError(f"`name` must be a string, got {type(name).__name__}.")

        if accumulate_block is None:
            # Wrap the row-wise accumulator so the rest of the engine only ever
            # deals with block-level accumulation.
            def accumulate_block(a: AggType, block: Block) -> AggType:
                block_acc = BlockAccessor.for_block(block)
                for r in block_acc.iter_rows(public_row_format=False):
                    a = accumulate_row(a, r)
                return a

        if finalize is None:
            # Identity finalizer: the merged accumulator is the final result.
            finalize = lambda a: a  # noqa: E731

        self.init = init
        self.merge = merge
        self.name = name
        self.accumulate_block = accumulate_block
        self.finalize = finalize

    def _validate(self, schema: Optional[Union[type, "pa.lib.Schema"]]) -> None:
        """Raise an error if this cannot be applied to the given schema.

        The base implementation accepts any schema; subclasses override this
        to enforce column/type requirements.
        """
        pass
.venv/lib/python3.11/site-packages/ray/data/block.py ADDED
@@ -0,0 +1,561 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import collections
2
+ import logging
3
+ import os
4
+ import time
5
+ from dataclasses import dataclass
6
+ from enum import Enum
7
+ from typing import (
8
+ TYPE_CHECKING,
9
+ Any,
10
+ Callable,
11
+ Dict,
12
+ Iterator,
13
+ List,
14
+ Literal,
15
+ Optional,
16
+ Protocol,
17
+ Tuple,
18
+ TypeVar,
19
+ Union,
20
+ )
21
+
22
+ import numpy as np
23
+
24
+ import ray
25
+ from ray import DynamicObjectRefGenerator
26
+ from ray.air.util.tensor_extensions.arrow import ArrowConversionError
27
+ from ray.data._internal.util import _check_pyarrow_version, _truncated_repr
28
+ from ray.types import ObjectRef
29
+ from ray.util import log_once
30
+ from ray.util.annotations import DeveloperAPI
31
+
32
+ import psutil
33
+
34
+ try:
35
+ import resource
36
+ except ImportError:
37
+ resource = None
38
+
39
+ if TYPE_CHECKING:
40
+ import pandas
41
+ import pyarrow
42
+
43
+ from ray.data._internal.block_builder import BlockBuilder
44
+ from ray.data._internal.planner.exchange.sort_task_spec import SortKey
45
+ from ray.data.aggregate import AggregateFn
46
+
47
+
# Row/input element type (contravariant) and output element type (covariant)
# for user-defined transformations.
T = TypeVar("T", contravariant=True)
U = TypeVar("U", covariant=True)

# Group-by key type and aggregation accumulator type.
KeyType = TypeVar("KeyType")
AggType = TypeVar("AggType")


# Represents a batch of records to be stored in the Ray object store.
#
# Block data can be accessed in a uniform way via ``BlockAccessors`` like
# ``ArrowBlockAccessor``.
Block = Union["pyarrow.Table", "pandas.DataFrame"]


logger = logging.getLogger(__name__)


@DeveloperAPI
class BlockType(Enum):
    # The two physical block representations supported by Ray Data.
    ARROW = "arrow"
    PANDAS = "pandas"


# User-facing data batch type. This is the data type for data that is supplied to and
# returned from batch UDFs.
DataBatch = Union["pyarrow.Table", "pandas.DataFrame", Dict[str, np.ndarray]]

# User-facing data column type. This is the data type for data that is supplied to and
# returned from column UDFs.
DataBatchColumn = Union[
    "pyarrow.ChunkedArray", "pyarrow.Array", "pandas.Series", np.ndarray
]


# A class type that implements __call__.
CallableClass = type


class _CallableClassProtocol(Protocol[T, U]):
    # Structural type for callable-class UDFs: one positional argument,
    # returning either a single result or an iterator of results.
    def __call__(self, __arg: T) -> Union[U, Iterator[U]]:
        ...


# A user defined function passed to map, map_batches, etc.
UserDefinedFunction = Union[
    Callable[[T], U],
    Callable[[T], Iterator[U]],
    "_CallableClassProtocol",
]

# A list of block references pending computation by a single task. For example,
# this may be the output of a task reading a file.
BlockPartition = List[Tuple[ObjectRef[Block], "BlockMetadata"]]

# The metadata that describes the output of a BlockPartition. This has the
# same type as the metadata that describes each block in the partition.
BlockPartitionMetadata = List["BlockMetadata"]

# TODO(ekl/chengsu): replace this with just
# `DynamicObjectRefGenerator` once block splitting
# is on by default. When block splitting is off, the type is a plain block.
MaybeBlockPartition = Union[Block, DynamicObjectRefGenerator]
110
+
111
+ VALID_BATCH_FORMATS = ["pandas", "pyarrow", "numpy", None]
112
+ DEFAULT_BATCH_FORMAT = "numpy"
113
+
114
+
115
+ def _apply_batch_format(given_batch_format: Optional[str]) -> str:
116
+ if given_batch_format == "default":
117
+ given_batch_format = DEFAULT_BATCH_FORMAT
118
+ if given_batch_format not in VALID_BATCH_FORMATS:
119
+ raise ValueError(
120
+ f"The given batch format {given_batch_format} isn't allowed (must be one of"
121
+ f" {VALID_BATCH_FORMATS})."
122
+ )
123
+ return given_batch_format
124
+
125
+
126
+ def _apply_batch_size(
127
+ given_batch_size: Optional[Union[int, Literal["default"]]]
128
+ ) -> Optional[int]:
129
+ if given_batch_size == "default":
130
+ return ray.data.context.DEFAULT_BATCH_SIZE
131
+ else:
132
+ return given_batch_size
133
+
134
+
@DeveloperAPI
class BlockExecStats:
    """Execution stats for this block.

    Attributes:
        wall_time_s: The wall-clock time it took to compute this block.
        cpu_time_s: The CPU time it took to compute this block.
        node_id: A unique id for the node that computed this block.
    """

    def __init__(self):
        self.start_time_s: Optional[float] = None
        self.end_time_s: Optional[float] = None
        self.wall_time_s: Optional[float] = None
        # Time spent inside user-defined functions; starts at 0 rather than
        # None so it can be accumulated incrementally.
        self.udf_time_s: Optional[float] = 0
        self.cpu_time_s: Optional[float] = None
        self.node_id = ray.runtime_context.get_runtime_context().get_node_id()
        # Max memory usage. May be an overestimate since we do not
        # differentiate from previous tasks on the same worker.
        self.max_rss_bytes: int = 0
        self.task_idx: Optional[int] = None

    @staticmethod
    def builder() -> "_BlockExecStatsBuilder":
        # Preferred construction path: the builder records start timestamps
        # at creation and fills in the deltas on build().
        return _BlockExecStatsBuilder()

    def __repr__(self):
        # Only the headline timing fields are shown; other attributes are
        # available directly on the instance.
        return repr(
            {
                "wall_time_s": self.wall_time_s,
                "cpu_time_s": self.cpu_time_s,
                "udf_time_s": self.udf_time_s,
                "node_id": self.node_id,
            }
        )
170
+
171
+
class _BlockExecStatsBuilder:
    """Helper class for building block stats.

    Instantiating the builder records the start timestamps; calling
    ``build()`` captures the end timestamps and returns a populated
    ``BlockExecStats`` with the deltas filled in.
    """

    def __init__(self):
        self.start_time = time.perf_counter()
        self.start_cpu = time.process_time()

    def build(self) -> "BlockExecStats":
        self.end_time = time.perf_counter()
        self.end_cpu = time.process_time()

        exec_stats = BlockExecStats()
        exec_stats.start_time_s = self.start_time
        exec_stats.end_time_s = self.end_time
        exec_stats.wall_time_s = self.end_time - self.start_time
        exec_stats.cpu_time_s = self.end_cpu - self.start_cpu
        if resource is not None:
            # ru_maxrss is the peak RSS; multiply by 1e3 to convert the
            # kilobyte units reported on Linux into bytes.
            exec_stats.max_rss_bytes = int(
                resource.getrusage(resource.RUSAGE_SELF).ru_maxrss * 1e3
            )
        else:
            # NOTE(swang): resource package is not supported on Windows. This
            # is only the memory usage at the end of the task, not the peak
            # memory.
            current_proc = psutil.Process(os.getpid())
            exec_stats.max_rss_bytes = int(current_proc.memory_info().rss)
        return exec_stats
203
+
204
+
@DeveloperAPI
@dataclass
class BlockMetadata:
    """Metadata about the block."""

    #: The number of rows contained in this block, or None.
    num_rows: Optional[int]
    #: The approximate size in bytes of this block, or None.
    size_bytes: Optional[int]
    #: The pyarrow schema or types of the block elements, or None.
    schema: Optional[Union[type, "pyarrow.lib.Schema"]]
    #: The list of file paths used to generate this block, or
    #: the empty list if indeterminate.
    input_files: Optional[List[str]]
    #: Execution stats for this block.
    exec_stats: Optional[BlockExecStats]

    def __post_init__(self):
        # Normalize missing input_files to an empty list so consumers can
        # iterate without a None check.
        if self.input_files is None:
            self.input_files = []
        if self.size_bytes is not None:
            # Require size_bytes to be int, ray.util.metrics objects
            # will not take other types like numpy.int64
            assert isinstance(self.size_bytes, int)
229
+
230
+
@DeveloperAPI
class BlockAccessor:
    """Provides accessor methods for a specific block.

    Ideally, we wouldn't need a separate accessor classes for blocks. However,
    this is needed if we want to support storing ``pyarrow.Table`` directly
    as a top-level Ray object, without a wrapping class (issue #17186).

    Most methods here are abstract and raise ``NotImplementedError``; concrete
    behavior lives in the Arrow and Pandas accessor subclasses.
    """

    def num_rows(self) -> int:
        """Return the number of rows contained in this block."""
        raise NotImplementedError

    def iter_rows(self, public_row_format: bool) -> Iterator[T]:
        """Iterate over the rows of this block.

        Args:
            public_row_format: Whether to cast rows into the public Dict row
                format (this incurs extra copy conversions).
        """
        raise NotImplementedError

    def slice(self, start: int, end: int, copy: bool) -> Block:
        """Return a slice of this block.

        Args:
            start: The starting index of the slice (inclusive).
            end: The ending index of the slice (exclusive).
            copy: Whether to perform a data copy for the slice.

        Returns:
            The sliced block result.
        """
        raise NotImplementedError

    def take(self, indices: List[int]) -> Block:
        """Return a new block containing the provided row indices.

        Args:
            indices: The row indices to return.

        Returns:
            A new block containing the provided row indices.
        """
        raise NotImplementedError

    def select(self, columns: List[Optional[str]]) -> Block:
        """Return a new block containing the provided columns."""
        raise NotImplementedError

    def rename_columns(self, columns_rename: Dict[str, str]) -> Block:
        """Return the block reflecting the renamed columns."""
        raise NotImplementedError

    def random_shuffle(self, random_seed: Optional[int]) -> Block:
        """Randomly shuffle this block."""
        raise NotImplementedError

    def to_pandas(self) -> "pandas.DataFrame":
        """Convert this block into a Pandas dataframe."""
        raise NotImplementedError

    def to_numpy(
        self, columns: Optional[Union[str, List[str]]] = None
    ) -> Union[np.ndarray, Dict[str, np.ndarray]]:
        """Convert this block (or columns of block) into a NumPy ndarray.

        Args:
            columns: Name of columns to convert, or None if converting all columns.
        """
        raise NotImplementedError

    def to_arrow(self) -> "pyarrow.Table":
        """Convert this block into an Arrow table."""
        raise NotImplementedError

    def to_block(self) -> Block:
        """Return the base block that this accessor wraps."""
        raise NotImplementedError

    def to_default(self) -> Block:
        """Return the default data format for this accessor."""
        return self.to_block()

    def to_batch_format(self, batch_format: Optional[str]) -> DataBatch:
        """Convert this block into the provided batch format.

        Args:
            batch_format: The batch format to convert this block to. ``None``
                returns the raw block; ``"default"``/``"native"`` return the
                accessor's default format.

        Returns:
            This block formatted as the provided batch format.

        Raises:
            ValueError: If ``batch_format`` is not one of VALID_BATCH_FORMATS.
        """
        if batch_format is None:
            return self.to_block()
        elif batch_format == "default" or batch_format == "native":
            return self.to_default()
        elif batch_format == "pandas":
            return self.to_pandas()
        elif batch_format == "pyarrow":
            return self.to_arrow()
        elif batch_format == "numpy":
            return self.to_numpy()
        else:
            raise ValueError(
                f"The batch format must be one of {VALID_BATCH_FORMATS}, got: "
                f"{batch_format}"
            )

    def size_bytes(self) -> int:
        """Return the approximate size in bytes of this block."""
        raise NotImplementedError

    def schema(self) -> Union[type, "pyarrow.lib.Schema"]:
        """Return the Python type or pyarrow schema of this block."""
        raise NotImplementedError

    def get_metadata(
        self,
        input_files: Optional[List[str]] = None,
        exec_stats: Optional[BlockExecStats] = None,
    ) -> BlockMetadata:
        """Create a metadata object from this block."""
        return BlockMetadata(
            num_rows=self.num_rows(),
            size_bytes=self.size_bytes(),
            schema=self.schema(),
            input_files=input_files,
            exec_stats=exec_stats,
        )

    def zip(self, other: "Block") -> "Block":
        """Zip this block with another block of the same type and size."""
        raise NotImplementedError

    @staticmethod
    def builder() -> "BlockBuilder":
        """Create a builder for this block type."""
        raise NotImplementedError

    @classmethod
    def batch_to_block(
        cls,
        batch: DataBatch,
        block_type: Optional[BlockType] = None,
    ) -> Block:
        """Create a block from user-facing data formats.

        Args:
            batch: The batch to convert. A mapping is converted to a block;
                standalone numpy arrays are rejected; anything else is passed
                through unchanged (assumed to already be a block).
            block_type: Target block type. If None, Arrow is tried first and
                Pandas is used as a fallback on Arrow conversion failure.
        """

        if isinstance(batch, np.ndarray):
            raise ValueError(
                f"Error validating {_truncated_repr(batch)}: "
                "Standalone numpy arrays are not "
                "allowed in Ray 2.5. Return a dict of field -> array, "
                "e.g., `{'data': array}` instead of `array`."
            )

        elif isinstance(batch, collections.abc.Mapping):
            if block_type is None or block_type == BlockType.ARROW:
                try:
                    return cls.batch_to_arrow_block(batch)
                except ArrowConversionError as e:
                    # Warn only once per process to avoid log spam.
                    if log_once("_fallback_to_pandas_block_warning"):
                        logger.warning(
                            f"Failed to convert batch to Arrow due to: {e}; "
                            f"falling back to Pandas block"
                        )

                    if block_type is None:
                        return cls.batch_to_pandas_block(batch)
                    else:
                        raise e
            else:
                assert block_type == BlockType.PANDAS
                return cls.batch_to_pandas_block(batch)
        # Not an ndarray or mapping: pass through as an existing block.
        return batch

    @classmethod
    def batch_to_arrow_block(cls, batch: Dict[str, Any]) -> Block:
        """Create an Arrow block from user-facing data formats."""
        from ray.data._internal.arrow_block import ArrowBlockBuilder

        return ArrowBlockBuilder._table_from_pydict(batch)

    @classmethod
    def batch_to_pandas_block(cls, batch: Dict[str, Any]) -> Block:
        """Create a Pandas block from user-facing data formats."""
        from ray.data._internal.pandas_block import PandasBlockAccessor

        return PandasBlockAccessor.numpy_to_block(batch)

    @staticmethod
    def for_block(block: Block) -> "BlockAccessor[T]":
        """Create a block accessor for the given block.

        Dispatches on the runtime type of ``block``: Arrow tables, pandas
        DataFrames, and bytes (deserialized via ``from_bytes``) are supported.
        """
        _check_pyarrow_version()
        import pandas
        import pyarrow

        if isinstance(block, pyarrow.Table):
            from ray.data._internal.arrow_block import ArrowBlockAccessor

            return ArrowBlockAccessor(block)
        elif isinstance(block, pandas.DataFrame):
            from ray.data._internal.pandas_block import PandasBlockAccessor

            return PandasBlockAccessor(block)
        elif isinstance(block, bytes):
            from ray.data._internal.arrow_block import ArrowBlockAccessor

            return ArrowBlockAccessor.from_bytes(block)
        elif isinstance(block, list):
            raise ValueError(
                f"Error validating {_truncated_repr(block)}: "
                "Standalone Python objects are not "
                "allowed in Ray 2.5. To use Python objects in a dataset, "
                "wrap them in a dict of numpy arrays, e.g., "
                "return `{'item': batch}` instead of just `batch`."
            )
        else:
            raise TypeError("Not a block type: {} ({})".format(block, type(block)))

    def sample(self, n_samples: int, sort_key: "SortKey") -> "Block":
        """Return a random sample of items from this block."""
        raise NotImplementedError

    def sort_and_partition(
        self, boundaries: List[T], sort_key: "SortKey"
    ) -> List["Block"]:
        """Return a list of sorted partitions of this block."""
        raise NotImplementedError

    def combine(self, key: "SortKey", aggs: Tuple["AggregateFn"]) -> Block:
        """Combine rows with the same key into an accumulator."""
        raise NotImplementedError

    @staticmethod
    def merge_sorted_blocks(
        blocks: List["Block"], sort_key: "SortKey"
    ) -> Tuple[Block, BlockMetadata]:
        """Return a sorted block by merging a list of sorted blocks."""
        raise NotImplementedError

    @staticmethod
    def aggregate_combined_blocks(
        blocks: List[Block], sort_key: "SortKey", aggs: Tuple["AggregateFn"]
    ) -> Tuple[Block, BlockMetadata]:
        """Aggregate partially combined and sorted blocks."""
        raise NotImplementedError

    def block_type(self) -> BlockType:
        """Return the block type of this block."""
        raise NotImplementedError
482
+
483
+
484
+ def _get_block_boundaries(columns: list[np.ndarray]) -> np.ndarray:
485
+ """Compute boundaries of the groups within a block, which is represented
486
+ by a list of 1D numpy arrays for each column. In each column,
487
+ NaNs/None are considered to be the same group.
488
+
489
+ Args:
490
+ columns: a list of 1D numpy arrays. This is generally given by the
491
+ dictionary values of ``BlockAccessor.to_numpy()``.
492
+
493
+ Returns:
494
+ A list of starting indices of each group and an end index of the last
495
+ group, i.e., there are ``num_groups + 1`` entries and the first and last
496
+ entries are 0 and ``len(array)`` respectively.
497
+ """
498
+
499
+ # There are 3 categories: general, numerics with NaN, and categorical with None.
500
+ # We only needed to check the last element for NaNs/None, as they are assumed to
501
+ # be sorted.
502
+ general_arrays = []
503
+ num_arrays_with_nan = []
504
+ cat_arrays_with_none = []
505
+ for arr in columns:
506
+ if np.issubdtype(arr.dtype, np.number) and np.isnan(arr[-1]):
507
+ num_arrays_with_nan.append(arr)
508
+ elif not np.issubdtype(arr.dtype, np.number) and arr[-1] is None:
509
+ cat_arrays_with_none.append(arr)
510
+ else:
511
+ general_arrays.append(arr)
512
+
513
+ # Compute the difference between each pair of elements. Handle the cases
514
+ # where neighboring elements are both NaN or None. Output as a list of
515
+ # boolean arrays.
516
+ diffs = []
517
+ if len(general_arrays) > 0:
518
+ diffs.append(
519
+ np.vstack([arr[1:] != arr[:-1] for arr in general_arrays]).any(axis=0)
520
+ )
521
+ if len(num_arrays_with_nan) > 0:
522
+ # Two neighboring numeric elements belong to the same group when they are
523
+ # 1) both finite and equal
524
+ # or 2) both np.nan
525
+ diffs.append(
526
+ np.vstack(
527
+ [
528
+ (arr[1:] != arr[:-1])
529
+ & (np.isfinite(arr[1:]) | np.isfinite(arr[:-1]))
530
+ for arr in num_arrays_with_nan
531
+ ]
532
+ ).any(axis=0)
533
+ )
534
+ if len(cat_arrays_with_none) > 0:
535
+ # Two neighboring str/object elements belong to the same group when they are
536
+ # 1) both finite and equal
537
+ # or 2) both None
538
+ diffs.append(
539
+ np.vstack(
540
+ [
541
+ (arr[1:] != arr[:-1])
542
+ & ~(np.equal(arr[1:], None) & np.equal(arr[:-1], None))
543
+ for arr in cat_arrays_with_none
544
+ ]
545
+ ).any(axis=0)
546
+ )
547
+
548
+ # A series of vectorized operations to compute the boundaries:
549
+ # - column_stack: stack the bool arrays into a single 2D bool array
550
+ # - any() and nonzero(): find the indices where any of the column diffs are True
551
+ # - add 1 to get the index of the first element of the next group
552
+ # - hstack(): include the 0 and last indices to the boundaries
553
+ boundaries = np.hstack(
554
+ [
555
+ [0],
556
+ (np.column_stack(diffs).any(axis=1).nonzero()[0] + 1),
557
+ [len(columns[0])],
558
+ ]
559
+ ).astype(int)
560
+
561
+ return boundaries
.venv/lib/python3.11/site-packages/ray/data/context.py ADDED
@@ -0,0 +1,468 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os
3
+ import threading
4
+ import warnings
5
+ from dataclasses import dataclass, field
6
+ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
7
+
8
+ import ray
9
+ from ray._private.ray_constants import env_bool, env_integer
10
+ from ray._private.worker import WORKER_MODE
11
+ from ray.util.annotations import DeveloperAPI
12
+ from ray.util.debug import log_once
13
+ from ray.util.scheduling_strategies import SchedulingStrategyT
14
+
15
+ if TYPE_CHECKING:
16
+ from ray.data._internal.execution.interfaces import ExecutionOptions
17
+
18
logger = logging.getLogger(__name__)

# The context singleton on this process.
_default_context: "Optional[DataContext]" = None
# Guards lazy creation of the singleton in DataContext.get_current().
_context_lock = threading.Lock()


# We chose 128MiB for default: With streaming execution and num_cpus many concurrent
# tasks, the memory footprint will be about 2 * num_cpus * target_max_block_size ~= RAM
# * DEFAULT_OBJECT_STORE_MEMORY_LIMIT_FRACTION * 0.3 (default object store memory
# fraction set by Ray core), assuming typical memory:core ratio of 4:1.
DEFAULT_TARGET_MAX_BLOCK_SIZE = 128 * 1024 * 1024

# We set a higher target block size because we have to materialize
# all input blocks anyway, so there is no performance advantage to having
# smaller blocks. Setting a larger block size allows avoiding overhead from an
# excessive number of partitions.
# We choose 1GiB as 4x less than the typical memory:core ratio (4:1).
DEFAULT_SHUFFLE_TARGET_MAX_BLOCK_SIZE = 1024 * 1024 * 1024

# We will attempt to slice blocks whose size exceeds this factor *
# target_max_block_size. We will warn the user if slicing fails and we produce
# blocks larger than this threshold.
MAX_SAFE_BLOCK_SIZE_FACTOR = 1.5

DEFAULT_TARGET_MIN_BLOCK_SIZE = 1 * 1024 * 1024

# This default appears to work well with most file sizes on remote storage systems,
# which is very sensitive to the buffer size.
DEFAULT_STREAMING_READ_BUFFER_SIZE = 32 * 1024 * 1024

DEFAULT_ENABLE_PANDAS_BLOCK = True

DEFAULT_READ_OP_MIN_NUM_BLOCKS = 200

DEFAULT_ACTOR_PREFETCHER_ENABLED = False

# NOTE(review): `bool()` of the raw env-var string means any non-empty value
# (including "0" or "false") enables push-based shuffle — confirm intended.
DEFAULT_USE_PUSH_BASED_SHUFFLE = bool(
    os.environ.get("RAY_DATA_PUSH_BASED_SHUFFLE", None)
)

DEFAULT_SCHEDULING_STRATEGY = "SPREAD"

# This default enables locality-based scheduling in Ray for tasks where arg data
# transfer is a bottleneck.
DEFAULT_SCHEDULING_STRATEGY_LARGE_ARGS = "DEFAULT"

DEFAULT_LARGE_ARGS_THRESHOLD = 50 * 1024 * 1024

DEFAULT_USE_POLARS = False

# Unlike the push-based-shuffle flag above, this one parses the env var as an
# int, so "0" disables it as expected.
DEFAULT_EAGER_FREE = bool(int(os.environ.get("RAY_DATA_EAGER_FREE", "1")))

DEFAULT_DECODING_SIZE_ESTIMATION_ENABLED = True

DEFAULT_MIN_PARALLELISM = 200

DEFAULT_ENABLE_TENSOR_EXTENSION_CASTING = True

# NOTE: V1 tensor type format only supports tensors of no more than 2Gb in
# total cumulative size (due to it internally utilizing int32 offsets)
#
# V2 in turn relies on int64 offsets, therefore having a limit of ~9Eb (exabytes)
DEFAULT_USE_ARROW_TENSOR_V2 = env_bool("RAY_DATA_USE_ARROW_TENSOR_V2", True)

DEFAULT_AUTO_LOG_STATS = False

DEFAULT_VERBOSE_STATS_LOG = False

DEFAULT_TRACE_ALLOCATIONS = bool(int(os.environ.get("RAY_DATA_TRACE_ALLOCATIONS", "0")))

DEFAULT_LOG_INTERNAL_STACK_TRACE_TO_STDOUT = env_bool(
    "RAY_DATA_LOG_INTERNAL_STACK_TRACE_TO_STDOUT", False
)

DEFAULT_RAY_DATA_RAISE_ORIGINAL_MAP_EXCEPTION = env_bool(
    "RAY_DATA_RAISE_ORIGINAL_MAP_EXCEPTION", False
)

DEFAULT_USE_RAY_TQDM = bool(int(os.environ.get("RAY_TQDM", "1")))

# Globally enable or disable all progress bars.
# If this is False, both the global and operator-level progress bars are disabled.
DEFAULT_ENABLE_PROGRESS_BARS = not bool(
    env_integer("RAY_DATA_DISABLE_PROGRESS_BARS", 0)
)
DEFAULT_ENABLE_PROGRESS_BAR_NAME_TRUNCATION = env_bool(
    "RAY_DATA_ENABLE_PROGRESS_BAR_NAME_TRUNCATION", True
)

DEFAULT_ENABLE_GET_OBJECT_LOCATIONS_FOR_METRICS = False


# `write_file_retry_on_errors` is deprecated in favor of `retried_io_errors`. You
# shouldn't need to modify `DEFAULT_WRITE_FILE_RETRY_ON_ERRORS`.
DEFAULT_WRITE_FILE_RETRY_ON_ERRORS = (
    "AWS Error INTERNAL_FAILURE",
    "AWS Error NETWORK_CONNECTION",
    "AWS Error SLOW_DOWN",
    "AWS Error UNKNOWN (HTTP status 503)",
)

# Error-message substrings that trigger a retry for both reads and writes;
# superset of DEFAULT_WRITE_FILE_RETRY_ON_ERRORS above.
DEFAULT_RETRIED_IO_ERRORS = (
    "AWS Error INTERNAL_FAILURE",
    "AWS Error NETWORK_CONNECTION",
    "AWS Error SLOW_DOWN",
    "AWS Error UNKNOWN (HTTP status 503)",
    "AWS Error SERVICE_UNAVAILABLE",
)

DEFAULT_WARN_ON_DRIVER_MEMORY_USAGE_BYTES = 2 * 1024 * 1024 * 1024

DEFAULT_ACTOR_TASK_RETRY_ON_ERRORS = False

DEFAULT_ENABLE_OP_RESOURCE_RESERVATION = env_bool(
    "RAY_DATA_ENABLE_OP_RESOURCE_RESERVATION", True
)

DEFAULT_OP_RESOURCE_RESERVATION_RATIO = float(
    os.environ.get("RAY_DATA_OP_RESERVATION_RATIO", "0.5")
)

DEFAULT_MAX_ERRORED_BLOCKS = 0

# Use this to prefix important warning messages for the user.
WARN_PREFIX = "⚠️ "

# Use this to prefix important success messages for the user.
OK_PREFIX = "✔️ "

# Default batch size for batch transformations.
DEFAULT_BATCH_SIZE = 1024

# Default value of the max number of blocks that can be buffered at the
# streaming generator of each `DataOpTask`.
# Note, if this value is too large, we'll need to allocate more memory
# buffer for the pending task outputs, which may lead to bad performance
# as we may not have enough memory buffer for the operator outputs.
# If the value is too small, the task may be frequently blocked due to
# streaming generator backpressure.
DEFAULT_MAX_NUM_BLOCKS_IN_STREAMING_GEN_BUFFER = 2

# Default value for whether or not to try to create directories for write
# calls if the URI is an S3 URI.
DEFAULT_S3_TRY_CREATE_DIR = False

DEFAULT_WAIT_FOR_MIN_ACTORS_S = env_integer(
    "RAY_DATA_DEFAULT_WAIT_FOR_MIN_ACTORS_S", 60 * 10
)
167
+
168
+
169
def _execution_options_factory() -> "ExecutionOptions":
    """Build a fresh ``ExecutionOptions`` for the ``execution_options`` field.

    The import happens inside the function body because importing the
    execution interfaces at module load time would create a circular
    dependency.
    """
    from ray.data._internal.execution.interfaces import ExecutionOptions

    return ExecutionOptions()
174
+
175
+
176
@DeveloperAPI
@dataclass
class DataContext:
    """Global settings for Ray Data.

    Configure this class to enable advanced features and tune performance.

    .. warning::
        Apply changes before creating a :class:`~ray.data.Dataset`. Changes made after
        won't take effect.

    .. note::
        This object is automatically propagated to workers. Access it from the driver
        and remote workers with :meth:`DataContext.get_current()`.

    Examples:
        >>> from ray.data import DataContext
        >>> DataContext.get_current().enable_progress_bars = False

    Args:
        target_max_block_size: The max target block size in bytes for reads and
            transformations.
        target_shuffle_max_block_size: The max target block size in bytes for shuffle
            ops like ``random_shuffle``, ``sort``, and ``repartition``.
        target_min_block_size: Ray Data avoids creating blocks smaller than this
            size in bytes on read. This takes precedence over
            ``read_op_min_num_blocks``.
        streaming_read_buffer_size: Buffer size when doing streaming reads from local or
            remote storage.
        enable_pandas_block: Whether pandas block format is enabled.
        actor_prefetcher_enabled: Whether to use actor based block prefetcher.
        use_push_based_shuffle: Whether to use push-based shuffle.
        pipeline_push_based_shuffle_reduce_tasks:
        scheduling_strategy: The global scheduling strategy. For tasks with large args,
            ``scheduling_strategy_large_args`` takes precedence.
        scheduling_strategy_large_args: Scheduling strategy for tasks with large args.
        large_args_threshold: Size in bytes after which point task arguments are
            considered large. Choose a value so that the data transfer overhead is
            significant in comparison to task scheduling (i.e., low tens of ms).
        use_polars: Whether to use Polars for tabular dataset sorts, groupbys, and
            aggregations.
        eager_free: Whether to eagerly free memory.
        decoding_size_estimation: Whether to estimate in-memory decoding data size for
            data source.
        min_parallelism: This setting is deprecated. Use ``read_op_min_num_blocks``
            instead.
        read_op_min_num_blocks: Minimum number of read output blocks for a dataset.
        enable_tensor_extension_casting: Whether to automatically cast NumPy ndarray
            columns in Pandas DataFrames to tensor extension columns.
        use_arrow_tensor_v2: Config enabling V2 version of ArrowTensorArray supporting
            tensors > 2Gb in size (off by default)
        enable_fallback_to_arrow_object_ext_type: Enables fallback to serialize column
            values not supported by Arrow natively (like user-defined custom Python
            classes for ex, etc) using `ArrowPythonObjectType` (simply serializing
            these as bytes)
        enable_auto_log_stats: Whether to automatically log stats after execution. If
            disabled, you can still manually print stats with ``Dataset.stats()``.
        verbose_stats_logs: Whether stats logs should be verbose. This includes fields
            such as `extra_metrics` in the stats output, which are excluded by default.
        trace_allocations: Whether to trace allocations / eager free. This adds
            significant performance overheads and should only be used for debugging.
        execution_options: The
            :class:`~ray.data._internal.execution.interfaces.execution_options.ExecutionOptions`
            to use.
        use_ray_tqdm: Whether to enable distributed tqdm.
        enable_progress_bars: Whether to enable progress bars.
        enable_progress_bar_name_truncation: If True, the name of the progress bar
            (often the operator name) will be truncated if it exceeds
            `ProgressBar.MAX_NAME_LENGTH`. Otherwise, the full operator name is shown.
        enable_get_object_locations_for_metrics: Whether to enable
            ``get_object_locations`` for metrics.
        write_file_retry_on_errors: A list of substrings of error messages that should
            trigger a retry when writing files. This is useful for handling transient
            errors when writing to remote storage systems.
        warn_on_driver_memory_usage_bytes: If driver memory exceeds this threshold,
            Ray Data warns you. For now, this only applies to shuffle ops because most
            other ops are unlikely to use as much driver memory.
        actor_task_retry_on_errors: The application-level errors that actor task should
            retry. This follows same format as :ref:`retry_exceptions <task-retries>` in
            Ray Core. Default to `False` to not retry on any errors. Set to `True` to
            retry all errors, or set to a list of errors to retry.
        enable_op_resource_reservation: Whether to reserve resources for each operator.
        op_resource_reservation_ratio: The ratio of the total resources to reserve for
            each operator.
        max_errored_blocks: Max number of blocks that are allowed to have errors,
            unlimited if negative. This option allows application-level exceptions in
            block processing tasks. These exceptions may be caused by UDFs (e.g., due to
            corrupted data samples) or IO errors. Data in the failed blocks are dropped.
            This option can be useful to prevent a long-running job from failing due to
            a small number of bad blocks.
        log_internal_stack_trace_to_stdout: Whether to include internal Ray Data/Ray
            Core code stack frames when logging to stdout. The full stack trace is
            always written to the Ray Data log file.
        raise_original_map_exception: Whether to raise the original exception
            encountered in map UDF instead of wrapping it in a `UserCodeException`.
        print_on_execution_start: If ``True``, print execution information when
            execution starts.
        s3_try_create_dir: If ``True``, try to create directories on S3 when a write
            call is made with a S3 URI.
        wait_for_min_actors_s: The default time to wait for minimum requested
            actors to start before raising a timeout, in seconds.
        retried_io_errors: A list of substrings of error messages that should
            trigger a retry when reading or writing files. This is useful for handling
            transient errors when reading from remote storage systems.
    """

    target_max_block_size: int = DEFAULT_TARGET_MAX_BLOCK_SIZE
    target_shuffle_max_block_size: int = DEFAULT_SHUFFLE_TARGET_MAX_BLOCK_SIZE
    target_min_block_size: int = DEFAULT_TARGET_MIN_BLOCK_SIZE
    streaming_read_buffer_size: int = DEFAULT_STREAMING_READ_BUFFER_SIZE
    enable_pandas_block: bool = DEFAULT_ENABLE_PANDAS_BLOCK
    actor_prefetcher_enabled: bool = DEFAULT_ACTOR_PREFETCHER_ENABLED
    use_push_based_shuffle: bool = DEFAULT_USE_PUSH_BASED_SHUFFLE
    pipeline_push_based_shuffle_reduce_tasks: bool = True
    scheduling_strategy: SchedulingStrategyT = DEFAULT_SCHEDULING_STRATEGY
    scheduling_strategy_large_args: SchedulingStrategyT = (
        DEFAULT_SCHEDULING_STRATEGY_LARGE_ARGS
    )
    large_args_threshold: int = DEFAULT_LARGE_ARGS_THRESHOLD
    use_polars: bool = DEFAULT_USE_POLARS
    eager_free: bool = DEFAULT_EAGER_FREE
    decoding_size_estimation: bool = DEFAULT_DECODING_SIZE_ESTIMATION_ENABLED
    min_parallelism: int = DEFAULT_MIN_PARALLELISM
    read_op_min_num_blocks: int = DEFAULT_READ_OP_MIN_NUM_BLOCKS
    enable_tensor_extension_casting: bool = DEFAULT_ENABLE_TENSOR_EXTENSION_CASTING
    use_arrow_tensor_v2: bool = DEFAULT_USE_ARROW_TENSOR_V2
    enable_fallback_to_arrow_object_ext_type: Optional[bool] = None
    enable_auto_log_stats: bool = DEFAULT_AUTO_LOG_STATS
    verbose_stats_logs: bool = DEFAULT_VERBOSE_STATS_LOG
    trace_allocations: bool = DEFAULT_TRACE_ALLOCATIONS
    execution_options: "ExecutionOptions" = field(
        default_factory=_execution_options_factory
    )
    use_ray_tqdm: bool = DEFAULT_USE_RAY_TQDM
    enable_progress_bars: bool = DEFAULT_ENABLE_PROGRESS_BARS
    # By default, enable the progress bar for operator-level progress.
    # In __post_init__(), we disable operator-level progress
    # bars when running in a Ray job.
    enable_operator_progress_bars: bool = True
    enable_progress_bar_name_truncation: bool = (
        DEFAULT_ENABLE_PROGRESS_BAR_NAME_TRUNCATION
    )
    enable_get_object_locations_for_metrics: bool = (
        DEFAULT_ENABLE_GET_OBJECT_LOCATIONS_FOR_METRICS
    )
    # NOTE(review): the default is a tuple although the annotation says
    # List[str]; a tuple is required for a safe (immutable) dataclass default —
    # consider `field(default_factory=list(...))` if a list is truly intended.
    write_file_retry_on_errors: List[str] = DEFAULT_WRITE_FILE_RETRY_ON_ERRORS
    warn_on_driver_memory_usage_bytes: int = DEFAULT_WARN_ON_DRIVER_MEMORY_USAGE_BYTES
    actor_task_retry_on_errors: Union[
        bool, List[BaseException]
    ] = DEFAULT_ACTOR_TASK_RETRY_ON_ERRORS
    op_resource_reservation_enabled: bool = DEFAULT_ENABLE_OP_RESOURCE_RESERVATION
    op_resource_reservation_ratio: float = DEFAULT_OP_RESOURCE_RESERVATION_RATIO
    max_errored_blocks: int = DEFAULT_MAX_ERRORED_BLOCKS
    log_internal_stack_trace_to_stdout: bool = (
        DEFAULT_LOG_INTERNAL_STACK_TRACE_TO_STDOUT
    )
    raise_original_map_exception: bool = DEFAULT_RAY_DATA_RAISE_ORIGINAL_MAP_EXCEPTION
    print_on_execution_start: bool = True
    s3_try_create_dir: bool = DEFAULT_S3_TRY_CREATE_DIR
    wait_for_min_actors_s: int = DEFAULT_WAIT_FOR_MIN_ACTORS_S
    retried_io_errors: List[str] = field(
        default_factory=lambda: list(DEFAULT_RETRIED_IO_ERRORS)
    )

    # FIX: was annotated as plain `float` with a `None` default; `None` is a
    # valid value (meaning "use Ray core's default"), so mark it Optional.
    override_object_store_memory_limit_fraction: Optional[float] = None

    def __post_init__(self):
        # The additional ray remote args that should be added to
        # the task-pool-based data tasks.
        self._task_pool_data_task_remote_args: Dict[str, Any] = {}
        # The extra key-value style configs.
        # These configs are managed by individual components or plugins via
        # `set_config`, `get_config` and `remove_config`.
        # The reason why we use a dict instead of individual fields is to decouple
        # the DataContext from the plugin implementations, as well as to avoid
        # circular dependencies.
        self._kv_configs: Dict[str, Any] = {}
        self._max_num_blocks_in_streaming_gen_buffer = (
            DEFAULT_MAX_NUM_BLOCKS_IN_STREAMING_GEN_BUFFER
        )

        is_ray_job = os.environ.get("RAY_JOB_ID") is not None
        if is_ray_job:
            is_driver = ray.get_runtime_context().worker.mode != WORKER_MODE
            if is_driver and log_once(
                "ray_data_disable_operator_progress_bars_in_ray_jobs"
            ):
                logger.info(
                    "Disabling operator-level progress bars by default in Ray Jobs. "
                    "To enable progress bars for all operators, set "
                    "`ray.data.DataContext.get_current()"
                    ".enable_operator_progress_bars = True`."
                )
            # Disable operator-level progress bars by default in Ray jobs.
            # The global progress bar for the overall Dataset execution will
            # still be enabled, unless the user also sets
            # `ray.data.DataContext.get_current().enable_progress_bars = False`.
            self.enable_operator_progress_bars = False
        else:
            # When not running in Ray job, operator-level progress
            # bars are enabled by default.
            self.enable_operator_progress_bars = True

    def __setattr__(self, name: str, value: Any) -> None:
        # Emit a deprecation warning when the deprecated field is set to a
        # non-default value (the default assignment during __init__ compares
        # equal and therefore doesn't warn).
        if (
            name == "write_file_retry_on_errors"
            and value != DEFAULT_WRITE_FILE_RETRY_ON_ERRORS
        ):
            warnings.warn(
                "`write_file_retry_on_errors` is deprecated. Configure "
                "`retried_io_errors` instead.",
                DeprecationWarning,
            )

        super().__setattr__(name, value)

    @staticmethod
    def get_current() -> "DataContext":
        """Get or create the current DataContext.

        When a Dataset is created, the current DataContext will be sealed.
        Changes to `DataContext.get_current()` will not impact existing Datasets.

        Examples:

            .. testcode::
                import ray

                context = ray.data.DataContext.get_current()

                context.target_max_block_size = 100 * 1024 ** 2
                ds1 = ray.data.range(1)
                context.target_max_block_size = 1 * 1024 ** 2
                ds2 = ray.data.range(1)

                # ds1's target_max_block_size will be 100MB
                ds1.take_all()
                # ds2's target_max_block_size will be 1MB
                ds2.take_all()

        Developer notes: Avoid using `DataContext.get_current()` in data
        internal components, use the DataContext object captured in the
        Dataset and pass it around as arguments.
        """

        global _default_context

        with _context_lock:
            if _default_context is None:
                _default_context = DataContext()

            return _default_context

    @staticmethod
    def _set_current(context: "DataContext") -> None:
        """Set the current context in a remote worker.

        This is used internally by Dataset to propagate the driver context to
        remote workers used for parallelization.
        """
        global _default_context
        _default_context = context

    def get_config(self, key: str, default: Any = None) -> Any:
        """Get the value for a key-value style config.

        Args:
            key: The key of the config.
            default: The default value to return if the key is not found.
        Returns: The value for the key, or the default value if the key is not found.
        """
        return self._kv_configs.get(key, default)

    def set_config(self, key: str, value: Any) -> None:
        """Set the value for a key-value style config.

        Args:
            key: The key of the config.
            value: The value of the config.
        """
        self._kv_configs[key] = value

    def remove_config(self, key: str) -> None:
        """Remove a key-value style config.

        Args:
            key: The key of the config.
        """
        self._kv_configs.pop(key, None)


# Backwards compatibility alias.
DatasetContext = DataContext
.venv/lib/python3.11/site-packages/ray/data/dataset.py ADDED
The diff for this file is too large to render. See raw diff
 
.venv/lib/python3.11/site-packages/ray/data/datasource/datasink.py ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from dataclasses import dataclass
3
+ from typing import Generic, Iterable, List, Optional, TypeVar
4
+
5
+ import ray
6
+ from ray.data._internal.execution.interfaces import TaskContext
7
+ from ray.data.block import Block, BlockAccessor
8
+ from ray.util.annotations import DeveloperAPI
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
WriteReturnType = TypeVar("WriteReturnType")
"""Generic type for the return value of `Datasink.write`."""


# NOTE(review): `@dataclass` is listed above `@DeveloperAPI`, so `DeveloperAPI`
# is applied first; most Ray code orders these the other way — confirm intended.
@dataclass
@DeveloperAPI
class WriteResult(Generic[WriteReturnType]):
    """Aggregated result of the Datasink write operations."""

    # Total number of written rows.
    num_rows: int
    # Total size in bytes of written data.
    size_bytes: int
    # All returned values of `Datasink.write`.
    write_returns: List[WriteReturnType]
28
+
29
+
30
@DeveloperAPI
class Datasink(Generic[WriteReturnType]):
    """Interface for defining write-related logic.

    If you want to write data to something that isn't built-in, subclass this class
    and call :meth:`~ray.data.Dataset.write_datasink`.
    """

    def on_write_start(self) -> None:
        """Callback invoked once when a write job starts.

        Override to perform one-time setup for write tasks, such as creating a
        staging bucket in S3. The default implementation does nothing.
        """

    def write(
        self,
        blocks: Iterable[Block],
        ctx: TaskContext,
    ) -> WriteReturnType:
        """Write a stream of blocks; invoked by a single write task.

        Args:
            blocks: Generator of data blocks to persist.
            ctx: ``TaskContext`` describing the write task.

        Returns:
            The result of this write task. When the entire write operator
            finishes, every task's return value is collected into
            ``WriteResult.write_returns`` and handed to
            :meth:`on_write_complete`.
        """
        raise NotImplementedError

    def on_write_complete(self, write_result: WriteResult[WriteReturnType]):
        """Callback invoked once after all write tasks succeeded.

        Use this to "commit" the write output. It must succeed before
        ``write_datasink()`` returns to the user; if it raises,
        :meth:`on_write_failed` is called instead.

        Args:
            write_result: Aggregated result of the Write operator, containing
                the per-task write results and stats.
        """

    def on_write_failed(self, error: Exception) -> None:
        """Best-effort callback invoked when a write job fails.

        Args:
            error: The first error encountered.
        """

    def get_name(self) -> str:
        """Return a human-readable name for this datasink.

        Used as the name of the write tasks: the class name with a single
        leading underscore and a trailing ``"Datasink"`` suffix stripped.
        """
        name = type(self).__name__
        if name.startswith("_"):
            name = name[1:]
        suffix = "Datasink"
        return name[: -len(suffix)] if name.endswith(suffix) else name

    @property
    def supports_distributed_writes(self) -> bool:
        """If ``False``, only launch write tasks on the driver's node."""
        return True

    @property
    def min_rows_per_write(self) -> Optional[int]:
        """The target number of rows to pass to each :meth:`~ray.data.Datasink.write` call.

        If ``None``, Ray Data passes a system-chosen number of rows.
        """
        return None
112
+
113
+
114
@DeveloperAPI
class DummyOutputDatasink(Datasink[None]):
    """An example implementation of a writable datasource for testing.
    Examples:
        >>> import ray
        >>> from ray.data.datasource import DummyOutputDatasink
        >>> output = DummyOutputDatasink()
        >>> ray.data.range(10).write_datasink(output)
        >>> assert output.num_ok == 1
    """

    def __init__(self):
        ctx = ray.data.DataContext.get_current()

        # Setup a dummy actor to send the data. In a real datasource, write
        # tasks would send data to an external system instead of a Ray actor.
        @ray.remote(scheduling_strategy=ctx.scheduling_strategy)
        class DataSink:
            def __init__(self):
                # Running total of rows received across all write() calls.
                self.rows_written = 0
                self.enabled = True

            def write(self, block: Block) -> None:
                # Count rows via the block's accessor rather than assuming a
                # concrete block format.
                block = BlockAccessor.for_block(block)
                self.rows_written += block.num_rows()

            def get_rows_written(self):
                return self.rows_written

        self.data_sink = DataSink.remote()
        # Counters updated by the on_write_complete / on_write_failed
        # callbacks below; tests assert on these.
        self.num_ok = 0
        self.num_failed = 0
        # When False, write() raises to simulate a failing sink.
        self.enabled = True

    def write(
        self,
        blocks: Iterable[Block],
        ctx: TaskContext,
    ) -> None:
        # Fan each block out to the actor, then wait for all writes to land.
        tasks = []
        if not self.enabled:
            raise ValueError("disabled")
        for b in blocks:
            tasks.append(self.data_sink.write.remote(b))
        ray.get(tasks)

    def on_write_complete(self, write_result: WriteResult[None]):
        self.num_ok += 1

    def on_write_failed(self, error: Exception) -> None:
        self.num_failed += 1
.venv/lib/python3.11/site-packages/ray/data/datasource/datasource.py ADDED
@@ -0,0 +1,243 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Callable, Iterable, List, Optional
2
+
3
+ import numpy as np
4
+
5
+ from ray.data._internal.util import _check_pyarrow_version
6
+ from ray.data.block import Block, BlockMetadata
7
+ from ray.util.annotations import Deprecated, DeveloperAPI, PublicAPI
8
+
9
+
10
@PublicAPI
class Datasource:
    """Interface for defining a custom :class:`~ray.data.Dataset` datasource.

    To read a datasource into a dataset, use :meth:`~ray.data.read_datasource`.
    """  # noqa: E501

    @Deprecated
    def create_reader(self, **read_args) -> "Reader":
        """
        Deprecated: Implement :meth:`~ray.data.Datasource.get_read_tasks` and
        :meth:`~ray.data.Datasource.estimate_inmemory_data_size` instead.
        """
        # Wrap the legacy prepare_read() hook behind the Reader interface.
        return _LegacyDatasourceReader(self, **read_args)

    @Deprecated
    def prepare_read(self, parallelism: int, **read_args) -> List["ReadTask"]:
        """
        Deprecated: Implement :meth:`~ray.data.Datasource.get_read_tasks` and
        :meth:`~ray.data.Datasource.estimate_inmemory_data_size` instead.
        """
        raise NotImplementedError

    def get_name(self) -> str:
        """Return a human-readable name for this datasource.

        Used as the name of the read tasks: the class name with a trailing
        ``"Datasource"`` suffix stripped.
        """
        name = type(self).__name__
        suffix = "Datasource"
        return name[: -len(suffix)] if name.endswith(suffix) else name

    def estimate_inmemory_data_size(self) -> Optional[int]:
        """Return an estimate of the in-memory data size, or None if unknown.

        Note that the in-memory data size may be larger than the on-disk data size.
        """
        raise NotImplementedError

    def get_read_tasks(self, parallelism: int) -> List["ReadTask"]:
        """Execute the read and return read tasks.

        Args:
            parallelism: The requested read parallelism. The number of read
                tasks should equal to this value if possible.

        Returns:
            A list of read tasks that can be executed to read blocks from the
            datasource in parallel.
        """
        raise NotImplementedError

    @property
    def should_create_reader(self) -> bool:
        # Use the legacy Reader path unless the subclass overrides BOTH
        # new-style hooks.
        cls = type(self)
        overrides_get_read_tasks = cls.get_read_tasks is not Datasource.get_read_tasks
        overrides_size_estimate = (
            cls.estimate_inmemory_data_size
            is not Datasource.estimate_inmemory_data_size
        )
        return not (overrides_get_read_tasks and overrides_size_estimate)

    @property
    def supports_distributed_reads(self) -> bool:
        """If ``False``, only launch read tasks on the driver's node."""
        return True
81
+
82
+
83
@Deprecated
class Reader:
    """A bound read operation for a :class:`~ray.data.Datasource`.

    This is a stateful class so that reads can be prepared in multiple stages.
    For example, it is useful for :class:`Datasets <ray.data.Dataset>` to know the
    in-memory size of the read prior to executing it.
    """

    def estimate_inmemory_data_size(self) -> Optional[int]:
        """Return an estimate of the in-memory data size, or None if unknown.

        Note that the in-memory data size may be larger than the on-disk data size.
        """
        # Abstract: subclasses (e.g. _LegacyDatasourceReader) must override.
        raise NotImplementedError

    def get_read_tasks(self, parallelism: int) -> List["ReadTask"]:
        """Execute the read and return read tasks.

        Args:
            parallelism: The requested read parallelism. The number of read
                tasks should equal to this value if possible.
            read_args: Additional kwargs to pass to the datasource impl.

        Returns:
            A list of read tasks that can be executed to read blocks from the
            datasource in parallel.
        """
        # NOTE(review): the docstring documents ``read_args`` but the signature
        # takes none — the kwargs are bound at construction time in
        # implementations; confirm and fix the docstring upstream.
        raise NotImplementedError
112
+
113
+
114
class _LegacyDatasourceReader(Reader):
    # Adapter exposing a legacy `Datasource.prepare_read()` as a `Reader`,
    # binding the read kwargs at construction time.

    def __init__(self, datasource: Datasource, **read_args):
        self._datasource = datasource
        self._read_args = read_args

    def estimate_inmemory_data_size(self) -> Optional[int]:
        # Legacy datasources have no way to report a size estimate.
        return None

    def get_read_tasks(self, parallelism: int) -> List["ReadTask"]:
        # Delegate to the deprecated prepare_read() hook with the bound kwargs.
        return self._datasource.prepare_read(parallelism, **self._read_args)
124
+
125
+
126
@DeveloperAPI
class ReadTask(Callable[[], Iterable[Block]]):
    """A function used to read blocks from the :class:`~ray.data.Dataset`.

    Read tasks are generated by :meth:`~ray.data.Datasource.get_read_tasks`,
    and return a list of ``ray.data.Block`` when called. Initial metadata about the read
    operation can be retrieved via the ``metadata`` attribute prior to executing the
    read. Final metadata is returned after the read along with the blocks.

    Ray will execute read tasks in remote functions to parallelize execution.
    Note that the number of blocks returned can vary at runtime. For example,
    if a task is reading a single large file it can return multiple blocks to
    avoid running out of memory during the read.

    The initial metadata should reflect all the blocks returned by the read,
    e.g., if the metadata says ``num_rows=1000``, the read can return a single
    block of 1000 rows, or multiple blocks with 1000 rows altogether.

    The final metadata (returned with the actual block) reflects the exact
    contents of the block itself.
    """

    def __init__(self, read_fn: Callable[[], Iterable[Block]], metadata: BlockMetadata):
        self._metadata = metadata
        self._read_fn = read_fn

    @property
    def metadata(self) -> BlockMetadata:
        """Initial metadata describing the blocks this task will produce."""
        return self._metadata

    @property
    def read_fn(self) -> Callable[[], Iterable[Block]]:
        """The underlying zero-argument function that performs the read."""
        return self._read_fn

    def __call__(self) -> Iterable[Block]:
        result = self._read_fn()
        if not hasattr(result, "__iter__"):
            # FIX: the DeprecationWarning was previously instantiated but never
            # emitted, so users got no signal before the subsequent TypeError
            # from `yield from`. Actually emit it now.
            import warnings

            warnings.warn(
                "Read function must return Iterable[Block], got {}. "
                "Probably you need to return `[block]` instead of "
                "`block`.".format(result),
                DeprecationWarning,
            )
        yield from result
169
+
170
+
171
@DeveloperAPI
class RandomIntRowDatasource(Datasource):
    """Example datasource that produces rows of random ``int64`` columns.

    Examples:
        >>> import ray
        >>> from ray.data.datasource import RandomIntRowDatasource
        >>> source = RandomIntRowDatasource() # doctest: +SKIP
        >>> ray.data.read_datasource( # doctest: +SKIP
        ...     source, n=10, num_columns=2).take()
        {'c_0': 1717767200176864416, 'c_1': 999657309586757214}
        {'c_0': 4983608804013926748, 'c_1': 1160140066899844087}
    """

    def __init__(self, n: int, num_columns: int):
        # Total number of rows and the column count to generate.
        self._n = n
        self._num_columns = num_columns

    def estimate_inmemory_data_size(self) -> Optional[int]:
        # Every int64 cell occupies 8 bytes.
        return self._n * self._num_columns * 8

    def get_read_tasks(
        self,
        parallelism: int,
    ) -> List[ReadTask]:
        _check_pyarrow_version()
        import pyarrow

        total_rows = self._n
        columns = self._num_columns
        # At least one row per block, even if parallelism exceeds n.
        rows_per_block = max(1, total_rows // parallelism)

        def build_block(count: int, num_columns: int) -> Block:
            # One random int64 array per column.
            return pyarrow.Table.from_arrays(
                np.random.randint(
                    np.iinfo(np.int64).max, size=(num_columns, count), dtype=np.int64
                ),
                names=[f"c_{i}" for i in range(num_columns)],
            )

        # Derive the schema from a tiny dummy table with the same columns.
        schema = pyarrow.Table.from_pydict(
            {f"c_{i}": [0] for i in range(columns)}
        ).schema

        tasks: List[ReadTask] = []
        for start in range(0, total_rows, rows_per_block):
            count = min(rows_per_block, total_rows - start)
            metadata = BlockMetadata(
                num_rows=count,
                size_bytes=8 * count * columns,
                schema=schema,
                input_files=None,
                exec_stats=None,
            )
            # Bind `count`/`num_columns` via defaults so each lambda keeps
            # its own values rather than the loop's final ones.
            tasks.append(
                ReadTask(
                    lambda count=count, num_columns=columns: [
                        build_block(count, num_columns)
                    ],
                    metadata,
                )
            )

        return tasks

    def get_name(self) -> str:
        """Return a human-readable name for this datasource.

        Used as the name of the generated read tasks.
        Note: overrides the base `Datasource` method.
        """
        return "RandomInt"
.venv/lib/python3.11/site-packages/ray/data/datasource/file_based_datasource.py ADDED
@@ -0,0 +1,572 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import io
2
+ import logging
3
+ from dataclasses import dataclass
4
+ from typing import (
5
+ TYPE_CHECKING,
6
+ Any,
7
+ Callable,
8
+ Dict,
9
+ Iterable,
10
+ Iterator,
11
+ List,
12
+ Literal,
13
+ Optional,
14
+ Union,
15
+ )
16
+
17
+ import numpy as np
18
+
19
+ import ray
20
+ from ray.data._internal.util import (
21
+ _check_pyarrow_version,
22
+ _is_local_scheme,
23
+ call_with_retry,
24
+ make_async_gen,
25
+ )
26
+ from ray.data.block import Block, BlockAccessor
27
+ from ray.data.context import DataContext
28
+ from ray.data.datasource.datasource import Datasource, ReadTask
29
+ from ray.data.datasource.file_meta_provider import (
30
+ BaseFileMetadataProvider,
31
+ DefaultFileMetadataProvider,
32
+ )
33
+ from ray.data.datasource.partitioning import (
34
+ Partitioning,
35
+ PathPartitionFilter,
36
+ PathPartitionParser,
37
+ )
38
+ from ray.data.datasource.path_util import (
39
+ _has_file_extension,
40
+ _resolve_paths_and_filesystem,
41
+ )
42
+ from ray.util.annotations import DeveloperAPI
43
+
44
+ if TYPE_CHECKING:
45
+ import pandas as pd
46
+ import pyarrow
47
+
48
+
49
+ logger = logging.getLogger(__name__)
50
+
51
+
52
+ # We should parallelize file size fetch operations beyond this threshold.
53
+ FILE_SIZE_FETCH_PARALLELIZATION_THRESHOLD = 16
54
+
55
+ # 16 file size fetches from S3 takes ~1.5 seconds with Arrow's S3FileSystem.
56
+ PATHS_PER_FILE_SIZE_FETCH_TASK = 16
57
+
58
+ # The max retry backoff in seconds for opening file.
59
+ OPEN_FILE_RETRY_MAX_BACKOFF_SECONDS = 32
60
+
61
+ # The max number of attempts for opening file.
62
+ OPEN_FILE_MAX_ATTEMPTS = 10
63
+
64
+
65
@DeveloperAPI
@dataclass
class FileShuffleConfig:
    """Configuration for file shuffling.

    Controls how files are shuffled while reading file-based datasets.

    .. note::
        A seed makes the *file order* deterministic, but row order may still
        vary because read tasks execute in parallel and complete in any
        order. If you need to preserve the order of rows, set
        `DataContext.get_current().execution_options.preserve_order`.

    Args:
        seed: An optional integer seed for the file shuffler. If provided, Ray Data
            shuffles files deterministically based on this seed.

    Example:
        >>> import ray
        >>> from ray.data import FileShuffleConfig
        >>> shuffle = FileShuffleConfig(seed=42)
        >>> ds = ray.data.read_images("s3://anonymous@ray-example-data/batoidea", shuffle=shuffle)
    """  # noqa: E501

    # None means a fresh (non-deterministic) shuffle per run.
    seed: Optional[int] = None

    def __post_init__(self):
        """Reject any seed that is neither ``None`` nor an integer."""
        seed_is_invalid = self.seed is not None and not isinstance(self.seed, int)
        if seed_is_invalid:
            raise ValueError("Seed must be an integer or None.")
+ raise ValueError("Seed must be an integer or None.")
96
+
97
+
98
@DeveloperAPI
class FileBasedDatasource(Datasource):
    """File-based datasource for reading files.

    Don't use this class directly. Instead, subclass it and implement `_read_stream()`.
    """

    # If `_WRITE_FILE_PER_ROW` is `True`, this datasource calls `_write_row` and writes
    # each row to a file. Otherwise, this datasource calls `_write_block` and writes
    # each block to a file.
    _WRITE_FILE_PER_ROW = False
    # Default file extensions used by subclasses to filter inputs; None = no filter.
    _FILE_EXTENSIONS: Optional[Union[str, List[str]]] = None
    # Number of threads for concurrent reading within each read task.
    # If zero or negative, reading will be performed in the main thread.
    _NUM_THREADS_PER_TASK = 0

    def __init__(
        self,
        paths: Union[str, List[str]],
        *,
        filesystem: Optional["pyarrow.fs.FileSystem"] = None,
        schema: Optional[Union[type, "pyarrow.lib.Schema"]] = None,
        open_stream_args: Optional[Dict[str, Any]] = None,
        meta_provider: BaseFileMetadataProvider = DefaultFileMetadataProvider(),
        partition_filter: PathPartitionFilter = None,
        partitioning: Partitioning = None,
        ignore_missing_paths: bool = False,
        shuffle: Optional[Union[Literal["files"], FileShuffleConfig]] = None,
        include_paths: bool = False,
        file_extensions: Optional[List[str]] = None,
    ):
        """Resolve, expand, filter, and optionally shuffle the input paths.

        Raises:
            ValueError: If local paths are used with Ray Client, if no paths
                remain after expansion/filtering, or if ``shuffle`` is invalid.
        """
        _check_pyarrow_version()

        # Local-scheme paths can only be read on the driver's node.
        self._supports_distributed_reads = not _is_local_scheme(paths)
        if not self._supports_distributed_reads and ray.util.client.ray.is_connected():
            raise ValueError(
                "Because you're using Ray Client, read tasks scheduled on the Ray "
                "cluster can't access your local files. To fix this issue, store "
                "files in cloud storage or a distributed filesystem like NFS."
            )

        self._schema = schema
        self._open_stream_args = open_stream_args
        self._meta_provider = meta_provider
        self._partition_filter = partition_filter
        self._partitioning = partitioning
        self._ignore_missing_paths = ignore_missing_paths
        self._include_paths = include_paths
        paths, self._filesystem = _resolve_paths_and_filesystem(paths, filesystem)
        # Expand directories into concrete (path, size) pairs, then split the
        # pairs into two parallel lists.
        paths, file_sizes = map(
            list,
            zip(
                *meta_provider.expand_paths(
                    paths,
                    self._filesystem,
                    partitioning,
                    ignore_missing_paths=ignore_missing_paths,
                )
            ),
        )

        if ignore_missing_paths and len(paths) == 0:
            raise ValueError(
                "None of the provided paths exist. "
                "The 'ignore_missing_paths' field is set to True."
            )

        if self._partition_filter is not None:
            # Use partition filter to skip files which are not needed.
            path_to_size = dict(zip(paths, file_sizes))
            paths = self._partition_filter(paths)
            file_sizes = [path_to_size[p] for p in paths]
            if len(paths) == 0:
                raise ValueError(
                    "No input files found to read. Please double check that "
                    "'partition_filter' field is set properly."
                )

        if file_extensions is not None:
            # Keep the size list aligned with the surviving paths.
            path_to_size = dict(zip(paths, file_sizes))
            paths = [p for p in paths if _has_file_extension(p, file_extensions)]
            file_sizes = [path_to_size[p] for p in paths]
            if len(paths) == 0:
                raise ValueError(
                    "No input files found to read with the following file extensions: "
                    f"{file_extensions}. Please double check that "
                    "'file_extensions' field is set properly."
                )

        _validate_shuffle_arg(shuffle)
        self._file_metadata_shuffler = None
        if shuffle == "files":
            self._file_metadata_shuffler = np.random.default_rng()
        elif isinstance(shuffle, FileShuffleConfig):
            # Create a NumPy random generator with a fixed seed if provided
            self._file_metadata_shuffler = np.random.default_rng(shuffle.seed)

        # Read tasks serialize `FileBasedDatasource` instances, and the list of paths
        # can be large. To avoid slow serialization speeds, we store a reference to
        # the paths rather than the paths themselves.
        self._paths_ref = ray.put(paths)
        self._file_sizes_ref = ray.put(file_sizes)

    def _paths(self) -> List[str]:
        """Fetch the expanded file paths from the object store."""
        return ray.get(self._paths_ref)

    def _file_sizes(self) -> List[float]:
        """Fetch the per-file sizes (entries may be None) from the object store."""
        return ray.get(self._file_sizes_ref)

    def estimate_inmemory_data_size(self) -> Optional[int]:
        """Sum the known file sizes; unknown (None) entries are skipped.

        NOTE(review): this is the on-disk byte total, used here as a proxy for
        the in-memory size — the two can differ for compressed formats.
        """
        total_size = 0
        for sz in self._file_sizes():
            if sz is not None:
                total_size += sz
        return total_size

    def get_read_tasks(self, parallelism: int) -> List[ReadTask]:
        """Split the file list into at most ``parallelism`` read tasks.

        Each task opens its files (with retries), streams blocks via
        `_read_stream()`, and optionally attaches partition values and a
        "path" column.
        """
        import numpy as np

        ctx = DataContext.get_current()
        open_stream_args = self._open_stream_args
        partitioning = self._partitioning

        paths = self._paths()
        file_sizes = self._file_sizes()

        if self._file_metadata_shuffler is not None:
            # Shuffle (path, size) pairs together so they stay aligned.
            files_metadata = list(zip(paths, file_sizes))
            shuffled_files_metadata = [
                files_metadata[i]
                for i in self._file_metadata_shuffler.permutation(len(files_metadata))
            ]
            paths, file_sizes = list(map(list, zip(*shuffled_files_metadata)))

        read_stream = self._read_stream
        filesystem = _wrap_s3_serialization_workaround(self._filesystem)

        if open_stream_args is None:
            open_stream_args = {}

        open_input_source = self._open_input_source

        def read_files(
            read_paths: Iterable[str],
        ) -> Iterable[Block]:
            # Runs inside the remote read task; yields blocks one file at a time.
            nonlocal filesystem, open_stream_args, partitioning

            # Propagate the driver's DataContext into the worker process.
            DataContext._set_current(ctx)
            fs = _unwrap_s3_serialization_workaround(filesystem)
            for read_path in read_paths:
                partitions: Dict[str, str] = {}
                if partitioning is not None:
                    parse = PathPartitionParser(partitioning)
                    partitions = parse(read_path)

                with _open_file_with_retry(
                    read_path,
                    # Bind read_path as a default to avoid late-binding issues.
                    lambda read_path=read_path: open_input_source(
                        fs, read_path, **open_stream_args
                    ),
                ) as f:
                    for block in read_stream(f, read_path):
                        if partitions:
                            block = _add_partitions(block, partitions)
                        if self._include_paths:
                            block_accessor = BlockAccessor.for_block(block)
                            block = block_accessor.append_column(
                                "path", [read_path] * block_accessor.num_rows()
                            )
                        yield block

        def create_read_task_fn(read_paths, num_threads):
            # Factory so each task captures its own `read_paths` slice.
            def read_task_fn():
                nonlocal num_threads, read_paths

                # TODO: We should refactor the code so that we can get the results in
                # order even when using multiple threads.
                if ctx.execution_options.preserve_order:
                    num_threads = 0

                if num_threads > 0:
                    # Don't spawn more threads than there are files to read.
                    if len(read_paths) < num_threads:
                        num_threads = len(read_paths)

                    logger.debug(
                        f"Reading {len(read_paths)} files with {num_threads} threads."
                    )

                    yield from make_async_gen(
                        iter(read_paths),
                        read_files,
                        num_workers=num_threads,
                    )
                else:
                    logger.debug(f"Reading {len(read_paths)} files.")
                    yield from read_files(read_paths)

            return read_task_fn

        # fix https://github.com/ray-project/ray/issues/24296
        parallelism = min(parallelism, len(paths))

        read_tasks = []
        split_paths = np.array_split(paths, parallelism)
        split_file_sizes = np.array_split(file_sizes, parallelism)

        for read_paths, file_sizes in zip(split_paths, split_file_sizes):
            if len(read_paths) <= 0:
                continue

            meta = self._meta_provider(
                read_paths,
                self._schema,
                rows_per_file=self._rows_per_file(),
                file_sizes=file_sizes,
            )

            read_task_fn = create_read_task_fn(read_paths, self._NUM_THREADS_PER_TASK)

            read_task = ReadTask(read_task_fn, meta)

            read_tasks.append(read_task)

        return read_tasks

    def _open_input_source(
        self,
        filesystem: "pyarrow.fs.FileSystem",
        path: str,
        **open_args,
    ) -> "pyarrow.NativeFile":
        """Opens a source path for reading and returns the associated Arrow NativeFile.

        The default implementation opens the source path as a sequential input stream,
        using ctx.streaming_read_buffer_size as the buffer size if none is given by the
        caller.

        Implementations that do not support streaming reads (e.g. that require random
        access) should override this method.
        """
        import pyarrow as pa
        from pyarrow.fs import HadoopFileSystem

        ctx = DataContext.get_current()

        compression = open_args.get("compression", None)
        if compression is None:
            try:
                # If no compression manually given, try to detect
                # compression codec from path.
                compression = pa.Codec.detect(path).name
            except (ValueError, TypeError):
                # Arrow's compression inference on the file path
                # doesn't work for Snappy, so we double-check ourselves.
                import pathlib

                suffix = pathlib.Path(path).suffix
                if suffix and suffix[1:] == "snappy":
                    compression = "snappy"
                else:
                    compression = None

        buffer_size = open_args.pop("buffer_size", None)
        if buffer_size is None:
            buffer_size = ctx.streaming_read_buffer_size

        if compression == "snappy":
            # Arrow doesn't support streaming Snappy decompression since the canonical
            # C++ Snappy library doesn't natively support streaming decompression. We
            # works around this by manually decompressing the file with python-snappy.
            open_args["compression"] = None
        else:
            open_args["compression"] = compression

        file = call_with_retry(
            lambda: filesystem.open_input_stream(
                path, buffer_size=buffer_size, **open_args
            ),
            description=f"open file {path}",
            match=ctx.retried_io_errors,
        )

        if compression == "snappy":
            import snappy

            # Decompress the whole stream into memory, then hand Arrow a
            # seekable in-memory file.
            stream = io.BytesIO()
            if isinstance(filesystem, HadoopFileSystem):
                snappy.hadoop_snappy.stream_decompress(src=file, dst=stream)
            else:
                snappy.stream_decompress(src=file, dst=stream)
            stream.seek(0)

            file = pa.PythonFile(stream, mode="r")

        return file

    def _rows_per_file(self):
        """Returns the number of rows per file, or None if unknown."""
        return None

    def _read_stream(self, f: "pyarrow.NativeFile", path: str) -> Iterator[Block]:
        """Streaming read a single file.

        This method should be implemented by subclasses.
        """
        raise NotImplementedError(
            "Subclasses of FileBasedDatasource must implement _read_stream()."
        )

    @property
    def supports_distributed_reads(self) -> bool:
        # False when the paths use a local scheme (see __init__).
        return self._supports_distributed_reads
+ return self._supports_distributed_reads
410
+
411
+
412
def _add_partitions(
    data: Union["pyarrow.Table", "pd.DataFrame"], partitions: Dict[str, Any]
) -> Union["pyarrow.Table", "pd.DataFrame"]:
    """Attach partition key/value columns to a block, dispatching on block type."""
    import pandas as pd
    import pyarrow as pa

    assert isinstance(data, (pa.Table, pd.DataFrame))
    if isinstance(data, pd.DataFrame):
        return _add_partitions_to_dataframe(data, partitions)
    return _add_partitions_to_table(data, partitions)
+ return _add_partitions_to_dataframe(data, partitions)
423
+
424
+
425
def _add_partitions_to_table(
    table: "pyarrow.Table", partitions: Dict[str, Any]
) -> "pyarrow.Table":
    """Add partition values as columns of a pyarrow Table.

    Existing columns with the same name are overwritten after verifying that
    the in-data values match the partition value.
    """
    import pyarrow as pa
    import pyarrow.compute as pc

    existing_names = set(table.column_names)
    for name, value in partitions.items():
        new_column = pa.array([value] * len(table))
        if name not in existing_names:
            table = table.append_column(name, new_column)
            continue

        # TODO: Handle cast error.
        target_type = table.schema.field(name).type
        new_column = new_column.cast(target_type)

        matches = pc.all(pc.equal(new_column, table[name])).as_py()
        if not matches:
            raise ValueError(
                f"Partition column {name} exists in table data, but partition "
                f"value '{value}' is different from in-data values: "
                f"{table[name].unique().to_pylist()}."
            )

        index = table.schema.get_field_index(name)
        table = table.set_column(index, name, new_column)

    return table
+
456
+
457
+ def _add_partitions_to_dataframe(
458
+ df: "pd.DataFrame", partitions: Dict[str, Any]
459
+ ) -> "pd.DataFrame":
460
+ import pandas as pd
461
+
462
+ for field, value in partitions.items():
463
+ column = pd.Series(data=[value] * len(df), name=field)
464
+
465
+ if field in df:
466
+ column = column.astype(df[field].dtype)
467
+ mask = df[field].notna()
468
+ if not df[field][mask].equals(column[mask]):
469
+ raise ValueError(
470
+ f"Partition column {field} exists in table data, but partition "
471
+ f"value '{value}' is different from in-data values: "
472
+ f"{list(df[field].unique())}."
473
+ )
474
+
475
+ df[field] = column
476
+
477
+ return df
478
+
479
+
480
def _wrap_s3_serialization_workaround(filesystem: "pyarrow.fs.FileSystem"):
    # pa.fs.S3FileSystem assumes pa.fs is already imported before
    # deserialization (see #17085), so S3 filesystems are wrapped in a helper
    # that re-imports pyarrow.fs when reconstructed.
    import pyarrow as pa
    import pyarrow.fs

    if not isinstance(filesystem, pa.fs.S3FileSystem):
        return filesystem
    return _S3FileSystemWrapper(filesystem)
+ return filesystem
489
+
490
+
491
+ def _unwrap_s3_serialization_workaround(
492
+ filesystem: Union["pyarrow.fs.FileSystem", "_S3FileSystemWrapper"]
493
+ ):
494
+ if isinstance(filesystem, _S3FileSystemWrapper):
495
+ return filesystem.unwrap()
496
+ else:
497
+ return filesystem
498
+
499
+
500
+ class _S3FileSystemWrapper:
501
+ def __init__(self, fs: "pyarrow.fs.S3FileSystem"):
502
+ self._fs = fs
503
+
504
+ def unwrap(self):
505
+ return self._fs
506
+
507
+ @classmethod
508
+ def _reconstruct(cls, fs_reconstruct, fs_args):
509
+ # Implicitly trigger S3 subsystem initialization by importing
510
+ # pyarrow.fs.
511
+ import pyarrow.fs # noqa: F401
512
+
513
+ return cls(fs_reconstruct(*fs_args))
514
+
515
+ def __reduce__(self):
516
+ return _S3FileSystemWrapper._reconstruct, self._fs.__reduce__()
517
+
518
+
519
+ def _wrap_arrow_serialization_workaround(kwargs: dict) -> dict:
520
+ if "filesystem" in kwargs:
521
+ kwargs["filesystem"] = _wrap_s3_serialization_workaround(kwargs["filesystem"])
522
+
523
+ return kwargs
524
+
525
+
526
+ def _unwrap_arrow_serialization_workaround(kwargs: dict) -> dict:
527
+ if isinstance(kwargs.get("filesystem"), _S3FileSystemWrapper):
528
+ kwargs["filesystem"] = kwargs["filesystem"].unwrap()
529
+ return kwargs
530
+
531
+
532
+ def _resolve_kwargs(
533
+ kwargs_fn: Callable[[], Dict[str, Any]], **kwargs
534
+ ) -> Dict[str, Any]:
535
+ if kwargs_fn:
536
+ kwarg_overrides = kwargs_fn()
537
+ kwargs.update(kwarg_overrides)
538
+ return kwargs
539
+
540
+
541
+ def _open_file_with_retry(
542
+ file_path: str,
543
+ open_file: Callable[[], "pyarrow.NativeFile"],
544
+ ) -> "pyarrow.NativeFile":
545
+ """Open file with an exponential backoff retry strategy.
546
+
547
+ This is to avoid transient task failure with remote storage (such as S3),
548
+ when the remote storage throttles the requests.
549
+ """
550
+ if OPEN_FILE_MAX_ATTEMPTS < 1:
551
+ raise ValueError(
552
+ "OPEN_FILE_MAX_ATTEMPTS cannot be negative or 0. Get: "
553
+ f"{OPEN_FILE_MAX_ATTEMPTS}"
554
+ )
555
+
556
+ return call_with_retry(
557
+ open_file,
558
+ description=f"open file {file_path}",
559
+ match=DataContext.get_current().retried_io_errors,
560
+ max_attempts=OPEN_FILE_MAX_ATTEMPTS,
561
+ max_backoff_s=OPEN_FILE_RETRY_MAX_BACKOFF_SECONDS,
562
+ )
563
+
564
+
565
+ def _validate_shuffle_arg(shuffle: Optional[str]) -> None:
566
+ if not (
567
+ shuffle is None or shuffle == "files" or isinstance(shuffle, FileShuffleConfig)
568
+ ):
569
+ raise ValueError(
570
+ f"Invalid value for 'shuffle': {shuffle}. "
571
+ "Valid values are None, 'files', `FileShuffleConfig`."
572
+ )
.venv/lib/python3.11/site-packages/ray/data/datasource/file_meta_provider.py ADDED
@@ -0,0 +1,484 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import itertools
2
+ import logging
3
+ import os
4
+ import pathlib
5
+ import re
6
+ from typing import (
7
+ TYPE_CHECKING,
8
+ Callable,
9
+ Iterator,
10
+ List,
11
+ Optional,
12
+ Tuple,
13
+ TypeVar,
14
+ Union,
15
+ )
16
+
17
+ import numpy as np
18
+
19
+ import ray
20
+ from ray.data._internal.progress_bar import ProgressBar
21
+ from ray.data._internal.remote_fn import cached_remote_fn
22
+ from ray.data._internal.util import call_with_retry
23
+ from ray.data.block import BlockMetadata
24
+ from ray.data.datasource.partitioning import Partitioning
25
+ from ray.util.annotations import DeveloperAPI
26
+
27
+ if TYPE_CHECKING:
28
+ import pyarrow
29
+
30
+
31
+ logger = logging.getLogger(__name__)
32
+
33
+
34
@DeveloperAPI
class FileMetadataProvider:
    """Abstract callable that provides metadata for the files of a single dataset block.

    Current subclasses:
        - :class:`BaseFileMetadataProvider`
        - :class:`ParquetMetadataProvider`
    """

    def _get_block_metadata(
        self,
        paths: List[str],
        schema: Optional[Union[type, "pyarrow.lib.Schema"]],
        **kwargs,
    ) -> BlockMetadata:
        """Resolves and returns block metadata for files in the given paths.

        All file paths provided should belong to a single dataset block.

        Args:
            paths: The file paths for a single dataset block.
            schema: The user-provided or inferred schema for the given paths,
                if any.

        Returns:
            BlockMetadata aggregated across the given paths.
        """
        raise NotImplementedError

    def __call__(
        self,
        paths: List[str],
        schema: Optional[Union[type, "pyarrow.lib.Schema"]],
        **kwargs,
    ) -> BlockMetadata:
        # Providers are used as callables; delegate to the subclass hook.
        return self._get_block_metadata(paths, schema, **kwargs)
70
+
71
+
72
@DeveloperAPI
class BaseFileMetadataProvider(FileMetadataProvider):
    """Abstract callable that provides metadata for
    :class:`~ray.data.datasource.file_based_datasource.FileBasedDatasource`
    implementations that reuse the base :meth:`~ray.data.Datasource.prepare_read`
    method.

    Also supports file and file size discovery in input directory paths.

    Current subclasses:
        - :class:`DefaultFileMetadataProvider`
    """

    def _get_block_metadata(
        self,
        paths: List[str],
        schema: Optional[Union[type, "pyarrow.lib.Schema"]],
        *,
        rows_per_file: Optional[int],
        file_sizes: List[Optional[int]],
    ) -> BlockMetadata:
        """Resolves and returns block metadata for files of a single dataset block.

        Args:
            paths: The file paths for a single dataset block. These
                paths will always be a subset of those previously returned from
                :meth:`.expand_paths`.
            schema: The user-provided or inferred schema for the given file
                paths, if any.
            rows_per_file: The fixed number of rows per input file, or None.
            file_sizes: Optional file size per input file previously returned
                from :meth:`.expand_paths`, where `file_sizes[i]` holds the size of
                the file at `paths[i]`.

        Returns:
            BlockMetadata aggregated across the given file paths.
        """
        raise NotImplementedError

    def expand_paths(
        self,
        paths: List[str],
        filesystem: Optional["pyarrow.fs.FileSystem"],
        partitioning: Optional[Partitioning] = None,
        ignore_missing_paths: bool = False,
    ) -> Iterator[Tuple[str, int]]:
        """Expands all paths into concrete file paths by walking directories.

        Also returns a sidecar of file sizes.

        The input paths must be normalized for compatibility with the input
        filesystem prior to invocation.

        Args:
            paths: A list of file and/or directory paths compatible with the
                given filesystem.
            filesystem: The filesystem implementation that should be used for
                expanding all paths and reading their files.
            partitioning: Optional partitioning scheme used during expansion.
            ignore_missing_paths: If True, ignores any file paths in ``paths`` that
                are not found. Defaults to False.

        Returns:
            An iterator of `(file_path, file_size)` pairs. None may be returned for the
            file size if it is either unknown or will be fetched later by
            `_get_block_metadata()`, but the length of
            both lists must be equal.
        """
        raise NotImplementedError
140
+
141
+
142
@DeveloperAPI
class DefaultFileMetadataProvider(BaseFileMetadataProvider):
    """Default metadata provider for
    :class:`~ray.data.datasource.file_based_datasource.FileBasedDatasource`
    implementations that reuse the base `prepare_read` method.

    Calculates block size in bytes as the sum of its constituent file sizes,
    and assumes a fixed number of rows per file.
    """

    def _get_block_metadata(
        self,
        paths: List[str],
        schema: Optional[Union[type, "pyarrow.lib.Schema"]],
        *,
        rows_per_file: Optional[int],
        file_sizes: List[Optional[int]],
    ) -> BlockMetadata:
        # Row count is only known when every file has a fixed row count.
        num_rows = None if rows_per_file is None else len(paths) * rows_per_file
        # Any unknown file size makes the aggregate size unknown too.
        if None in file_sizes:
            size_bytes = None
        else:
            size_bytes = int(sum(file_sizes))
        # Exec stats are filled in later, after the read actually runs.
        return BlockMetadata(
            num_rows=num_rows,
            size_bytes=size_bytes,
            schema=schema,
            input_files=paths,
            exec_stats=None,
        )

    def expand_paths(
        self,
        paths: List[str],
        filesystem: "pyarrow.fs.FileSystem",
        partitioning: Optional[Partitioning] = None,
        ignore_missing_paths: bool = False,
    ) -> Iterator[Tuple[str, int]]:
        # Delegate directory walking and size collection to the shared helper.
        yield from _expand_paths(paths, filesystem, partitioning, ignore_missing_paths)
180
+
181
+
182
@DeveloperAPI
class FastFileMetadataProvider(DefaultFileMetadataProvider):
    """Fast Metadata provider for
    :class:`~ray.data.datasource.file_based_datasource.FileBasedDatasource`
    implementations.

    Offers improved performance vs.
    :class:`DefaultFileMetadataProvider`
    by skipping directory path expansion and file size collection.
    While this performance improvement may be negligible for local filesystems,
    it can be substantial for cloud storage service providers.

    This should only be used when all input paths exist and are known to be files.
    """

    def expand_paths(
        self,
        paths: List[str],
        filesystem: "pyarrow.fs.FileSystem",
        partitioning: Optional[Partitioning] = None,
        ignore_missing_paths: bool = False,
    ) -> Iterator[Tuple[str, int]]:
        # This provider never touches storage, so it cannot tell which paths
        # are missing — refuse the combination outright.
        if ignore_missing_paths:
            raise ValueError(
                "`ignore_missing_paths` cannot be set when used with "
                "`FastFileMetadataProvider`. All paths must exist when "
                "using `FastFileMetadataProvider`."
            )

        logger.warning(
            f"Skipping expansion of {len(paths)} path(s). If your paths contain "
            f"directories or if file size collection is required, try rerunning this "
            f"read with `meta_provider=DefaultFileMetadataProvider()`."
        )

        # Emit each path as-is with an unknown (None) size.
        for path in paths:
            yield path, None
218
+
219
+
220
+ def _handle_read_os_error(error: OSError, paths: Union[str, List[str]]) -> str:
221
+ # NOTE: this is not comprehensive yet, and should be extended as more errors arise.
222
+ # NOTE: The latter patterns are raised in Arrow 10+, while the former is raised in
223
+ # Arrow < 10.
224
+ aws_error_pattern = (
225
+ r"^(?:(.*)AWS Error \[code \d+\]: No response body\.(.*))|"
226
+ r"(?:(.*)AWS Error UNKNOWN \(HTTP status 400\) during HeadObject operation: "
227
+ r"No response body\.(.*))|"
228
+ r"(?:(.*)AWS Error ACCESS_DENIED during HeadObject operation: No response "
229
+ r"body\.(.*))$"
230
+ )
231
+ if re.match(aws_error_pattern, str(error)):
232
+ # Specially handle AWS error when reading files, to give a clearer error
233
+ # message to avoid confusing users. The real issue is most likely that the AWS
234
+ # S3 file credentials have not been properly configured yet.
235
+ if isinstance(paths, str):
236
+ # Quote to highlight single file path in error message for better
237
+ # readability. List of file paths will be shown up as ['foo', 'boo'],
238
+ # so only quote single file path here.
239
+ paths = f'"{paths}"'
240
+ raise OSError(
241
+ (
242
+ f"Failing to read AWS S3 file(s): {paths}. "
243
+ "Please check that file exists and has properly configured access. "
244
+ "You can also run AWS CLI command to get more detailed error message "
245
+ "(e.g., aws s3 ls <file-name>). "
246
+ "See https://awscli.amazonaws.com/v2/documentation/api/latest/reference/s3/index.html " # noqa
247
+ "and https://docs.ray.io/en/latest/data/creating-datasets.html#reading-from-remote-storage " # noqa
248
+ "for more information."
249
+ )
250
+ )
251
+ else:
252
+ raise error
253
+
254
+
255
def _expand_paths(
    paths: List[str],
    filesystem: "pyarrow.fs.FileSystem",
    partitioning: Optional[Partitioning],
    ignore_missing_paths: bool = False,
) -> Iterator[Tuple[str, int]]:
    """Get the file sizes for all provided file paths.

    Args:
        paths: File or directory paths to resolve.
        filesystem: The pyarrow filesystem used to look up the paths.
        partitioning: Optional partitioning scheme; its ``base_dir`` is used
            to detect the common-prefix fast path.
        ignore_missing_paths: If True, missing paths are silently skipped by
            the downstream helpers instead of raising.

    Yields:
        (file path, file size in bytes) tuples; size may be None when unknown.
    """
    from pyarrow.fs import LocalFileSystem

    from ray.data.datasource.file_based_datasource import (
        FILE_SIZE_FETCH_PARALLELIZATION_THRESHOLD,
    )
    from ray.data.datasource.path_util import _unwrap_protocol

    # We break down our processing paths into a few key cases:
    # 1. If len(paths) < threshold, fetch the file info for the individual files/paths
    # serially.
    # 2. If all paths are contained under the same parent directory (or base directory,
    # if using partitioning), fetch all file infos at this prefix and filter to the
    # provided paths on the client; this should be a single file info request.
    # 3. If more than threshold requests required, parallelize them via Ray tasks.
    # 1. Small # of paths case.
    if (
        len(paths) < FILE_SIZE_FETCH_PARALLELIZATION_THRESHOLD
        # Local file systems are very fast to hit.
        or isinstance(filesystem, LocalFileSystem)
    ):
        yield from _get_file_infos_serial(paths, filesystem, ignore_missing_paths)
    else:
        # 2. Common path prefix case.
        # Get longest common path of all paths.
        common_path = os.path.commonpath(paths)
        # If parent directory (or base directory, if using partitioning) is common to
        # all paths, fetch all file infos at that prefix and filter the response to the
        # provided paths.
        if (
            partitioning is not None
            and common_path == _unwrap_protocol(partitioning.base_dir)
        ) or all(str(pathlib.Path(path).parent) == common_path for path in paths):
            yield from _get_file_infos_common_path_prefix(
                paths, common_path, filesystem, ignore_missing_paths
            )
        # 3. Parallelization case.
        else:
            # Parallelize requests via Ray tasks.
            yield from _get_file_infos_parallel(paths, filesystem, ignore_missing_paths)
301
+
302
+
303
def _get_file_infos_serial(
    paths: List[str],
    filesystem: "pyarrow.fs.FileSystem",
    ignore_missing_paths: bool = False,
) -> Iterator[Tuple[str, int]]:
    """Resolve each path one at a time, yielding (path, size) pairs."""
    for single_path in paths:
        for info in _get_file_infos(single_path, filesystem, ignore_missing_paths):
            yield info
310
+
311
+
312
def _get_file_infos_common_path_prefix(
    paths: List[str],
    common_path: str,
    filesystem: "pyarrow.fs.FileSystem",
    ignore_missing_paths: bool = False,
) -> Iterator[Tuple[str, int]]:
    """Resolve file sizes with one listing of the shared path prefix.

    Lists everything under ``common_path`` in a single request, then filters
    the listing down to the requested ``paths``. Falls back to the parallel
    strategy if any requested path did not show up as a file in the listing.
    """
    sizes_by_path = {path: None for path in paths}
    for listed_path, listed_size in _get_file_infos(
        common_path, filesystem, ignore_missing_paths
    ):
        if listed_path in sizes_by_path:
            sizes_by_path[listed_path] = listed_size

    # A path left without a size wasn't returned as a file by the prefix
    # listing (e.g. it is actually a directory); detect the first such path.
    unresolved_path = None
    for path in paths:
        if sizes_by_path[path] is None:
            unresolved_path = path
            break

    if unresolved_path is not None:
        logger.debug(
            f"Finding path {unresolved_path} not have file size metadata. "
            "Fall back to get files metadata in parallel for all paths."
        )
        # Parallelize requests via Ray tasks.
        yield from _get_file_infos_parallel(paths, filesystem, ignore_missing_paths)
    else:
        # Yield in the caller-provided order. Don't iterate the dict: `paths`
        # may contain duplicates (reading the same file several times), which
        # the dict would collapse into a single entry.
        for path in paths:
            yield path, sizes_by_path[path]
348
+
349
+
350
def _get_file_infos_parallel(
    paths: List[str],
    filesystem: "pyarrow.fs.FileSystem",
    ignore_missing_paths: bool = False,
) -> Iterator[Tuple[str, int]]:
    """Resolve (file path, file size) pairs for ``paths`` via parallel Ray tasks.

    Args:
        paths: File or directory paths to resolve.
        filesystem: The pyarrow filesystem used to look up the paths.
        ignore_missing_paths: If True, missing paths are skipped instead of
            raising.

    Yields:
        (file path, file size in bytes) tuples.
    """
    from ray.data.datasource.file_based_datasource import (
        PATHS_PER_FILE_SIZE_FETCH_TASK,
        _unwrap_s3_serialization_workaround,
        _wrap_s3_serialization_workaround,
    )

    logger.warning(
        f"Expanding {len(paths)} path(s). This may be a HIGH LATENCY "
        f"operation on some cloud storage services. Moving all the "
        "paths to a common parent directory will lead to faster "
        "metadata fetching."
    )

    # Capture the filesystem in the fetcher func closure, but wrap it in our
    # serialization workaround to make sure that the pickle roundtrip works as expected.
    filesystem = _wrap_s3_serialization_workaround(filesystem)

    def _file_infos_fetcher(paths: List[str]) -> List[Tuple[str, int]]:
        # Runs inside a Ray task: unwrap the pickled filesystem, then resolve
        # every path in this chunk and flatten the per-path results.
        fs = _unwrap_s3_serialization_workaround(filesystem)
        return list(
            itertools.chain.from_iterable(
                _get_file_infos(path, fs, ignore_missing_paths) for path in paths
            )
        )

    yield from _fetch_metadata_parallel(
        paths, _file_infos_fetcher, PATHS_PER_FILE_SIZE_FETCH_TASK
    )
383
+
384
+
385
+ Uri = TypeVar("Uri")
386
+ Meta = TypeVar("Meta")
387
+
388
+
389
def _fetch_metadata_parallel(
    uris: List[Uri],
    fetch_func: Callable[[List[Uri]], List[Meta]],
    desired_uris_per_task: int,
    **ray_remote_args,
) -> Iterator[Meta]:
    """Fetch file metadata in parallel using Ray tasks.

    Splits ``uris`` into chunks, runs ``fetch_func`` on each chunk as a Ray
    task, and yields the flattened results with a progress bar.
    """
    remote_fetch_func = cached_remote_fn(fetch_func)
    if ray_remote_args:
        remote_fetch_func = remote_fetch_func.options(**ray_remote_args)
    # Pick a parallelism that amortizes per-task overhead (many URIs per
    # task) while still fanning out; always launch at least two tasks.
    parallelism = max(len(uris) // desired_uris_per_task, 2)
    progress = ProgressBar(
        "Metadata Fetch Progress", total=parallelism, unit="task"
    )
    pending = [
        remote_fetch_func.remote(uri_chunk)
        for uri_chunk in np.array_split(uris, parallelism)
        if len(uri_chunk) > 0
    ]
    for chunk_result in progress.fetch_until_complete(pending):
        yield from chunk_result
413
+
414
+
415
def _get_file_infos(
    path: str, filesystem: "pyarrow.fs.FileSystem", ignore_missing_path: bool = False
) -> List[Tuple[str, int]]:
    """Get the file info for all files at or under the provided path.

    Args:
        path: A file or directory path to resolve.
        filesystem: The pyarrow filesystem used to look up the path.
        ignore_missing_path: If True, a missing path produces an empty result
            instead of raising ``FileNotFoundError``.

    Returns:
        A list of (file path, file size in bytes) tuples. A directory path is
        expanded recursively into the files it contains.
    """
    from pyarrow.fs import FileType

    file_infos = []
    try:
        ctx = ray.data.DataContext.get_current()
        # Retry transient I/O failures per the DataContext's configured
        # retryable error patterns.
        file_info = call_with_retry(
            lambda: filesystem.get_file_info(path),
            description="get file info",
            match=ctx.retried_io_errors,
        )
    except OSError as e:
        # `_handle_read_os_error` always raises, so `file_info` is guaranteed
        # to be bound when control reaches the checks below.
        _handle_read_os_error(e, path)
    if file_info.type == FileType.Directory:
        for file_path, file_size in _expand_directory(path, filesystem):
            file_infos.append((file_path, file_size))
    elif file_info.type == FileType.File:
        file_infos.append((path, file_info.size))
    elif file_info.type == FileType.NotFound and ignore_missing_path:
        # Missing path tolerated by request: return no entries.
        pass
    else:
        raise FileNotFoundError(path)

    return file_infos
442
+
443
+
444
def _expand_directory(
    path: str,
    filesystem: "pyarrow.fs.FileSystem",
    exclude_prefixes: Optional[List[str]] = None,
    ignore_missing_path: bool = False,
) -> List[Tuple[str, int]]:
    """
    Expand the provided directory path to a list of file paths.

    Args:
        path: The directory path to expand.
        filesystem: The filesystem implementation that should be used for
            reading these files.
        exclude_prefixes: The file relative path prefixes that should be
            excluded from the returned file set. Default excluded prefixes are
            "." and "_".
        ignore_missing_path: If True, a missing directory yields an empty
            result instead of raising.

    Returns:
        A sorted list of (file_path, file_size) tuples.
    """
    from pyarrow.fs import FileSelector

    if exclude_prefixes is None:
        exclude_prefixes = [".", "_"]

    selector = FileSelector(path, recursive=True, allow_not_found=ignore_missing_path)
    listed = filesystem.get_file_info(selector)
    base_path = selector.base_dir
    expanded = []
    for entry in listed:
        if not entry.is_file:
            continue
        file_path = entry.path
        if not file_path.startswith(base_path):
            continue
        # Exclusion prefixes apply to the path relative to the base directory
        # (e.g. hidden "." files and "_" metadata files).
        relative = file_path[len(base_path) :]
        if any(relative.startswith(prefix) for prefix in exclude_prefixes):
            continue
        expanded.append((file_path, entry.size))
    # Sort to guarantee a stable, deterministic ordering.
    return sorted(expanded)
.venv/lib/python3.11/site-packages/ray/data/datasource/filename_provider.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any, Dict, Optional
2
+
3
+ from ray.data.block import Block
4
+ from ray.util.annotations import PublicAPI
5
+
6
+
7
+ @PublicAPI(stability="alpha")
8
+ class FilenameProvider:
9
+ """Generates filenames when you write a :class:`~ray.data.Dataset`.
10
+
11
+ Use this class to customize the filenames used when writing a Dataset.
12
+
13
+ Some methods write each row to a separate file, while others write each block to a
14
+ separate file. For example, :meth:`ray.data.Dataset.write_images` writes individual
15
+ rows, and :func:`ray.data.Dataset.write_parquet` writes blocks of data. For more
16
+ information about blocks, see :ref:`Data internals <datasets_scheduling>`.
17
+
18
+ If you're writing each row to a separate file, implement
19
+ :meth:`~FilenameProvider.get_filename_for_row`. Otherwise, implement
20
+ :meth:`~FilenameProvider.get_filename_for_block`.
21
+
22
+ Example:
23
+
24
+ This snippet shows you how to encode labels in written files. For example, if
25
+ `"cat"` is a label, you might write a file named `cat_000000_000000_000000.png`.
26
+
27
+ .. testcode::
28
+
29
+ import ray
30
+ from ray.data.datasource import FilenameProvider
31
+
32
+ class ImageFilenameProvider(FilenameProvider):
33
+
34
+ def __init__(self, file_format: str):
35
+ self.file_format = file_format
36
+
37
+ def get_filename_for_row(self, row, task_index, block_index, row_index):
38
+ return (
39
+ f"{row['label']}_{task_index:06}_{block_index:06}"
40
+ f"_{row_index:06}.{self.file_format}"
41
+ )
42
+
43
+ ds = ray.data.read_parquet("s3://anonymous@ray-example-data/images.parquet")
44
+ ds.write_images(
45
+ "/tmp/results",
46
+ column="image",
47
+ filename_provider=ImageFilenameProvider("png")
48
+ )
49
+ """ # noqa: E501
50
+
51
+ def get_filename_for_block(
52
+ self, block: Block, task_index: int, block_index: int
53
+ ) -> str:
54
+ """Generate a filename for a block of data.
55
+
56
+ .. note::
57
+ Filenames must be unique and deterministic for a given task and block index.
58
+
59
+ A block consists of multiple rows and corresponds to a single output file.
60
+ Each task might produce a different number of blocks.
61
+
62
+ Args:
63
+ block: The block that will be written to a file.
64
+ task_index: The index of the the write task.
65
+ block_index: The index of the block *within* the write task.
66
+ """
67
+ raise NotImplementedError
68
+
69
+ def get_filename_for_row(
70
+ self, row: Dict[str, Any], task_index: int, block_index: int, row_index: int
71
+ ) -> str:
72
+ """Generate a filename for a row.
73
+
74
+ .. note::
75
+ Filenames must be unique and deterministic for a given task, block, and row
76
+ index.
77
+
78
+ A block consists of multiple rows, and each row corresponds to a single
79
+ output file. Each task might produce a different number of blocks, and each
80
+ block might contain a different number of rows.
81
+
82
+ .. tip::
83
+ If you require a contiguous row index into the global dataset, use
84
+ :meth:`~ray.data.Dataset.iter_rows`. This method is single-threaded and
85
+ isn't recommended for large datasets.
86
+
87
+ Args:
88
+ row: The row that will be written to a file.
89
+ task_index: The index of the the write task.
90
+ block_index: The index of the block *within* the write task.
91
+ row_index: The index of the row *within* the block.
92
+ """
93
+ raise NotImplementedError
94
+
95
+
96
class _DefaultFilenameProvider(FilenameProvider):
    """Default filename scheme: ``[<uuid>_]<task>_<block>[_<row>][.<format>]``."""

    def __init__(
        self, dataset_uuid: Optional[str] = None, file_format: Optional[str] = None
    ):
        self._dataset_uuid = dataset_uuid
        self._file_format = file_format

    def get_filename_for_block(
        self, block: Block, task_index: int, block_index: int
    ) -> str:
        # Zero-padded task/block indices keep filenames sortable.
        return self._generate_filename(f"{task_index:06}_{block_index:06}")

    def get_filename_for_row(
        self, row: Dict[str, Any], task_index: int, block_index: int, row_index: int
    ) -> str:
        return self._generate_filename(
            f"{task_index:06}_{block_index:06}_{row_index:06}"
        )

    def _generate_filename(self, file_id: str) -> str:
        # Wrap the id with the optional dataset-UUID prefix and format suffix.
        prefix = f"{self._dataset_uuid}_" if self._dataset_uuid is not None else ""
        suffix = f".{self._file_format}" if self._file_format is not None else ""
        return f"{prefix}{file_id}{suffix}"
.venv/lib/python3.11/site-packages/ray/data/datasource/parquet_meta_provider.py ADDED
@@ -0,0 +1,252 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import TYPE_CHECKING, List, Optional, Union
2
+
3
+ import ray.cloudpickle as cloudpickle
4
+ from ray.data._internal.util import call_with_retry
5
+ from ray.data.block import BlockMetadata
6
+ from ray.data.datasource.file_meta_provider import (
7
+ FileMetadataProvider,
8
+ _fetch_metadata_parallel,
9
+ )
10
+ from ray.util.annotations import DeveloperAPI
11
+
12
+ if TYPE_CHECKING:
13
+ import pyarrow
14
+
15
+ from ray.data._internal.datasource.parquet_datasource import SerializedFragment
16
+
17
+
18
# Number of Parquet file fragments whose metadata is fetched by one Ray task.
FRAGMENTS_PER_META_FETCH = 6
# Minimum number of fragments before metadata fetching is parallelized with
# Ray tasks instead of being done inline.
PARALLELIZE_META_FETCH_THRESHOLD = 24

# The application-level exceptions to retry for metadata prefetching task.
# Default to retry on access denied and read timeout errors because AWS S3 would throw
# these transient errors when load is too high.
RETRY_EXCEPTIONS_FOR_META_FETCH_TASK = ["AWS Error ACCESS_DENIED", "Timeout"]
# Maximum number of retries for metadata prefetching task due to transient errors.
RETRY_MAX_ATTEMPTS_FOR_META_FETCH_TASK = 32
# Maximum retry back-off interval in seconds for failed metadata prefetching task.
RETRY_MAX_BACKOFF_S_FOR_META_FETCH_TASK = 64
29
+
30
+
31
+ class _ParquetFileFragmentMetaData:
32
+ """Class to store metadata of a Parquet file fragment. This includes
33
+ all attributes from `pyarrow.parquet.FileMetaData` except for `schema`,
34
+ which is stored in `self.schema_pickled` as a pickled object from
35
+ `cloudpickle.loads()`, used in deduplicating schemas across multiple fragments."""
36
+
37
+ def __init__(self, fragment_metadata: "pyarrow.parquet.FileMetaData"):
38
+ self.created_by = fragment_metadata.created_by
39
+ self.format_version = fragment_metadata.format_version
40
+ self.num_columns = fragment_metadata.num_columns
41
+ self.num_row_groups = fragment_metadata.num_row_groups
42
+ self.num_rows = fragment_metadata.num_rows
43
+ self.serialized_size = fragment_metadata.serialized_size
44
+ # This is a pickled schema object, to be set later with
45
+ # `self.set_schema_pickled()`. To get the underlying schema, use
46
+ # `cloudpickle.loads(self.schema_pickled)`.
47
+ self.schema_pickled = None
48
+
49
+ # Calculate the total byte size of the file fragment using the original
50
+ # object, as it is not possible to access row groups from this class.
51
+ self.total_byte_size = 0
52
+ for row_group_idx in range(fragment_metadata.num_row_groups):
53
+ row_group_metadata = fragment_metadata.row_group(row_group_idx)
54
+ self.total_byte_size += row_group_metadata.total_byte_size
55
+
56
+ def set_schema_pickled(self, schema_pickled: bytes):
57
+ """Note: to get the underlying schema, use
58
+ `cloudpickle.loads(self.schema_pickled)`."""
59
+ self.schema_pickled = schema_pickled
60
+
61
+
62
@DeveloperAPI
class ParquetMetadataProvider(FileMetadataProvider):
    """Provides block metadata for Arrow Parquet file fragments."""

    def _get_block_metadata(
        self,
        paths: List[str],
        schema: Optional[Union[type, "pyarrow.lib.Schema"]],
        *,
        num_fragments: int,
        prefetched_metadata: Optional[List["_ParquetFileFragmentMetaData"]],
    ) -> BlockMetadata:
        """Resolves and returns block metadata for files of a single dataset block.

        Args:
            paths: The file paths for a single dataset block.
            schema: The user-provided or inferred schema for the given file
                paths, if any.
            num_fragments: The number of Parquet file fragments derived from the input
                file paths.
            prefetched_metadata: Metadata previously returned from
                `prefetch_file_metadata()` for each file fragment, where
                `prefetched_metadata[i]` contains the metadata for `fragments[i]`.

        Returns:
            BlockMetadata aggregated across the given file paths.
        """
        # Only trust the prefetched metadata when it covers every fragment;
        # a partial result would under-count rows/bytes.
        if (
            prefetched_metadata is not None
            and len(prefetched_metadata) == num_fragments
            and all(m is not None for m in prefetched_metadata)
        ):
            # Fragment metadata was available, construct a normal
            # BlockMetadata.
            block_metadata = BlockMetadata(
                num_rows=sum(m.num_rows for m in prefetched_metadata),
                size_bytes=sum(m.total_byte_size for m in prefetched_metadata),
                schema=schema,
                input_files=paths,
                exec_stats=None,
            )  # Exec stats filled in later.
        else:
            # Fragment metadata was not available, construct an empty
            # BlockMetadata.
            block_metadata = BlockMetadata(
                num_rows=None,
                size_bytes=None,
                schema=schema,
                input_files=paths,
                exec_stats=None,
            )
        return block_metadata

    def prefetch_file_metadata(
        self,
        fragments: List["pyarrow.dataset.ParquetFileFragment"],
        **ray_remote_args,
    ) -> Optional[List[_ParquetFileFragmentMetaData]]:
        """Pre-fetches file metadata for all Parquet file fragments in a single batch.

        Subsets of the metadata returned will be provided as input to subsequent calls
        to ``_get_block_metadata`` together with their corresponding Parquet file
        fragments.

        Args:
            fragments: The Parquet file fragments to fetch metadata for.

        Returns:
            Metadata resolved for each input file fragment, or `None`. Metadata
            must be returned in the same order as all input file fragments, such
            that `metadata[i]` always contains the metadata for `fragments[i]`.
        """
        from ray.data._internal.datasource.parquet_datasource import SerializedFragment

        if len(fragments) > PARALLELIZE_META_FETCH_THRESHOLD:
            # Wrap Parquet fragments in serialization workaround.
            fragments = [SerializedFragment(fragment) for fragment in fragments]
            # Fetch Parquet metadata in parallel using Ray tasks.

            def fetch_func(fragments):
                return _fetch_metadata_serialization_wrapper(
                    fragments,
                    # Ensure that retry settings are propagated to remote tasks.
                    retry_match=RETRY_EXCEPTIONS_FOR_META_FETCH_TASK,
                    retry_max_attempts=RETRY_MAX_ATTEMPTS_FOR_META_FETCH_TASK,
                    retry_max_interval=RETRY_MAX_BACKOFF_S_FOR_META_FETCH_TASK,
                )

            raw_metadata = list(
                _fetch_metadata_parallel(
                    fragments,
                    fetch_func,
                    FRAGMENTS_PER_META_FETCH,
                    **ray_remote_args,
                )
            )
        else:
            # Few fragments: fetch inline without spawning Ray tasks.
            raw_metadata = _fetch_metadata(fragments)

        # Deduplicate identical schemas to reduce memory usage.
        return _dedupe_metadata(raw_metadata)
162
+
163
+
164
def _fetch_metadata_serialization_wrapper(
    fragments: List["SerializedFragment"],
    retry_match: Optional[List[str]],
    retry_max_attempts: int,
    retry_max_interval: int,
) -> List["pyarrow.parquet.FileMetaData"]:
    """Deserialize the wrapped fragments, then fetch their metadata with retries.

    Intended to run inside a Ray task: the fragments arrive wrapped in the
    pickling workaround (`SerializedFragment`) and are unwrapped here.

    Args:
        fragments: Serialized Parquet file fragments.
        retry_match: Error-message substrings considered transient/retryable.
        retry_max_attempts: Maximum number of fetch attempts.
        retry_max_interval: Maximum retry back-off interval, in seconds.

    Returns:
        The `FileMetaData` for each fragment, in input order.

    Raises:
        RuntimeError: If the fetch still fails with an ``OSError`` after
            exhausting all retries; chained to the original error.
    """
    from ray.data._internal.datasource.parquet_datasource import (
        _deserialize_fragments_with_retry,
    )

    deserialized_fragments = _deserialize_fragments_with_retry(fragments)
    try:
        metadata = call_with_retry(
            lambda: _fetch_metadata(deserialized_fragments),
            # Fixed typo: was "fetch metdata".
            description="fetch metadata",
            match=retry_match,
            max_attempts=retry_max_attempts,
            max_backoff_s=retry_max_interval,
        )
    except OSError as e:
        raise RuntimeError(
            f"Exceeded maximum number of attempts ({retry_max_attempts}) to retry "
            "metadata fetching task. Metadata fetching tasks can fail due to transient "
            "errors like rate limiting.\n"
            "\n"
            "To increase the maximum number of attempts, configure "
            "`RETRY_MAX_ATTEMPTS_FOR_META_FETCH_TASK`. For example:\n"
            "```\n"
            "ray.data._internal.datasource.parquet_datasource.RETRY_MAX_ATTEMPTS_FOR_META_FETCH_TASK = 64\n"  # noqa: E501
            "```\n"
            "To increase the maximum retry backoff interval, configure "
            "`RETRY_MAX_BACKOFF_S_FOR_META_FETCH_TASK`. For example:\n"
            "```\n"
            "ray.data._internal.datasource.parquet_datasource.RETRY_MAX_BACKOFF_S_FOR_META_FETCH_TASK = 128\n"  # noqa: E501
            "```\n"
            "If the error continues to occur, you can also try decreasing the "
            "concurrency of metadata fetching tasks by setting "
            "`NUM_CPUS_FOR_META_FETCH_TASK` to a larger value. For example:\n"
            "```\n"
            "ray.data._internal.datasource.parquet_datasource.NUM_CPUS_FOR_META_FETCH_TASK = 4.\n"  # noqa: E501
            "```\n"
            "To change which exceptions to retry on, set "
            "`RETRY_EXCEPTIONS_FOR_META_FETCH_TASK` to a list of error messages. For "
            "example:\n"
            "```\n"
            'ray.data._internal.datasource.parquet_datasource.RETRY_EXCEPTIONS_FOR_META_FETCH_TASK = ["AWS Error ACCESS_DENIED", "Timeout"]\n'  # noqa: E501
            "```"
        ) from e
    return metadata
213
+
214
+
215
+ def _fetch_metadata(
216
+ fragments: List["pyarrow.dataset.ParquetFileFragment"],
217
+ ) -> List["pyarrow.parquet.FileMetaData"]:
218
+ fragment_metadata = []
219
+ for f in fragments:
220
+ try:
221
+ fragment_metadata.append(f.metadata)
222
+ except AttributeError:
223
+ break
224
+ return fragment_metadata
225
+
226
+
227
def _dedupe_metadata(
    raw_metadatas: List["pyarrow.parquet.FileMetaData"],
) -> List[_ParquetFileFragmentMetaData]:
    """For datasets with a large number of columns, the FileMetaData
    (in particular the schema) can be very large. We can reduce the
    memory usage by only keeping unique schema objects across all
    file fragments. This method deduplicates the schemas and returns
    a list of `_ParquetFileFragmentMetaData` objects."""
    # NOTE(review): the original comments on these two dicts were swapped;
    # corrected to match how they are populated below.
    schema_to_id = {}  # serialized_schema -> schema_id
    id_to_schema = {}  # schema_id -> serialized_schema
    stripped_metadatas = []
    for fragment_metadata in raw_metadatas:
        stripped_md = _ParquetFileFragmentMetaData(fragment_metadata)

        schema_ser = cloudpickle.dumps(fragment_metadata.schema.to_arrow_schema())
        if schema_ser not in schema_to_id:
            schema_id = len(schema_to_id)
            schema_to_id[schema_ser] = schema_id
            id_to_schema[schema_id] = schema_ser
            stripped_md.set_schema_pickled(schema_ser)
        else:
            # Reuse the first-seen bytes object so equal schemas share one
            # allocation in memory.
            schema_id = schema_to_id.get(schema_ser)
            existing_schema_ser = id_to_schema[schema_id]
            stripped_md.set_schema_pickled(existing_schema_ser)
        stripped_metadatas.append(stripped_md)
    return stripped_metadatas
.venv/lib/python3.11/site-packages/ray/data/exceptions.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from typing import Callable
3
+
4
+ from ray.data._internal.logging import get_log_directory
5
+ from ray.data.context import DataContext
6
+ from ray.exceptions import UserCodeException
7
+ from ray.util import log_once
8
+ from ray.util.annotations import DeveloperAPI
9
+ from ray.util.rpdb import _is_ray_debugger_post_mortem_enabled
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
@DeveloperAPI
class RayDataUserCodeException(UserCodeException):
    """Exception originating from user code, such as a user-specified UDF
    used in a Ray Data transformation.

    By default, stack frames belonging to Ray Data internal files are omitted
    from the trace printed to stdout, but are still written to the Ray Data
    log file. Set `DataContext.log_internal_stack_trace_to_stdout` to True to
    emit every frame to stdout as well."""
25
+
26
+
27
@DeveloperAPI
class SystemException(Exception):
    """Exception originating from Ray Data internal code or Ray Core private
    code paths, rather than from user code. Seeing this exception usually
    indicates a bug in Ray Data or Ray Core."""
35
+
36
+
37
@DeveloperAPI
def omit_traceback_stdout(fn: Callable) -> Callable:
    """Decorator which runs the function, and if there is an exception raised,
    drops the stack trace before re-raising the exception. The original exception,
    including the full unmodified stack trace, is always written to the Ray Data
    log file at `data_exception_logger._log_path`.

    This is useful for stripping long stack traces of internal Ray Data code,
    which can otherwise obfuscate user code errors."""
    # Local import keeps this module's top-level import block unchanged.
    from functools import wraps

    # `wraps` preserves the wrapped function's name/docstring/signature
    # metadata, which the original implementation clobbered.
    @wraps(fn)
    def handle_trace(*args, **kwargs):
        try:
            return fn(*args, **kwargs)
        except Exception as e:
            # Only log the full internal stack trace to stdout when configured
            # via DataContext, or when the Ray Debugger is enabled.
            # The full stack trace will always be emitted to the Ray Data log file.
            log_to_stdout = DataContext.get_current().log_internal_stack_trace_to_stdout
            if _is_ray_debugger_post_mortem_enabled():
                logger.exception("Full stack trace:")
                raise e

            is_user_code_exception = isinstance(e, UserCodeException)
            if is_user_code_exception:
                # Exception has occurred in user code.
                if not log_to_stdout and log_once("ray_data_exception_internal_hidden"):
                    logger.error(
                        "Exception occurred in user code, with the abbreviated stack "
                        "trace below. By default, the Ray Data internal stack trace "
                        "is omitted from stdout, and only written to the Ray Data log "
                        f"files at {get_log_directory()}. To "
                        "output the full stack trace to stdout, set "
                        "`DataContext.log_internal_stack_trace_to_stdout` to True."
                    )
            else:
                # Exception has occurred in internal Ray Data / Ray Core code.
                logger.error(
                    "Exception occurred in Ray Data or Ray Core internal code. "
                    "If you continue to see this error, please open an issue on "
                    "the Ray project GitHub page with the full stack trace below: "
                    "https://github.com/ray-project/ray/issues/new/choose"
                )

            should_hide_traceback = is_user_code_exception and not log_to_stdout
            logger.exception(
                "Full stack trace:",
                exc_info=True,
                extra={"hide": should_hide_traceback},
            )
            if is_user_code_exception:
                raise e.with_traceback(None)
            else:
                raise e.with_traceback(None) from SystemException()

    return handle_trace
.venv/lib/python3.11/site-packages/ray/data/grouped_data.py ADDED
@@ -0,0 +1,494 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from functools import partial
2
+ from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
3
+
4
+ from ray.data._internal.aggregate import Count, Max, Mean, Min, Std, Sum
5
+ from ray.data._internal.compute import ComputeStrategy
6
+ from ray.data._internal.logical.interfaces import LogicalPlan
7
+ from ray.data._internal.logical.operators.all_to_all_operator import Aggregate
8
+ from ray.data.aggregate import AggregateFn
9
+ from ray.data.block import (
10
+ BlockAccessor,
11
+ CallableClass,
12
+ DataBatch,
13
+ UserDefinedFunction,
14
+ _get_block_boundaries,
15
+ )
16
+ from ray.data.dataset import Dataset
17
+ from ray.util.annotations import PublicAPI
18
+
19
+ CDS_API_GROUP = "Computations or Descriptive Stats"
20
+ FA_API_GROUP = "Function Application"
21
+
22
+
23
class GroupedData:
    """A lazily-evaluated view of a :class:`Dataset` grouped by key.

    Instances are created via ``Dataset.groupby()``. No grouping work happens
    until an aggregation (or ``map_groups``) is applied.
    """

    def __init__(
        self,
        dataset: Dataset,
        key: Optional[Union[str, List[str]]],
    ):
        """Internal constructor — use ``Dataset.groupby()`` to build one.

        Args:
            dataset: The dataset to group.
            key: A column name, a list of column names, or ``None`` to treat
                the whole dataset as a single group.
        """
        self._dataset = dataset
        self._key = key

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}(dataset={self._dataset}, key={self._key!r})"

    @PublicAPI(api_group=FA_API_GROUP)
    def aggregate(self, *aggs: AggregateFn) -> Dataset:
        """Implements an accumulator-based aggregation.

        Args:
            aggs: Aggregations to do.

        Returns:
            A dataset of ``n + 1`` columns where the first column is the
            groupby key and the second through ``n + 1`` columns are the
            results of the aggregations. If the groupby key is ``None``, the
            key part of the return is omitted.
        """
        new_plan = self._dataset._plan.copy()
        agg_op = Aggregate(
            self._dataset._logical_plan.dag,
            key=self._key,
            aggs=aggs,
        )
        new_logical_plan = LogicalPlan(agg_op, self._dataset.context)
        return Dataset(new_plan, new_logical_plan)

    def _aggregate_on(
        self,
        agg_cls: type,
        on: Union[str, List[str]],
        *args,
        **kwargs,
    ):
        """Helper for aggregating on a particular subset of the dataset.

        Validates the ``on`` argument and expands a list of column names into
        a multi-column aggregation. A null ``on`` results in a
        multi-aggregation over all columns (Arrow datasets) or a single
        whole-row aggregation (simple datasets). The groupby key columns are
        always excluded from the aggregation.
        """
        per_column_aggs = self._dataset._build_multicolumn_aggs(
            agg_cls, on, *args, skip_cols=self._key, **kwargs
        )
        return self.aggregate(*per_column_aggs)

    @PublicAPI(api_group=FA_API_GROUP)
    def map_groups(
        self,
        fn: UserDefinedFunction[DataBatch, DataBatch],
        *,
        compute: Union[str, ComputeStrategy] = None,
        batch_format: Optional[str] = "default",
        fn_args: Optional[Iterable[Any]] = None,
        fn_kwargs: Optional[Dict[str, Any]] = None,
        fn_constructor_args: Optional[Iterable[Any]] = None,
        fn_constructor_kwargs: Optional[Dict[str, Any]] = None,
        num_cpus: Optional[float] = None,
        num_gpus: Optional[float] = None,
        concurrency: Optional[Union[int, Tuple[int, int]]] = None,
        **ray_remote_args,
    ) -> "Dataset":
        """Apply the given function to each group of records of this dataset.

        While ``map_groups()`` is very flexible, it has downsides: it may be
        slower than more specific methods such as ``min()``/``max()``, and it
        requires that each group fits in memory on a single node. In general,
        prefer ``aggregate()``.

        .. warning::
            Specifying both ``num_cpus`` and ``num_gpus`` for map tasks is
            experimental and may result in scheduling or stability issues.
            Please `report any issues
            <https://github.com/ray-project/ray/issues/new/choose>`_ to the
            Ray team.

        Examples:
            >>> # Return a single record per group (list of multiple records in,
            >>> # list of a single record out).
            >>> import ray
            >>> import pandas as pd
            >>> import numpy as np
            >>> # Get first value per group.
            >>> ds = ray.data.from_items([ # doctest: +SKIP
            ...     {"group": 1, "value": 1},
            ...     {"group": 1, "value": 2},
            ...     {"group": 2, "value": 3},
            ...     {"group": 2, "value": 4}])
            >>> ds.groupby("group").map_groups( # doctest: +SKIP
            ...     lambda g: {"result": np.array([g["value"][0]])})

        Args:
            fn: The function to apply to each group of records, or a class
                type that can be instantiated to create such a callable. It
                takes as input a batch of all records from a single group, and
                returns a batch of zero or more records, similar to
                ``map_batches()``.
            compute: The compute strategy, either "tasks" (default) to use Ray
                tasks, ``ray.data.ActorPoolStrategy(size=n)`` for a fixed-size
                actor pool, or
                ``ray.data.ActorPoolStrategy(min_size=m, max_size=n)`` for an
                autoscaling actor pool.
            batch_format: ``"default"`` (NumPy), ``"pandas"``, ``"pyarrow"``,
                ``"numpy"``, or ``None`` to pass the underlying block through
                with no additional formatting.
            fn_args: Arguments to ``fn``.
            fn_kwargs: Keyword arguments to ``fn``.
            fn_constructor_args: Positional arguments for ``fn``'s constructor
                (callable-class ``fn`` only).
            fn_constructor_kwargs: Keyword arguments for ``fn``'s constructor
                (callable-class ``fn`` only).
            num_cpus: The number of CPUs to reserve for each parallel map
                worker.
            num_gpus: The number of GPUs to reserve for each parallel map
                worker.
            concurrency: Concurrency setting forwarded to the map operation.
            ray_remote_args: Additional resource requirements to request from
                Ray; see :func:`ray.remote` for details.

        Returns:
            The return type is determined by the return type of ``fn``; the
            return value combines the results of all groups.
        """
        # Globally sort records by key (or collapse to one block when there is
        # no key) so that all rows of any given group land in the same block.
        if self._key is not None:
            prepared_ds = self._dataset.sort(self._key)
        else:
            prepared_ds = self._dataset.repartition(1)

        # Each batch is an entire block, because batch_size=None is passed to
        # the map call below.
        def apply_udf_to_groups(udf, batch, *args, **kwargs):
            block = BlockAccessor.batch_to_block(batch)
            accessor = BlockAccessor.for_block(block)

            # Compute group boundaries (including the first start and the
            # last end index).
            if self._key:
                key_cols = accessor.to_numpy(self._key)
                # _get_block_boundaries() expects a list of arrays.
                if isinstance(self._key, str):
                    key_cols = [key_cols]
                else:
                    # key_cols is a dict of arrays when multiple keys are used.
                    key_cols = list(key_cols.values())
                boundaries = _get_block_boundaries(key_cols)
            else:
                boundaries = [0, accessor.num_rows()]

            for start, end in zip(boundaries[:-1], boundaries[1:]):
                group_accessor = BlockAccessor.for_block(
                    accessor.slice(start, end, copy=False)
                )
                # The block format may differ from the requested batch format
                # (e.g. Arrow block vs. NumPy batch), so convert per group.
                group_batch = group_accessor.to_batch_format(batch_format)
                yield udf(group_batch, *args, **kwargs)

        if isinstance(fn, CallableClass):

            class wrapped_fn:
                def __init__(self, *args, **kwargs):
                    self.fn = fn(*args, **kwargs)

                def __call__(self, batch, *args, **kwargs):
                    yield from apply_udf_to_groups(self.fn, batch, *args, **kwargs)

        else:

            def wrapped_fn(batch, *args, **kwargs):
                yield from apply_udf_to_groups(fn, batch, *args, **kwargs)

        # Surface the user's function name (rather than "wrapped_fn") in the
        # progress bar.
        if isinstance(fn, partial):
            wrapped_fn.__name__ = fn.func.__name__
        else:
            wrapped_fn.__name__ = fn.__name__

        # batch_size=None makes each block a single batch, which guarantees
        # every group is contained within one batch in its entirety.
        return prepared_ds._map_batches_without_batch_size_validation(
            wrapped_fn,
            batch_size=None,
            compute=compute,
            batch_format=batch_format,
            zero_copy_batch=False,
            fn_args=fn_args,
            fn_kwargs=fn_kwargs,
            fn_constructor_args=fn_constructor_args,
            fn_constructor_kwargs=fn_constructor_kwargs,
            num_cpus=num_cpus,
            num_gpus=num_gpus,
            concurrency=concurrency,
            ray_remote_args_fn=None,
            **ray_remote_args,
        )

    @PublicAPI(api_group=CDS_API_GROUP)
    def count(self) -> Dataset:
        """Compute count aggregation.

        Examples:
            >>> import ray
            >>> ray.data.from_items([ # doctest: +SKIP
            ...     {"A": x % 3, "B": x} for x in range(100)]).groupby( # doctest: +SKIP
            ...     "A").count() # doctest: +SKIP

        Returns:
            A dataset of ``[k, v]`` columns where ``k`` is the groupby key and
            ``v`` is the number of rows with that key. If the groupby key is
            ``None``, the key part of the return is omitted.
        """
        return self.aggregate(Count())

    @PublicAPI(api_group=CDS_API_GROUP)
    def sum(
        self, on: Union[str, List[str]] = None, ignore_nulls: bool = True
    ) -> Dataset:
        """Compute grouped sum aggregation.

        Examples:
            >>> import ray
            >>> ray.data.range(100).groupby("id").sum() # doctest: +SKIP

        Args:
            on: A column name or a list of column names to aggregate.
            ignore_nulls: Whether to ignore null values (``np.nan``, ``None``,
                ``pd.NaT``). If ``False`` and a null value is encountered, the
                output is null. Default is ``True``.

        Returns:
            The sum result. With ``on=None``, a dataset containing the groupby
            key column and a column-wise sum for every original column; with
            ``on=["col_1", ..., "col_n"]``, a dataset of ``n + 1`` columns
            (key first, then one result column per aggregation). If the
            groupby key is ``None``, the key part of the return is omitted.
        """
        return self._aggregate_on(Sum, on, ignore_nulls=ignore_nulls)

    @PublicAPI(api_group=CDS_API_GROUP)
    def min(
        self, on: Union[str, List[str]] = None, ignore_nulls: bool = True
    ) -> Dataset:
        """Compute grouped min aggregation.

        Examples:
            >>> import ray
            >>> ray.data.le(100).groupby("value").min() # doctest: +SKIP

        Args:
            on: A column name or a list of column names to aggregate.
            ignore_nulls: Whether to ignore null values (``np.nan``, ``None``,
                ``pd.NaT``). If ``False`` and a null value is encountered, the
                output is null. Default is ``True``.

        Returns:
            The min result. With ``on=None``, a dataset containing the groupby
            key column and a column-wise min for every original column; with
            ``on=["col_1", ..., "col_n"]``, a dataset of ``n + 1`` columns
            (key first, then one result column per aggregation). If the
            groupby key is ``None``, the key part of the return is omitted.
        """
        return self._aggregate_on(Min, on, ignore_nulls=ignore_nulls)

    @PublicAPI(api_group=CDS_API_GROUP)
    def max(
        self, on: Union[str, List[str]] = None, ignore_nulls: bool = True
    ) -> Dataset:
        """Compute grouped max aggregation.

        Examples:
            >>> import ray
            >>> ray.data.le(100).groupby("value").max() # doctest: +SKIP

        Args:
            on: A column name or a list of column names to aggregate.
            ignore_nulls: Whether to ignore null values (``np.nan``, ``None``,
                ``pd.NaT``). If ``False`` and a null value is encountered, the
                output is null. Default is ``True``.

        Returns:
            The max result. With ``on=None``, a dataset containing the groupby
            key column and a column-wise max for every original column; with
            ``on=["col_1", ..., "col_n"]``, a dataset of ``n + 1`` columns
            (key first, then one result column per aggregation). If the
            groupby key is ``None``, the key part of the return is omitted.
        """
        return self._aggregate_on(Max, on, ignore_nulls=ignore_nulls)

    @PublicAPI(api_group=CDS_API_GROUP)
    def mean(
        self, on: Union[str, List[str]] = None, ignore_nulls: bool = True
    ) -> Dataset:
        """Compute grouped mean aggregation.

        Examples:
            >>> import ray
            >>> ray.data.le(100).groupby("value").mean() # doctest: +SKIP

        Args:
            on: A column name or a list of column names to aggregate.
            ignore_nulls: Whether to ignore null values (``np.nan``, ``None``,
                ``pd.NaT``). If ``False`` and a null value is encountered, the
                output is null. Default is ``True``.

        Returns:
            The mean result. With ``on=None``, a dataset containing the
            groupby key column and a column-wise mean for every original
            column; with ``on=["col_1", ..., "col_n"]``, a dataset of
            ``n + 1`` columns (key first, then one result column per
            aggregation). If the groupby key is ``None``, the key part of the
            return is omitted.
        """
        return self._aggregate_on(Mean, on, ignore_nulls=ignore_nulls)

    @PublicAPI(api_group=CDS_API_GROUP)
    def std(
        self,
        on: Union[str, List[str]] = None,
        ddof: int = 1,
        ignore_nulls: bool = True,
    ) -> Dataset:
        """Compute grouped standard deviation aggregation.

        Examples:
            >>> import ray
            >>> ray.data.range(100).groupby("id").std(ddof=0) # doctest: +SKIP

        NOTE: This uses Welford's online method for an accumulator-style
        computation of the standard deviation, chosen for its numerical
        stability and single-pass computability. It may give different (but
        more accurate) results than NumPy, Pandas, and sklearn, which use a
        less numerically stable two-pass algorithm. See
        https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm

        Args:
            on: A column name or a list of column names to aggregate.
            ddof: Delta Degrees of Freedom. The divisor used in calculations
                is ``N - ddof``, where ``N`` represents the number of
                elements.
            ignore_nulls: Whether to ignore null values (``np.nan``, ``None``,
                ``pd.NaT``). If ``False`` and a null value is encountered, the
                output is null. Default is ``True``.

        Returns:
            The standard deviation result. With ``on=None``, a dataset
            containing the groupby key column and a column-wise std for every
            original column; with ``on=["col_1", ..., "col_n"]``, a dataset of
            ``n + 1`` columns (key first, then one result column per
            aggregation). If the groupby key is ``None``, the key part of the
            return is omitted.
        """
        return self._aggregate_on(Std, on, ignore_nulls=ignore_nulls, ddof=ddof)


# Backwards compatibility alias.
GroupedDataset = GroupedData
.venv/lib/python3.11/site-packages/ray/data/iterator.py ADDED
@@ -0,0 +1,931 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import abc
2
+ import time
3
+ from typing import (
4
+ TYPE_CHECKING,
5
+ Any,
6
+ Callable,
7
+ Dict,
8
+ Iterable,
9
+ Iterator,
10
+ List,
11
+ Optional,
12
+ Tuple,
13
+ TypeVar,
14
+ Union,
15
+ )
16
+
17
+ import numpy as np
18
+
19
+ from ray.data._internal.block_batching.iter_batches import iter_batches
20
+ from ray.data._internal.execution.interfaces import RefBundle
21
+ from ray.data._internal.logical.operators.input_data_operator import InputData
22
+ from ray.data._internal.logical.optimizers import LogicalPlan
23
+ from ray.data._internal.plan import ExecutionPlan
24
+ from ray.data._internal.stats import DatasetStats, StatsManager
25
+ from ray.data.block import BlockAccessor, DataBatch, _apply_batch_format
26
+ from ray.util.annotations import PublicAPI
27
+
28
+ if TYPE_CHECKING:
29
+ import tensorflow as tf
30
+ import torch
31
+
32
+ from ray.data.dataset import (
33
+ CollatedData,
34
+ MaterializedDataset,
35
+ Schema,
36
+ TensorFlowTensorBatchType,
37
+ TorchBatchType,
38
+ )
39
+
40
+
41
+ T = TypeVar("T")
42
+
43
+
44
+ class _IterableFromIterator(Iterable[T]):
45
+ def __init__(self, iterator_gen: Callable[[], Iterator[T]]):
46
+ """Constructs an Iterable from an iterator generator.
47
+
48
+ Args:
49
+ iterator_gen: A function that returns an iterator each time it
50
+ is called. For example, this can be a generator function.
51
+ """
52
+ self.iterator_gen = iterator_gen
53
+
54
+ def __iter__(self):
55
+ return self.iterator_gen()
56
+
57
+
58
+ @PublicAPI
59
+ class DataIterator(abc.ABC):
60
+ """An iterator for reading records from a :class:`~Dataset`.
61
+
62
+ For Datasets, each iteration call represents a complete read of all items in the
63
+ Dataset.
64
+
65
+ If using Ray Train, each trainer actor should get its own iterator by calling
66
+ :meth:`ray.train.get_dataset_shard("train")
67
+ <ray.train.get_dataset_shard>`.
68
+
69
+ Examples:
70
+ >>> import ray
71
+ >>> ds = ray.data.range(5)
72
+ >>> ds
73
+ Dataset(num_rows=5, schema={id: int64})
74
+ >>> ds.iterator()
75
+ DataIterator(Dataset(num_rows=5, schema={id: int64}))
76
+ """
77
+
78
+ @abc.abstractmethod
79
+ def _to_ref_bundle_iterator(
80
+ self,
81
+ ) -> Tuple[Iterator[RefBundle], Optional[DatasetStats], bool]:
82
+ """Returns the iterator to use for `iter_batches`.
83
+
84
+ Returns:
85
+ A tuple. The first item of the tuple is an iterator over RefBundles.
86
+ The second item of the tuple is a DatasetStats object used for recording
87
+ stats during iteration.
88
+ The third item is a boolean indicating if the blocks can be safely cleared
89
+ after use.
90
+ """
91
+ raise NotImplementedError
92
+
93
+ @PublicAPI
94
+ def iter_batches(
95
+ self,
96
+ *,
97
+ prefetch_batches: int = 1,
98
+ batch_size: int = 256,
99
+ batch_format: Optional[str] = "default",
100
+ drop_last: bool = False,
101
+ local_shuffle_buffer_size: Optional[int] = None,
102
+ local_shuffle_seed: Optional[int] = None,
103
+ _collate_fn: Optional[Callable[[DataBatch], "CollatedData"]] = None,
104
+ _finalize_fn: Optional[Callable[[Any], Any]] = None,
105
+ ) -> Iterable[DataBatch]:
106
+ """Return a batched iterable over the dataset.
107
+
108
+ Examples:
109
+ >>> import ray
110
+ >>> for batch in ray.data.range(
111
+ ... 1000000
112
+ ... ).iterator().iter_batches(): # doctest: +SKIP
113
+ ... print(batch) # doctest: +SKIP
114
+
115
+ Time complexity: O(1)
116
+
117
+ Args:
118
+ prefetch_batches: The number of batches to fetch ahead of the current batch
119
+ to fetch. If set to greater than 0, a separate threadpool will be used
120
+ to fetch the objects to the local node, format the batches, and apply
121
+ the collate_fn. Defaults to 1.
122
+ batch_size: The number of rows in each batch, or None to use entire blocks
123
+ as batches (blocks may contain different number of rows).
124
+ The final batch may include fewer than ``batch_size`` rows if
125
+ ``drop_last`` is ``False``. Defaults to 256.
126
+ batch_format: Specify ``"default"`` to use the default block format
127
+ (NumPy), ``"pandas"`` to select ``pandas.DataFrame``, "pyarrow" to
128
+ select ``pyarrow.Table``, or ``"numpy"`` to select
129
+ ``Dict[str, numpy.ndarray]``, or None to return the underlying block
130
+ exactly as is with no additional formatting.
131
+ drop_last: Whether to drop the last batch if it's incomplete.
132
+ local_shuffle_buffer_size: If non-None, the data will be randomly shuffled
133
+ using a local in-memory shuffle buffer, and this value will serve as the
134
+ minimum number of rows that must be in the local in-memory shuffle
135
+ buffer in order to yield a batch. When there are no more rows to add to
136
+ the buffer, the remaining rows in the buffer will be drained.
137
+ local_shuffle_seed: The seed to use for the local random shuffle.
138
+
139
+ Returns:
140
+ An iterable over record batches.
141
+ """
142
+ batch_format = _apply_batch_format(batch_format)
143
+
144
+ def _create_iterator() -> Iterator[DataBatch]:
145
+ time_start = time.perf_counter()
146
+ # Iterate through the dataset from the start each time
147
+ # _iterator_gen is called.
148
+ # This allows multiple iterations of the dataset without
149
+ # needing to explicitly call `iter_batches()` multiple times.
150
+ (
151
+ ref_bundles_iterator,
152
+ stats,
153
+ blocks_owned_by_consumer,
154
+ ) = self._to_ref_bundle_iterator()
155
+
156
+ iterator = iter(
157
+ iter_batches(
158
+ ref_bundles_iterator,
159
+ stats=stats,
160
+ clear_block_after_read=blocks_owned_by_consumer,
161
+ batch_size=batch_size,
162
+ batch_format=batch_format,
163
+ drop_last=drop_last,
164
+ collate_fn=_collate_fn,
165
+ finalize_fn=_finalize_fn,
166
+ shuffle_buffer_min_size=local_shuffle_buffer_size,
167
+ shuffle_seed=local_shuffle_seed,
168
+ prefetch_batches=prefetch_batches,
169
+ )
170
+ )
171
+
172
+ dataset_tag = self._get_dataset_tag()
173
+
174
+ if stats:
175
+ stats.iter_initialize_s.add(time.perf_counter() - time_start)
176
+
177
+ for batch in iterator:
178
+ yield batch
179
+ StatsManager.update_iteration_metrics(stats, dataset_tag)
180
+ StatsManager.clear_iteration_metrics(dataset_tag)
181
+
182
+ if stats:
183
+ stats.iter_total_s.add(time.perf_counter() - time_start)
184
+
185
+ return _IterableFromIterator(_create_iterator)
186
+
187
+ def _get_dataset_tag(self) -> str:
188
+ return "unknown_dataset"
189
+
190
+ @PublicAPI
191
+ def iter_rows(self) -> Iterable[Dict[str, Any]]:
192
+ """Return a local row iterable over the dataset.
193
+
194
+ If the dataset is a tabular dataset (Arrow/Pandas blocks), dicts
195
+ are yielded for each row by the iterator. If the dataset is not tabular,
196
+ the raw row is yielded.
197
+
198
+ Examples:
199
+ >>> import ray
200
+ >>> dataset = ray.data.range(10)
201
+ >>> next(iter(dataset.iterator().iter_rows()))
202
+ {'id': 0}
203
+
204
+ Time complexity: O(1)
205
+
206
+ Returns:
207
+ An iterable over rows of the dataset.
208
+ """
209
+ batch_iterable = self.iter_batches(
210
+ batch_size=None, batch_format=None, prefetch_batches=1
211
+ )
212
+
213
+ def _wrapped_iterator():
214
+ for batch in batch_iterable:
215
+ batch = BlockAccessor.for_block(BlockAccessor.batch_to_block(batch))
216
+ for row in batch.iter_rows(public_row_format=True):
217
+ yield row
218
+
219
+ return _IterableFromIterator(_wrapped_iterator)
220
+
221
+ @abc.abstractmethod
222
+ @PublicAPI
223
+ def stats(self) -> str:
224
+ """Returns a string containing execution timing information."""
225
+ raise NotImplementedError
226
+
227
+ @abc.abstractmethod
228
+ def schema(self) -> "Schema":
229
+ """Return the schema of the dataset iterated over."""
230
+ raise NotImplementedError
231
+
232
+ @PublicAPI
233
+ def iter_torch_batches(
234
+ self,
235
+ *,
236
+ prefetch_batches: int = 1,
237
+ batch_size: Optional[int] = 256,
238
+ dtypes: Optional[Union["torch.dtype", Dict[str, "torch.dtype"]]] = None,
239
+ device: str = "auto",
240
+ collate_fn: Optional[Callable[[Dict[str, np.ndarray]], "CollatedData"]] = None,
241
+ drop_last: bool = False,
242
+ local_shuffle_buffer_size: Optional[int] = None,
243
+ local_shuffle_seed: Optional[int] = None,
244
+ ) -> Iterable["TorchBatchType"]:
245
+ """Return a batched iterable of Torch Tensors over the dataset.
246
+
247
+ This iterable yields a dictionary of column-tensors. If you are looking for
248
+ more flexibility in the tensor conversion (e.g. casting dtypes) or the batch
249
+ format, try using :meth:`~ray.data.DataIterator.iter_batches` directly.
250
+
251
+ Examples:
252
+ >>> import ray
253
+ >>> for batch in ray.data.range(
254
+ ... 12,
255
+ ... ).iterator().iter_torch_batches(batch_size=4):
256
+ ... print(batch)
257
+ {'id': tensor([0, 1, 2, 3])}
258
+ {'id': tensor([4, 5, 6, 7])}
259
+ {'id': tensor([ 8, 9, 10, 11])}
260
+
261
+ Use the ``collate_fn`` to customize how the tensor batch is created.
262
+
263
+ >>> from typing import Any, Dict
264
+ >>> import torch
265
+ >>> import numpy as np
266
+ >>> import ray
267
+ >>> def collate_fn(batch: Dict[str, np.ndarray]) -> Any:
268
+ ... return torch.stack(
269
+ ... [torch.as_tensor(array) for array in batch.values()],
270
+ ... axis=1
271
+ ... )
272
+ >>> iterator = ray.data.from_items([
273
+ ... {"col_1": 1, "col_2": 2},
274
+ ... {"col_1": 3, "col_2": 4}]).iterator()
275
+ >>> for batch in iterator.iter_torch_batches(collate_fn=collate_fn):
276
+ ... print(batch)
277
+ tensor([[1, 2],
278
+ [3, 4]])
279
+
280
+ Time complexity: O(1)
281
+
282
+ Args:
283
+ prefetch_batches: The number of batches to fetch ahead of the current batch
284
+ to fetch. If set to greater than 0, a separate threadpool will be used
285
+ to fetch the objects to the local node, format the batches, and apply
286
+ the collate_fn. Defaults to 1.
287
+ batch_size: The number of rows in each batch, or None to use entire blocks
288
+ as batches (blocks may contain different number of rows).
289
+ The final batch may include fewer than ``batch_size`` rows if
290
+ ``drop_last`` is ``False``. Defaults to 256.
291
+ dtypes: The Torch dtype(s) for the created tensor(s); if None, the dtype
292
+ will be inferred from the tensor data. You can't use this parameter
293
+ with ``collate_fn``.
294
+ device: The device on which the tensor should be placed. Defaults to
295
+ "auto" which moves the tensors to the appropriate device when the
296
+ Dataset is passed to Ray Train and ``collate_fn`` is not provided.
297
+ Otherwise, defaults to CPU. You can't use this parameter with
298
+ ``collate_fn``.
299
+ collate_fn: A function to convert a Numpy batch to a PyTorch tensor batch.
300
+ When this parameter is specified, the user should manually handle the
301
+ host to device data transfer outside of ``collate_fn``.
302
+ This is useful for further processing the data after it has been
303
+ batched. Potential use cases include collating along a dimension other
304
+ than the first, padding sequences of various lengths, or generally
305
+ handling batches of different length tensors. If not provided, the
306
+ default collate function is used which simply converts the batch of
307
+ numpy arrays to a batch of PyTorch tensors. This API is still
308
+ experimental and is subject to change. You can't use this parameter in
309
+ conjunction with ``dtypes`` or ``device``.
310
+ drop_last: Whether to drop the last batch if it's incomplete.
311
+ local_shuffle_buffer_size: If non-None, the data will be randomly shuffled
312
+ using a local in-memory shuffle buffer, and this value will serve as the
313
+ minimum number of rows that must be in the local in-memory shuffle
314
+ buffer in order to yield a batch. When there are no more rows to add to
315
+ the buffer, the remaining rows in the buffer will be drained. This
316
+ buffer size must be greater than or equal to ``batch_size``, and
317
+ therefore ``batch_size`` must also be specified when using local
318
+ shuffling.
319
+ local_shuffle_seed: The seed to use for the local random shuffle.
320
+
321
+ Returns:
322
+ An iterable over Torch Tensor batches.
323
+ """
324
+
325
+ from ray.air._internal.torch_utils import (
326
+ convert_ndarray_batch_to_torch_tensor_batch,
327
+ )
328
+ from ray.train.torch import get_device
329
+
330
+ if collate_fn is not None and (dtypes is not None or device != "auto"):
331
+ raise ValueError(
332
+ "collate_fn cannot be used with dtypes and device."
333
+ "You should manually move the output Torch tensors to the"
334
+ "desired dtype and device outside of collate_fn."
335
+ )
336
+
337
+ if device == "auto":
338
+ # Use the appropriate device for Ray Train, or falls back to CPU if
339
+ # Ray Train is not being used.
340
+ device = get_device()
341
+
342
+ if collate_fn is None:
343
+ # The default collate_fn handles formatting and Tensor creation.
344
+ # Here, we set device=None to defer host to device data transfer
345
+ # to the subsequent finalize_fn.
346
+ def collate_fn(batch: Union[np.ndarray, Dict[str, np.ndarray]]):
347
+ return convert_ndarray_batch_to_torch_tensor_batch(
348
+ batch,
349
+ dtypes=dtypes,
350
+ device=None,
351
+ )
352
+
353
+ # The default finalize_fn handles the host to device data transfer.
354
+ # This is executed in a 1-thread pool separately from collate_fn
355
+ # to allow independent parallelism of these steps.
356
+ def finalize_fn(batch: Union["torch.Tensor", Dict[str, "torch.Tensor"]]):
357
+ if device is not None:
358
+ if isinstance(batch, dict):
359
+ for k, t in batch.items():
360
+ batch[k] = t.to(device=device)
361
+ else:
362
+ batch = batch.to(device=device)
363
+ return batch
364
+
365
+ else:
366
+ finalize_fn = None
367
+
368
+ return self.iter_batches(
369
+ prefetch_batches=prefetch_batches,
370
+ batch_size=batch_size,
371
+ drop_last=drop_last,
372
+ local_shuffle_buffer_size=local_shuffle_buffer_size,
373
+ local_shuffle_seed=local_shuffle_seed,
374
+ _collate_fn=collate_fn,
375
+ _finalize_fn=finalize_fn,
376
+ )
377
+
378
+ def iter_tf_batches(
379
+ self,
380
+ *,
381
+ prefetch_batches: int = 1,
382
+ batch_size: Optional[int] = 256,
383
+ dtypes: Optional[Union["tf.dtypes.DType", Dict[str, "tf.dtypes.DType"]]] = None,
384
+ drop_last: bool = False,
385
+ local_shuffle_buffer_size: Optional[int] = None,
386
+ local_shuffle_seed: Optional[int] = None,
387
+ ) -> Iterable["TensorFlowTensorBatchType"]:
388
+ """Return a batched iterable of TensorFlow Tensors over the dataset.
389
+
390
+ This iterable will yield single-tensor batches of the underlying dataset
391
+ consists of a single column; otherwise, it will yield a dictionary of
392
+ column-tensors.
393
+
394
+ .. tip::
395
+ If you don't need the additional flexibility provided by this method,
396
+ consider using :meth:`~ray.data.Dataset.to_tf` instead. It's easier
397
+ to use.
398
+
399
+ Examples:
400
+ >>> import ray
401
+ >>> for batch in ray.data.range( # doctest: +SKIP
402
+ ... 12,
403
+ ... ).iter_tf_batches(batch_size=4):
404
+ ... print(batch.shape) # doctest: +SKIP
405
+ (4, 1)
406
+ (4, 1)
407
+ (4, 1)
408
+
409
+ Time complexity: O(1)
410
+
411
+ Args:
412
+ prefetch_batches: The number of batches to fetch ahead of the current batch
413
+ to fetch. If set to greater than 0, a separate threadpool will be used
414
+ to fetch the objects to the local node, format the batches, and apply
415
+ the collate_fn. Defaults to 1.
416
+ batch_size: The number of rows in each batch, or None to use entire blocks
417
+ as batches (blocks may contain different number of rows).
418
+ The final batch may include fewer than ``batch_size`` rows if
419
+ ``drop_last`` is ``False``. Defaults to 256.
420
+ dtypes: The TensorFlow dtype(s) for the created tensor(s); if None, the
421
+ dtype will be inferred from the tensor data.
422
+ drop_last: Whether to drop the last batch if it's incomplete.
423
+ local_shuffle_buffer_size: If non-None, the data will be randomly shuffled
424
+ using a local in-memory shuffle buffer, and this value will serve as the
425
+ minimum number of rows that must be in the local in-memory shuffle
426
+ buffer in order to yield a batch. When there are no more rows to add to
427
+ the buffer, the remaining rows in the buffer will be drained. This
428
+ buffer size must be greater than or equal to ``batch_size``, and
429
+ therefore ``batch_size`` must also be specified when using local
430
+ shuffling.
431
+ local_shuffle_seed: The seed to use for the local random shuffle.
432
+
433
+ Returns:
434
+ An iterator over TensorFlow Tensor batches.
435
+ """
436
+ from ray.air._internal.tensorflow_utils import (
437
+ convert_ndarray_batch_to_tf_tensor_batch,
438
+ )
439
+
440
+ batch_iterable = self.iter_batches(
441
+ prefetch_batches=prefetch_batches,
442
+ batch_size=batch_size,
443
+ drop_last=drop_last,
444
+ local_shuffle_buffer_size=local_shuffle_buffer_size,
445
+ local_shuffle_seed=local_shuffle_seed,
446
+ )
447
+ mapped_iterable = map(
448
+ lambda batch: convert_ndarray_batch_to_tf_tensor_batch(
449
+ batch, dtypes=dtypes
450
+ ),
451
+ batch_iterable,
452
+ )
453
+
454
+ return mapped_iterable
455
+
456
    def to_torch(
        self,
        *,
        label_column: Optional[str] = None,
        feature_columns: Optional[
            Union[List[str], List[List[str]], Dict[str, List[str]]]
        ] = None,
        label_column_dtype: Optional["torch.dtype"] = None,
        feature_column_dtypes: Optional[
            Union["torch.dtype", List["torch.dtype"], Dict[str, "torch.dtype"]]
        ] = None,
        batch_size: int = 1,
        prefetch_batches: int = 1,
        drop_last: bool = False,
        local_shuffle_buffer_size: Optional[int] = None,
        local_shuffle_seed: Optional[int] = None,
        unsqueeze_label_tensor: bool = True,
        unsqueeze_feature_tensors: bool = True,
    ) -> "torch.utils.data.IterableDataset":
        """Return a Torch IterableDataset over this dataset.

        This is only supported for datasets convertible to Arrow records.

        It is recommended to use the returned ``IterableDataset`` directly
        instead of passing it into a torch ``DataLoader``.

        Each element in IterableDataset will be a tuple consisting of 2
        elements. The first item contains the feature tensor(s), and the
        second item is the label tensor. Those can take on different
        forms, depending on the specified arguments.

        For the features tensor (N is the ``batch_size`` and n, m, k
        are the number of features per tensor):

        * If ``feature_columns`` is a ``List[str]``, the features will be
          a tensor of shape (N, n), with columns corresponding to
          ``feature_columns``

        * If ``feature_columns`` is a ``List[List[str]]``, the features will be
          a list of tensors of shape [(N, m),...,(N, k)], with columns of each
          tensor corresponding to the elements of ``feature_columns``

        * If ``feature_columns`` is a ``Dict[str, List[str]]``, the features
          will be a dict of key-tensor pairs of shape
          {key1: (N, m),..., keyN: (N, k)}, with columns of each
          tensor corresponding to the value of ``feature_columns`` under the
          key.

        If ``unsqueeze_label_tensor=True`` (default), the label tensor will be
        of shape (N, 1). Otherwise, it will be of shape (N,).
        If ``label_column`` is specified as ``None``, then no column from the
        ``Dataset`` will be treated as the label, and the output label tensor
        will be ``None``.

        Note that you probably want to call ``.split()`` on this dataset if
        there are to be multiple Torch workers consuming the data.

        Time complexity: O(1)

        Args:
            label_column: The name of the column used as the
                label (second element of the output list). Can be None for
                prediction, in which case the second element of returned
                tuple will also be None.
            feature_columns: The names of the columns
                to use as the features. Can be a list of lists or
                a dict of string-list pairs for multi-tensor output.
                If None, then use all columns except the label column as
                the features.
            label_column_dtype: The torch dtype to
                use for the label column. If None, then automatically infer
                the dtype.
            feature_column_dtypes: The dtypes to use for the feature
                tensors. This should match the format of ``feature_columns``,
                or be a single dtype, in which case it will be applied to
                all tensors. If None, then automatically infer the dtype.
            batch_size: How many samples per batch to yield at a time.
                Defaults to 1.
            prefetch_batches: The number of batches to fetch ahead of the current batch
                to fetch. If set to greater than 0, a separate threadpool will be used
                to fetch the objects to the local node, format the batches, and apply
                the collate_fn. Defaults to 1.
            drop_last: Set to True to drop the last incomplete batch,
                if the dataset size is not divisible by the batch size. If
                False and the size of dataset is not divisible by the batch
                size, then the last batch will be smaller. Defaults to False.
            local_shuffle_buffer_size: If non-None, the data will be randomly shuffled
                using a local in-memory shuffle buffer, and this value will serve as the
                minimum number of rows that must be in the local in-memory shuffle
                buffer in order to yield a batch. When there are no more rows to add to
                the buffer, the remaining rows in the buffer will be drained. This
                buffer size must be greater than or equal to ``batch_size``, and
                therefore ``batch_size`` must also be specified when using local
                shuffling.
            local_shuffle_seed: The seed to use for the local random shuffle.
            unsqueeze_label_tensor: If set to True, the label tensor
                will be unsqueezed (reshaped to (N, 1)). Otherwise, it will
                be left as is, that is (N, ). In general, regression loss
                functions expect an unsqueezed tensor, while classification
                loss functions expect a squeezed one. Defaults to True.
            unsqueeze_feature_tensors: If set to True, the features tensors
                will be unsqueezed (reshaped to (N, 1)) before being concatenated into
                the final features tensor. Otherwise, they will be left as is, that is
                (N, ). Defaults to True.

        Returns:
            A torch IterableDataset.
        """
        import torch

        from ray.air._internal.torch_utils import convert_pandas_to_torch_tensor
        from ray.data._internal.torch_iterable_dataset import TorchIterableDataset

        # If an empty collection is passed in, treat it the same as None
        if not feature_columns:
            feature_columns = None

        # Validate that the structure of `feature_column_dtypes` matches the
        # structure of `feature_columns` (dict keys must match, list lengths
        # must match). A single `torch.dtype` is always allowed and applies to
        # every feature tensor.
        if feature_column_dtypes and not isinstance(feature_column_dtypes, torch.dtype):
            if isinstance(feature_columns, dict):
                if not isinstance(feature_column_dtypes, dict):
                    raise TypeError(
                        "If `feature_columns` is a dict, "
                        "`feature_column_dtypes` must be None, `torch.dtype`,"
                        f" or dict, got {type(feature_column_dtypes)}."
                    )
                if set(feature_columns) != set(feature_column_dtypes):
                    raise ValueError(
                        "`feature_columns` and `feature_column_dtypes` "
                        "must have the same keys."
                    )
                if any(not subcolumns for subcolumns in feature_columns.values()):
                    raise ValueError("column list may not be empty")
            elif isinstance(feature_columns[0], (list, tuple)):
                if not isinstance(feature_column_dtypes, (list, tuple)):
                    raise TypeError(
                        "If `feature_columns` is a list of lists, "
                        "`feature_column_dtypes` must be None, `torch.dtype`,"
                        f" or a sequence, got {type(feature_column_dtypes)}."
                    )
                if len(feature_columns) != len(feature_column_dtypes):
                    raise ValueError(
                        "`feature_columns` and `feature_column_dtypes` "
                        "must have the same length."
                    )
                if any(not subcolumns for subcolumns in feature_columns):
                    raise ValueError("column list may not be empty")

        def make_generator():
            # Iterates pandas-format batches and converts each one into
            # (features_tensor, label_tensor) pairs.
            for batch in self.iter_batches(
                batch_size=batch_size,
                batch_format="pandas",
                prefetch_batches=prefetch_batches,
                drop_last=drop_last,
                local_shuffle_buffer_size=local_shuffle_buffer_size,
                local_shuffle_seed=local_shuffle_seed,
            ):
                if label_column:
                    label_tensor = convert_pandas_to_torch_tensor(
                        batch,
                        [label_column],
                        label_column_dtype,
                        unsqueeze=unsqueeze_label_tensor,
                    )
                    # Remove the label column so that, when `feature_columns`
                    # is None, it isn't also included among the features.
                    batch.pop(label_column)
                else:
                    label_tensor = None

                if isinstance(feature_columns, dict):
                    # One tensor per dict key; each dtype entry may be a dict
                    # keyed the same way, or a single dtype for all tensors.
                    features_tensor = {
                        key: convert_pandas_to_torch_tensor(
                            batch,
                            feature_columns[key],
                            (
                                feature_column_dtypes[key]
                                if isinstance(feature_column_dtypes, dict)
                                else feature_column_dtypes
                            ),
                            unsqueeze=unsqueeze_feature_tensors,
                        )
                        for key in feature_columns
                    }
                else:
                    # Covers both the List[str] (single tensor) and
                    # List[List[str]] (list of tensors) cases.
                    features_tensor = convert_pandas_to_torch_tensor(
                        batch,
                        columns=feature_columns,
                        column_dtypes=feature_column_dtypes,
                        unsqueeze=unsqueeze_feature_tensors,
                    )

                yield (features_tensor, label_tensor)

        return TorchIterableDataset(make_generator)
648
+
649
    @PublicAPI
    def to_tf(
        self,
        feature_columns: Union[str, List[str]],
        label_columns: Union[str, List[str]],
        *,
        additional_columns: Union[Optional[str], Optional[List[str]]] = None,
        prefetch_batches: int = 1,
        batch_size: int = 1,
        drop_last: bool = False,
        local_shuffle_buffer_size: Optional[int] = None,
        local_shuffle_seed: Optional[int] = None,
        feature_type_spec: Union["tf.TypeSpec", Dict[str, "tf.TypeSpec"]] = None,
        label_type_spec: Union["tf.TypeSpec", Dict[str, "tf.TypeSpec"]] = None,
        additional_type_spec: Union[
            Optional["tf.TypeSpec"], Optional[Dict[str, "tf.TypeSpec"]]
        ] = None,
    ) -> "tf.data.Dataset":
        """Return a TF Dataset over this dataset.

        .. warning::
            If your dataset contains ragged tensors, this method errors. To prevent
            errors, :ref:`resize your tensors <transforming_tensors>`.

        Examples:
            >>> import ray
            >>> ds = ray.data.read_csv(
            ...     "s3://anonymous@air-example-data/iris.csv"
            ... )
            >>> it = ds.iterator(); it
            DataIterator(Dataset(
               num_rows=?,
               schema={
                  sepal length (cm): double,
                  sepal width (cm): double,
                  petal length (cm): double,
                  petal width (cm): double,
                  target: int64
               }
            ))

            If your model accepts a single tensor as input, specify a single feature column.

            >>> it.to_tf(feature_columns="sepal length (cm)", label_columns="target")
            <_OptionsDataset element_spec=(TensorSpec(shape=(None,), dtype=tf.float64, name='sepal length (cm)'), TensorSpec(shape=(None,), dtype=tf.int64, name='target'))>

            If your model accepts a dictionary as input, specify a list of feature columns.

            >>> it.to_tf(["sepal length (cm)", "sepal width (cm)"], "target")
            <_OptionsDataset element_spec=({'sepal length (cm)': TensorSpec(shape=(None,), dtype=tf.float64, name='sepal length (cm)'), 'sepal width (cm)': TensorSpec(shape=(None,), dtype=tf.float64, name='sepal width (cm)')}, TensorSpec(shape=(None,), dtype=tf.int64, name='target'))>

            If your dataset contains multiple features but your model accepts a single
            tensor as input, combine features with
            :class:`~ray.data.preprocessors.Concatenator`.

            >>> from ray.data.preprocessors import Concatenator
            >>> columns_to_concat = ["sepal length (cm)", "sepal width (cm)", "petal length (cm)", "petal width (cm)"]
            >>> preprocessor = Concatenator(columns=columns_to_concat, output_column_name="features")
            >>> it = preprocessor.transform(ds).iterator()
            >>> it
            DataIterator(Concatenator
            +- Dataset(
                  num_rows=?,
                  schema={
                     sepal length (cm): double,
                     sepal width (cm): double,
                     petal length (cm): double,
                     petal width (cm): double,
                     target: int64
                  }
               ))
            >>> it.to_tf("features", "target")
            <_OptionsDataset element_spec=(TensorSpec(shape=(None, 4), dtype=tf.float64, name='features'), TensorSpec(shape=(None,), dtype=tf.int64, name='target'))>

            If your model accepts different types, shapes, or names of tensors as input, specify the type spec.
            If type specs are not specified, they are automatically inferred from the schema of the iterator.

            >>> import tensorflow as tf
            >>> it.to_tf(
            ...     feature_columns="features",
            ...     label_columns="target",
            ...     feature_type_spec=tf.TensorSpec(shape=(None, 4), dtype=tf.float32, name="features"),
            ...     label_type_spec=tf.TensorSpec(shape=(None,), dtype=tf.float32, name="label")
            ... )
            <_OptionsDataset element_spec=(TensorSpec(shape=(None, 4), dtype=tf.float32, name='features'), TensorSpec(shape=(None,), dtype=tf.float32, name='label'))>

            If your model accepts additional metadata aside from features and label, specify a single additional column or a list of additional columns.
            A common use case is to include sample weights in the data samples and train a ``tf.keras.Model`` with ``tf.keras.Model.fit``.

            >>> import pandas as pd
            >>> ds = ds.add_column("sample weights", lambda df: pd.Series([1] * len(df)))
            >>> it = ds.iterator()
            >>> it.to_tf(feature_columns="sepal length (cm)", label_columns="target", additional_columns="sample weights")
            <_OptionsDataset element_spec=(TensorSpec(shape=(None,), dtype=tf.float64, name='sepal length (cm)'), TensorSpec(shape=(None,), dtype=tf.int64, name='target'), TensorSpec(shape=(None,), dtype=tf.int64, name='sample weights'))>

            If your model accepts different types, shapes, or names for the additional metadata, specify the type spec of the additional column.

            >>> it.to_tf(
            ...     feature_columns="sepal length (cm)",
            ...     label_columns="target",
            ...     additional_columns="sample weights",
            ...     additional_type_spec=tf.TensorSpec(shape=(None,), dtype=tf.float32, name="weight")
            ... )
            <_OptionsDataset element_spec=(TensorSpec(shape=(None,), dtype=tf.float64, name='sepal length (cm)'), TensorSpec(shape=(None,), dtype=tf.int64, name='target'), TensorSpec(shape=(None,), dtype=tf.float32, name='weight'))>

        Args:
            feature_columns: Columns that correspond to model inputs. If this is a
                string, the input data is a tensor. If this is a list, the input data
                is a ``dict`` that maps column names to their tensor representation.
            label_columns: Columns that correspond to model targets. If this is a
                string, the target data is a tensor. If this is a list, the target data
                is a ``dict`` that maps column names to their tensor representation.
            additional_columns: Columns that correspond to sample weights or other metadata.
                If this is a string, the weight data is a tensor. If this is a list, the
                weight data is a ``dict`` that maps column names to their tensor representation.
            prefetch_batches: The number of batches to fetch ahead of the current batch
                to fetch. If set to greater than 0, a separate threadpool will be used
                to fetch the objects to the local node, format the batches, and apply
                the collate_fn. Defaults to 1.
            batch_size: Record batch size. Defaults to 1.
            drop_last: Set to True to drop the last incomplete batch,
                if the dataset size is not divisible by the batch size. If
                False and the size of dataset is not divisible by the batch
                size, then the last batch will be smaller. Defaults to False.
            local_shuffle_buffer_size: If non-None, the data will be randomly shuffled
                using a local in-memory shuffle buffer, and this value will serve as the
                minimum number of rows that must be in the local in-memory shuffle
                buffer in order to yield a batch. When there are no more rows to add to
                the buffer, the remaining rows in the buffer will be drained. This
                buffer size must be greater than or equal to ``batch_size``, and
                therefore ``batch_size`` must also be specified when using local
                shuffling.
            local_shuffle_seed: The seed to use for the local random shuffle.
            feature_type_spec: The `tf.TypeSpec` of `feature_columns`. If there is
                only one column, specify a `tf.TypeSpec`. If there are multiple columns,
                specify a ``dict`` that maps column names to their `tf.TypeSpec`.
                Default is `None` to automatically infer the type of each column.
            label_type_spec: The `tf.TypeSpec` of `label_columns`. If there is
                only one column, specify a `tf.TypeSpec`. If there are multiple columns,
                specify a ``dict`` that maps column names to their `tf.TypeSpec`.
                Default is `None` to automatically infer the type of each column.
            additional_type_spec: The `tf.TypeSpec` of `additional_columns`. If there
                is only one column, specify a `tf.TypeSpec`. If there are multiple
                columns, specify a ``dict`` that maps column names to their `tf.TypeSpec`.
                Default is `None` to automatically infer the type of each column.

        Returns:
            A ``tf.data.Dataset`` that yields inputs and targets.
        """  # noqa: E501

        from ray.air._internal.tensorflow_utils import (
            convert_ndarray_to_tf_tensor,
            get_type_spec,
        )

        try:
            import tensorflow as tf
        except ImportError:
            raise ValueError("tensorflow must be installed!")

        def validate_column(column: str) -> None:
            # NOTE: `valid_columns` is a closure variable bound in the
            # type-spec inference branches below, before this helper is called.
            if column not in valid_columns:
                raise ValueError(
                    f"You specified '{column}' in `feature_columns`, "
                    f"`label_columns`, or `additional_columns`, but there's no "
                    f"column named '{column}' in the dataset. "
                    f"Valid column names are: {valid_columns}."
                )

        def validate_columns(columns: Union[str, List]) -> None:
            # Accepts either a single column name or a list of names.
            if isinstance(columns, list):
                for column in columns:
                    validate_column(column)
            else:
                validate_column(columns)

        def convert_batch_to_tensors(
            batch: Dict[str, np.ndarray],
            *,
            columns: Union[str, List[str]],
            type_spec: Union[tf.TypeSpec, Dict[str, tf.TypeSpec]],
        ) -> Union[tf.Tensor, Dict[str, tf.Tensor]]:
            # A single column name yields one tensor; a list of names yields a
            # dict of column name -> tensor (matching the element_spec shape).
            if isinstance(columns, str):
                return convert_ndarray_to_tf_tensor(batch[columns], type_spec=type_spec)
            return {
                column: convert_ndarray_to_tf_tensor(
                    batch[column], type_spec=type_spec[column]
                )
                for column in columns
            }

        def generator():
            # Pulls Numpy-format batches and converts each into the
            # (features, labels[, additional]) structure that
            # `tf.data.Dataset.from_generator` expects.
            for batch in self.iter_batches(
                prefetch_batches=prefetch_batches,
                batch_size=batch_size,
                drop_last=drop_last,
                local_shuffle_buffer_size=local_shuffle_buffer_size,
                local_shuffle_seed=local_shuffle_seed,
            ):
                assert isinstance(batch, dict)
                features = convert_batch_to_tensors(
                    batch, columns=feature_columns, type_spec=feature_type_spec
                )
                labels = convert_batch_to_tensors(
                    batch, columns=label_columns, type_spec=label_type_spec
                )

                if additional_columns is None:
                    yield features, labels
                else:
                    additional_metadata = convert_batch_to_tensors(
                        batch,
                        columns=additional_columns,
                        type_spec=additional_type_spec,
                    )
                    yield features, labels, additional_metadata

        # Infer any type specs the caller did not provide from the dataset
        # schema, validating the requested column names along the way.
        if feature_type_spec is None or label_type_spec is None:
            schema = self.schema()
            valid_columns = set(schema.names)
            validate_columns(feature_columns)
            validate_columns(label_columns)
            feature_type_spec = get_type_spec(schema, columns=feature_columns)
            label_type_spec = get_type_spec(schema, columns=label_columns)

        if additional_columns is not None and additional_type_spec is None:
            schema = self.schema()
            valid_columns = set(schema.names)
            validate_columns(additional_columns)
            additional_type_spec = get_type_spec(schema, columns=additional_columns)

        # The output signature is a 2-tuple or 3-tuple depending on whether
        # additional (e.g. sample-weight) columns were requested.
        if additional_columns is not None:
            dataset = tf.data.Dataset.from_generator(
                generator,
                output_signature=(
                    feature_type_spec,
                    label_type_spec,
                    additional_type_spec,
                ),
            )
        else:
            dataset = tf.data.Dataset.from_generator(
                generator, output_signature=(feature_type_spec, label_type_spec)
            )

        # Disable tf.distribute auto-sharding: sharding is handled by Ray Data
        # splitting the stream, not by tf.data.
        options = tf.data.Options()
        options.experimental_distribute.auto_shard_policy = (
            tf.data.experimental.AutoShardPolicy.OFF
        )
        return dataset.with_options(options)
899
+
900
+ @PublicAPI
901
+ def materialize(self) -> "MaterializedDataset":
902
+ """Execute and materialize this data iterator into object store memory.
903
+
904
+ .. note::
905
+ This method triggers the execution and materializes all blocks
906
+ of the iterator, returning its contents as a
907
+ :class:`~ray.data.dataset.MaterializedDataset` for further processing.
908
+ """
909
+
910
+ from ray.data.dataset import MaterializedDataset
911
+
912
+ ref_bundles_iter, stats, _ = self._to_ref_bundle_iterator()
913
+
914
+ ref_bundles = list(ref_bundles_iter)
915
+ execution_plan = ExecutionPlan(stats)
916
+ logical_plan = LogicalPlan(
917
+ InputData(input_data=ref_bundles),
918
+ execution_plan._context,
919
+ )
920
+ return MaterializedDataset(
921
+ execution_plan,
922
+ logical_plan,
923
+ )
924
+
925
+ def __del__(self):
926
+ # Clear metrics on deletion in case the iterator was not fully consumed.
927
+ StatsManager.clear_iteration_metrics(self._get_dataset_tag())
928
+
929
+
930
# Backwards compatibility alias: keep the old public name `DatasetIterator`
# importable for user code written before the class was renamed to
# `DataIterator`.
DatasetIterator = DataIterator
.venv/lib/python3.11/site-packages/ray/data/preprocessor.py ADDED
@@ -0,0 +1,318 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import abc
2
+ import base64
3
+ import collections
4
+ import pickle
5
+ import warnings
6
+ from enum import Enum
7
+ from typing import TYPE_CHECKING, Any, Dict, Union
8
+
9
+ from ray.air.util.data_batch_conversion import BatchFormat
10
+ from ray.util.annotations import DeveloperAPI, PublicAPI
11
+
12
+ if TYPE_CHECKING:
13
+ import numpy as np
14
+ import pandas as pd
15
+
16
+ from ray.air.data_batch_type import DataBatchType
17
+ from ray.data import Dataset
18
+
19
+
20
@PublicAPI(stability="beta")
class PreprocessorNotFittedException(RuntimeError):
    """Error raised when the preprocessor needs to be fitted first.

    Raised by :meth:`Preprocessor.transform` and
    :meth:`Preprocessor.transform_batch` when called on a fittable
    preprocessor before :meth:`Preprocessor.fit` has been called.
    """
25
+
26
+
27
+ @PublicAPI(stability="beta")
28
+ class Preprocessor(abc.ABC):
29
+ """Implements an ML preprocessing operation.
30
+
31
+ Preprocessors are stateful objects that can be fitted against a Dataset and used
32
+ to transform both local data batches and distributed data. For example, a
33
+ Normalization preprocessor may calculate the mean and stdev of a field during
34
+ fitting, and uses these attributes to implement its normalization transform.
35
+
36
+ Preprocessors can also be stateless and transform data without needed to be fitted.
37
+ For example, a preprocessor may simply remove a column, which does not require
38
+ any state to be fitted.
39
+
40
+ If you are implementing your own Preprocessor sub-class, you should override the
41
+ following:
42
+
43
+ * ``_fit`` if your preprocessor is stateful. Otherwise, set
44
+ ``_is_fittable=False``.
45
+ * ``_transform_pandas`` and/or ``_transform_numpy`` for best performance,
46
+ implement both. Otherwise, the data will be converted to the match the
47
+ implemented method.
48
+ """
49
+
50
    class FitStatus(str, Enum):
        """The fit status of preprocessor."""

        # The preprocessor is stateless and never needs fitting.
        NOT_FITTABLE = "NOT_FITTABLE"
        # The preprocessor is fittable but has not been fitted yet.
        NOT_FITTED = "NOT_FITTED"
        # Only meaningful for Chain preprocessors.
        # At least one contained preprocessor in the chain preprocessor
        # is fitted and at least one that can be fitted is not fitted yet.
        # This is a state that shows up if the caller only interacts
        # with the chain preprocessor through the intended Preprocessor APIs.
        PARTIALLY_FITTED = "PARTIALLY_FITTED"
        # Fitting is complete; the preprocessor is ready to transform data.
        FITTED = "FITTED"
62
+
63
+ # Preprocessors that do not need to be fitted must override this.
64
+ _is_fittable = True
65
+
66
+ def _check_has_fitted_state(self):
67
+ """Checks if the Preprocessor has fitted state.
68
+
69
+ This is also used as an indiciation if the Preprocessor has been fit, following
70
+ convention from Ray versions prior to 2.6.
71
+ This allows preprocessors that have been fit in older versions of Ray to be
72
+ used to transform data in newer versions.
73
+ """
74
+
75
+ fitted_vars = [v for v in vars(self) if v.endswith("_")]
76
+ return bool(fitted_vars)
77
+
78
+ def fit_status(self) -> "Preprocessor.FitStatus":
79
+ if not self._is_fittable:
80
+ return Preprocessor.FitStatus.NOT_FITTABLE
81
+ elif (
82
+ hasattr(self, "_fitted") and self._fitted
83
+ ) or self._check_has_fitted_state():
84
+ return Preprocessor.FitStatus.FITTED
85
+ else:
86
+ return Preprocessor.FitStatus.NOT_FITTED
87
+
88
+ def fit(self, ds: "Dataset") -> "Preprocessor":
89
+ """Fit this Preprocessor to the Dataset.
90
+
91
+ Fitted state attributes will be directly set in the Preprocessor.
92
+
93
+ Calling it more than once will overwrite all previously fitted state:
94
+ ``preprocessor.fit(A).fit(B)`` is equivalent to ``preprocessor.fit(B)``.
95
+
96
+ Args:
97
+ ds: Input dataset.
98
+
99
+ Returns:
100
+ Preprocessor: The fitted Preprocessor with state attributes.
101
+ """
102
+ fit_status = self.fit_status()
103
+ if fit_status == Preprocessor.FitStatus.NOT_FITTABLE:
104
+ # No-op as there is no state to be fitted.
105
+ return self
106
+
107
+ if fit_status in (
108
+ Preprocessor.FitStatus.FITTED,
109
+ Preprocessor.FitStatus.PARTIALLY_FITTED,
110
+ ):
111
+ warnings.warn(
112
+ "`fit` has already been called on the preprocessor (or at least one "
113
+ "contained preprocessors if this is a chain). "
114
+ "All previously fitted state will be overwritten!"
115
+ )
116
+
117
+ fitted_ds = self._fit(ds)
118
+ self._fitted = True
119
+ return fitted_ds
120
+
121
+ def fit_transform(self, ds: "Dataset") -> "Dataset":
122
+ """Fit this Preprocessor to the Dataset and then transform the Dataset.
123
+
124
+ Calling it more than once will overwrite all previously fitted state:
125
+ ``preprocessor.fit_transform(A).fit_transform(B)``
126
+ is equivalent to ``preprocessor.fit_transform(B)``.
127
+
128
+ Args:
129
+ ds: Input Dataset.
130
+
131
+ Returns:
132
+ ray.data.Dataset: The transformed Dataset.
133
+ """
134
+ self.fit(ds)
135
+ return self.transform(ds)
136
+
137
+ def transform(self, ds: "Dataset") -> "Dataset":
138
+ """Transform the given dataset.
139
+
140
+ Args:
141
+ ds: Input Dataset.
142
+
143
+ Returns:
144
+ ray.data.Dataset: The transformed Dataset.
145
+
146
+ Raises:
147
+ PreprocessorNotFittedException: if ``fit`` is not called yet.
148
+ """
149
+ fit_status = self.fit_status()
150
+ if fit_status in (
151
+ Preprocessor.FitStatus.PARTIALLY_FITTED,
152
+ Preprocessor.FitStatus.NOT_FITTED,
153
+ ):
154
+ raise PreprocessorNotFittedException(
155
+ "`fit` must be called before `transform`, "
156
+ "or simply use fit_transform() to run both steps"
157
+ )
158
+ transformed_ds = self._transform(ds)
159
+ return transformed_ds
160
+
161
+ def transform_batch(self, data: "DataBatchType") -> "DataBatchType":
162
+ """Transform a single batch of data.
163
+
164
+ The data will be converted to the format supported by the Preprocessor,
165
+ based on which ``_transform_*`` methods are implemented.
166
+
167
+ Args:
168
+ data: Input data batch.
169
+
170
+ Returns:
171
+ DataBatchType:
172
+ The transformed data batch. This may differ
173
+ from the input type depending on which ``_transform_*`` methods
174
+ are implemented.
175
+ """
176
+ fit_status = self.fit_status()
177
+ if fit_status in (
178
+ Preprocessor.FitStatus.PARTIALLY_FITTED,
179
+ Preprocessor.FitStatus.NOT_FITTED,
180
+ ):
181
+ raise PreprocessorNotFittedException(
182
+ "`fit` must be called before `transform_batch`."
183
+ )
184
+ return self._transform_batch(data)
185
+
186
    @DeveloperAPI
    def _fit(self, ds: "Dataset") -> "Preprocessor":
        """Sub-classes should override this instead of fit().

        Implementations should compute fitted state from ``ds`` and return
        the fitted preprocessor (typically ``self``).
        """
        raise NotImplementedError()
190
+
191
+ def _determine_transform_to_use(self) -> BatchFormat:
192
+ """Determine which batch format to use based on Preprocessor implementation.
193
+
194
+ * If only `_transform_pandas` is implemented, then use ``pandas`` batch format.
195
+ * If only `_transform_numpy` is implemented, then use ``numpy`` batch format.
196
+ * If both are implemented, then use the Preprocessor defined preferred batch
197
+ format.
198
+ """
199
+
200
+ has_transform_pandas = (
201
+ self.__class__._transform_pandas != Preprocessor._transform_pandas
202
+ )
203
+ has_transform_numpy = (
204
+ self.__class__._transform_numpy != Preprocessor._transform_numpy
205
+ )
206
+
207
+ if has_transform_numpy and has_transform_pandas:
208
+ return self.preferred_batch_format()
209
+ elif has_transform_numpy:
210
+ return BatchFormat.NUMPY
211
+ elif has_transform_pandas:
212
+ return BatchFormat.PANDAS
213
+ else:
214
+ raise NotImplementedError(
215
+ "None of `_transform_numpy` or `_transform_pandas` are implemented. "
216
+ "At least one of these transform functions must be implemented "
217
+ "for Preprocessor transforms."
218
+ )
219
+
220
+ def _transform(self, ds: "Dataset") -> "Dataset":
221
+ # TODO(matt): Expose `batch_size` or similar configurability.
222
+ # The default may be too small for some datasets and too large for others.
223
+ transform_type = self._determine_transform_to_use()
224
+
225
+ # Our user-facing batch format should only be pandas or NumPy, other
226
+ # formats {arrow, simple} are internal.
227
+ kwargs = self._get_transform_config()
228
+ if transform_type == BatchFormat.PANDAS:
229
+ return ds.map_batches(
230
+ self._transform_pandas, batch_format=BatchFormat.PANDAS, **kwargs
231
+ )
232
+ elif transform_type == BatchFormat.NUMPY:
233
+ return ds.map_batches(
234
+ self._transform_numpy, batch_format=BatchFormat.NUMPY, **kwargs
235
+ )
236
+ else:
237
+ raise ValueError(
238
+ "Invalid transform type returned from _determine_transform_to_use; "
239
+ f'"pandas" and "numpy" allowed, but got: {transform_type}'
240
+ )
241
+
242
    def _get_transform_config(self) -> Dict[str, Any]:
        """Returns kwargs to be passed to :meth:`ray.data.Dataset.map_batches`.

        This can be implemented by subclassing preprocessors.
        """
        # Base implementation: no extra map_batches configuration.
        return {}
248
+
249
+ def _transform_batch(self, data: "DataBatchType") -> "DataBatchType":
250
+ # For minimal install to locally import air modules
251
+ import numpy as np
252
+ import pandas as pd
253
+
254
+ from ray.air.util.data_batch_conversion import (
255
+ _convert_batch_type_to_numpy,
256
+ _convert_batch_type_to_pandas,
257
+ )
258
+
259
+ try:
260
+ import pyarrow
261
+ except ImportError:
262
+ pyarrow = None
263
+
264
+ if not isinstance(
265
+ data, (pd.DataFrame, pyarrow.Table, collections.abc.Mapping, np.ndarray)
266
+ ):
267
+ raise ValueError(
268
+ "`transform_batch` is currently only implemented for Pandas "
269
+ "DataFrames, pyarrow Tables, NumPy ndarray and dictionary of "
270
+ f"ndarray. Got {type(data)}."
271
+ )
272
+
273
+ transform_type = self._determine_transform_to_use()
274
+
275
+ if transform_type == BatchFormat.PANDAS:
276
+ return self._transform_pandas(_convert_batch_type_to_pandas(data))
277
+ elif transform_type == BatchFormat.NUMPY:
278
+ return self._transform_numpy(_convert_batch_type_to_numpy(data))
279
+
280
    @DeveloperAPI
    def _transform_pandas(self, df: "pd.DataFrame") -> "pd.DataFrame":
        """Run the transformation on a data batch in a Pandas DataFrame format.

        Subclasses override this to support the pandas batch format.
        """
        raise NotImplementedError()
284
+
285
    @DeveloperAPI
    def _transform_numpy(
        self, np_data: Union["np.ndarray", Dict[str, "np.ndarray"]]
    ) -> Union["np.ndarray", Dict[str, "np.ndarray"]]:
        """Run the transformation on a data batch in a NumPy ndarray format.

        Subclasses override this to support the numpy batch format; the input
        is either a single ndarray or a column-name -> ndarray mapping.
        """
        raise NotImplementedError()
291
+
292
    @classmethod
    @DeveloperAPI
    def preferred_batch_format(cls) -> BatchFormat:
        """Batch format hint for upstream producers to try yielding best block format.

        The preferred batch format to use if both `_transform_pandas` and
        `_transform_numpy` are implemented. Defaults to Pandas.

        Can be overridden by Preprocessor classes depending on which transform
        path is the most optimal.
        """
        return BatchFormat.PANDAS
304
+
305
+ @DeveloperAPI
306
+ def serialize(self) -> str:
307
+ """Return this preprocessor serialized as a string.
308
+ Note: this is not a stable serialization format as it uses `pickle`.
309
+ """
310
+ # Convert it to a plain string so that it can be included as JSON metadata
311
+ # in Trainer checkpoints.
312
+ return base64.b64encode(pickle.dumps(self)).decode("ascii")
313
+
314
+ @staticmethod
315
+ @DeveloperAPI
316
+ def deserialize(serialized: str) -> "Preprocessor":
317
+ """Load the original preprocessor serialized via `self.serialize()`."""
318
+ return pickle.loads(base64.b64decode(serialized))
.venv/lib/python3.11/site-packages/ray/data/random_access_dataset.py ADDED
@@ -0,0 +1,293 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import bisect
2
+ import logging
3
+ import random
4
+ import time
5
+ from collections import defaultdict
6
+ from typing import TYPE_CHECKING, Any, List, Optional
7
+
8
+ import numpy as np
9
+
10
+ import ray
11
+ from ray.data._internal.execution.interfaces.ref_bundle import (
12
+ _ref_bundles_iterator_to_block_refs_list,
13
+ )
14
+ from ray.data._internal.remote_fn import cached_remote_fn
15
+ from ray.data.block import BlockAccessor
16
+ from ray.data.context import DataContext
17
+ from ray.types import ObjectRef
18
+ from ray.util.annotations import PublicAPI
19
+
20
+ try:
21
+ import pyarrow as pa
22
+ except ImportError:
23
+ pa = None
24
+
25
+ if TYPE_CHECKING:
26
+ from ray.data import Dataset
27
+
28
+ logger = logging.getLogger(__name__)
29
+
30
+
31
@PublicAPI(stability="alpha")
class RandomAccessDataset:
    """A class that provides distributed, random access to a Dataset.

    See: ``Dataset.to_random_access_dataset()``.
    """

    def __init__(
        self,
        ds: "Dataset",
        key: str,
        num_workers: int,
    ):
        """Construct a RandomAccessDataset (internal API).

        The constructor is a private API. Use ``ds.to_random_access_dataset()``
        to construct a RandomAccessDataset.
        """
        # A schema that is a plain Python type indicates simple (non-Arrow)
        # blocks, which this index does not support.
        schema = ds.schema(fetch_if_missing=True)
        if schema is None or isinstance(schema, type):
            raise ValueError("RandomAccessDataset only supports Arrow-format blocks.")

        start = time.perf_counter()
        logger.info("[setup] Indexing dataset by sort key.")
        # Sorting by the key makes per-block binary search possible and gives
        # globally ordered block bounds.
        sorted_ds = ds.sort(key)
        get_bounds = cached_remote_fn(_get_bounds)
        bundles = sorted_ds.iter_internal_ref_bundles()
        blocks = _ref_bundles_iterator_to_block_refs_list(bundles)

        logger.info("[setup] Computing block range bounds.")
        # _get_bounds returns (first, last) key per block, or None for empty
        # blocks; empty blocks are dropped from the index.
        bounds = ray.get([get_bounds.remote(b, key) for b in blocks])
        self._non_empty_blocks = []
        self._lower_bound = None
        self._upper_bounds = []
        for i, b in enumerate(bounds):
            if b:
                self._non_empty_blocks.append(blocks[i])
                if self._lower_bound is None:
                    self._lower_bound = b[0]
                self._upper_bounds.append(b[1])

        logger.info("[setup] Creating {} random access workers.".format(num_workers))
        ctx = DataContext.get_current()
        scheduling_strategy = ctx.scheduling_strategy
        self._workers = [
            _RandomAccessWorker.options(scheduling_strategy=scheduling_strategy).remote(
                key
            )
            for _ in range(num_workers)
        ]
        (
            self._block_to_workers_map,
            self._worker_to_blocks_map,
        ) = self._compute_block_to_worker_assignments()

        logger.info(
            "[setup] Worker to blocks assignment: {}".format(self._worker_to_blocks_map)
        )
        # Each worker materializes (ray.get) its assigned blocks into memory.
        ray.get(
            [
                w.assign_blocks.remote(
                    {
                        i: self._non_empty_blocks[i]
                        for i in self._worker_to_blocks_map[w]
                    }
                )
                for w in self._workers
            ]
        )

        logger.info("[setup] Finished assigning blocks to workers.")
        self._build_time = time.perf_counter() - start

    def _compute_block_to_worker_assignments(self):
        """Assign each non-empty block to one or more workers, preferring
        workers co-located with the block's object-store copy."""
        # Return values.
        block_to_workers: dict[int, List["ray.ActorHandle"]] = defaultdict(list)
        worker_to_blocks: dict["ray.ActorHandle", List[int]] = defaultdict(list)

        # Aux data structures.
        loc_to_workers: dict[str, List["ray.ActorHandle"]] = defaultdict(list)
        locs = ray.get([w.ping.remote() for w in self._workers])
        for i, loc in enumerate(locs):
            loc_to_workers[loc].append(self._workers[i])
        block_locs = ray.experimental.get_object_locations(self._non_empty_blocks)

        # First, try to assign all blocks to all workers at its location.
        for block_idx, block in enumerate(self._non_empty_blocks):
            block_info = block_locs[block]
            locs = block_info.get("node_ids", [])
            for loc in locs:
                for worker in loc_to_workers[loc]:
                    block_to_workers[block_idx].append(worker)
                    worker_to_blocks[worker].append(block_idx)

        # Randomly assign any leftover blocks to at least one worker.
        # TODO: the load balancing here could be improved.
        for block_idx, block in enumerate(self._non_empty_blocks):
            if len(block_to_workers[block_idx]) == 0:
                worker = random.choice(self._workers)
                block_to_workers[block_idx].append(worker)
                worker_to_blocks[worker].append(block_idx)

        return block_to_workers, worker_to_blocks

    def get_async(self, key: Any) -> ObjectRef[Any]:
        """Asynchronously finds the record for a single key.

        Args:
            key: The key of the record to find.

        Returns:
            ObjectRef containing the record (in pydict form), or None if not found.
        """
        block_index = self._find_le(key)
        if block_index is None:
            # Key is outside the indexed range; resolve to None.
            return ray.put(None)
        return self._worker_for(block_index).get.remote(block_index, key)

    def multiget(self, keys: List[Any]) -> List[Optional[Any]]:
        """Synchronously find the records for a list of keys.

        Args:
            keys: List of keys to find the records for.

        Returns:
            List of found records (in pydict form), or None for missing records.
        """
        # Group keys by the block that could contain them so each worker gets
        # a single batched RPC.
        batches = defaultdict(list)
        for k in keys:
            batches[self._find_le(k)].append(k)
        futures = {}
        for index, keybatch in batches.items():
            if index is None:
                # Keys outside the indexed range resolve to None below.
                continue
            fut = self._worker_for(index).multiget.remote(
                [index] * len(keybatch), keybatch
            )
            futures[index] = fut
        results = {}
        for i, fut in futures.items():
            keybatch = batches[i]
            values = ray.get(fut)
            for k, v in zip(keybatch, values):
                results[k] = v
        # Preserve input order; missing keys map to None.
        return [results.get(k) for k in keys]

    def stats(self) -> str:
        """Returns a string containing access timing information."""
        stats = ray.get([w.stats.remote() for w in self._workers])
        total_time = sum(s["total_time"] for s in stats)
        accesses = [s["num_accesses"] for s in stats]
        blocks = [s["num_blocks"] for s in stats]
        msg = "RandomAccessDataset:\n"
        msg += "- Build time: {}s\n".format(round(self._build_time, 2))
        msg += "- Num workers: {}\n".format(len(stats))
        msg += "- Blocks per worker: {} min, {} max, {} mean\n".format(
            min(blocks), max(blocks), int(sum(blocks) / len(blocks))
        )
        msg += "- Accesses per worker: {} min, {} max, {} mean\n".format(
            min(accesses), max(accesses), int(sum(accesses) / len(accesses))
        )
        # "+ 1" guards against division by zero when there were no accesses.
        msg += "- Mean access time: {}us\n".format(
            int(total_time / (1 + sum(accesses)) * 1e6)
        )
        return msg

    def _worker_for(self, block_index: int):
        # Randomly pick among the workers holding this block (load spreading).
        return random.choice(self._block_to_workers_map[block_index])

    def _find_le(self, x: Any) -> Optional[int]:
        """Return the index of the block whose key range may contain ``x``,
        or None if ``x`` falls outside the indexed range."""
        # First block whose upper bound is >= x.
        i = bisect.bisect_left(self._upper_bounds, x)
        if i >= len(self._upper_bounds) or x < self._lower_bound:
            return None
        return i
205
+
206
+
207
@ray.remote(num_cpus=0)
class _RandomAccessWorker:
    """Actor that pins a subset of sorted blocks in memory and serves
    point lookups against them by the configured key field."""

    def __init__(self, key_field):
        # Mapping from global block index -> materialized block; populated
        # by assign_blocks().
        self.blocks = None
        self.key_field = key_field
        # Access statistics reported via stats().
        self.num_accesses = 0
        self.total_time = 0

    def assign_blocks(self, block_ref_dict):
        """Resolve the given block refs and hold them in actor memory."""
        self.blocks = {k: ray.get(ref) for k, ref in block_ref_dict.items()}

    def get(self, block_index, key):
        """Look up ``key`` in the block at ``block_index``; None if absent."""
        start = time.perf_counter()
        result = self._get(block_index, key)
        self.total_time += time.perf_counter() - start
        self.num_accesses += 1
        return result

    def multiget(self, block_indices, keys):
        """Look up multiple keys; vectorized when all target one Arrow block."""
        start = time.perf_counter()
        # Fetch the first block once; the original re-indexed self.blocks
        # for both the type check and the fast path.
        block = self.blocks[block_indices[0]]
        if len(set(block_indices)) == 1 and isinstance(block, pa.Table):
            # Fast path: use np.searchsorted for vectorized search on a single
            # block. This is ~3x faster than the naive case.
            col = block[self.key_field]
            indices = np.searchsorted(col, keys)
            acc = BlockAccessor.for_block(block)
            result = [acc._get_row(i) for i in indices]
            # assert result == [self._get(i, k) for i, k in zip(block_indices, keys)]
        else:
            result = [self._get(i, k) for i, k in zip(block_indices, keys)]
        self.total_time += time.perf_counter() - start
        self.num_accesses += 1
        return result

    def ping(self):
        """Return this actor's node id (used for locality-aware assignment)."""
        return ray.get_runtime_context().get_node_id()

    def stats(self) -> dict:
        """Return block count and access timing counters."""
        return {
            "num_blocks": len(self.blocks),
            "num_accesses": self.num_accesses,
            "total_time": self.total_time,
        }

    def _get(self, block_index, key):
        if block_index is None:
            return None
        block = self.blocks[block_index]
        column = block[self.key_field]
        if isinstance(block, pa.Table):
            # Wrap the Arrow column so bisect sees plain Python values.
            column = _ArrowListWrapper(column)
        i = _binary_search_find(column, key)
        if i is None:
            return None
        acc = BlockAccessor.for_block(block)
        return acc._get_row(i)
267
+
268
+
269
+ def _binary_search_find(column, x):
270
+ i = bisect.bisect_left(column, x)
271
+ if i != len(column) and column[i] == x:
272
+ return i
273
+ return None
274
+
275
+
276
class _ArrowListWrapper:
    """Adapts an Arrow column to the sequence protocol, converting each
    element to a plain Python value so ``bisect`` can compare it."""

    def __init__(self, arrow_col):
        self.arrow_col = arrow_col

    def __len__(self):
        return len(self.arrow_col)

    def __getitem__(self, i):
        item = self.arrow_col[i]
        return item.as_py()
285
+
286
+
287
def _get_bounds(block, key):
    """Return the (first, last) values of ``key`` in a sorted block,
    or None when the block is empty."""
    if len(block) == 0:
        return None
    column = block[key]
    lo, hi = column[0], column[len(block) - 1]
    if isinstance(block, pa.Table):
        # Arrow scalars must be converted to plain Python values.
        lo, hi = lo.as_py(), hi.as_py()
    return (lo, hi)
.venv/lib/python3.11/site-packages/ray/data/read_api.py ADDED
The diff for this file is too large to render. See raw diff
 
.venv/lib/python3.11/site-packages/ray/includes/__init__.pxd ADDED
File without changes
.venv/lib/python3.11/site-packages/ray/includes/common.pxd ADDED
@@ -0,0 +1,749 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from libcpp cimport bool as c_bool
2
+ from libcpp.memory cimport shared_ptr, unique_ptr
3
+ from libcpp.string cimport string as c_string
4
+
5
+ from libc.stdint cimport uint8_t, int32_t, uint64_t, int64_t, uint32_t
6
+ from libcpp.unordered_map cimport unordered_map
7
+ from libcpp.vector cimport vector as c_vector
8
+ from libcpp.pair cimport pair as c_pair
9
+ from ray.includes.optional cimport (
10
+ optional,
11
+ )
12
+ from ray.includes.unique_ids cimport (
13
+ CActorID,
14
+ CJobID,
15
+ CClusterID,
16
+ CWorkerID,
17
+ CObjectID,
18
+ CTaskID,
19
+ CPlacementGroupID,
20
+ CNodeID,
21
+ )
22
+ from ray.includes.function_descriptor cimport (
23
+ CFunctionDescriptor,
24
+ )
25
+
26
+
27
# Inline C++ shim exposing std::move to Cython under the name `move`,
# with overloads for both lvalue and rvalue references.
cdef extern from * namespace "polyfill" nogil:
    """
    namespace polyfill {

    template <typename T>
    inline typename std::remove_reference<T>::type&& move(T& t) {
      return std::move(t);
    }

    template <typename T>
    inline typename std::remove_reference<T>::type&& move(T&& t) {
      return std::move(t);
    }

    } // namespace polyfill
    """
    cdef T move[T](T)
44
+
45
+
46
# Declarations for ray::Status / ray::StatusCode: constructors, static
# factory methods for each error category, Is*() predicates, and accessors.
cdef extern from "ray/common/status.h" namespace "ray" nogil:
    # TODO(ryw) in Cython 3.x we can directly use `cdef enum class CStatusCode`
    cdef cppclass CStatusCode "ray::StatusCode":
        pass
    cdef CStatusCode CStatusCode_OK "ray::StatusCode::OK"
    c_bool operator==(CStatusCode lhs, CStatusCode rhs)

    cdef cppclass CRayStatus "ray::Status":
        CRayStatus()
        CRayStatus(CStatusCode code, const c_string &msg)
        CRayStatus(CStatusCode code, const c_string &msg, int rpc_code)
        CRayStatus(const CRayStatus &s)

        # Static factories, one per status category.
        @staticmethod
        CRayStatus OK()

        @staticmethod
        CRayStatus OutOfMemory(const c_string &msg)

        @staticmethod
        CRayStatus KeyError(const c_string &msg)

        @staticmethod
        CRayStatus Invalid(const c_string &msg)

        @staticmethod
        CRayStatus IOError(const c_string &msg)

        @staticmethod
        CRayStatus TypeError(const c_string &msg)

        @staticmethod
        CRayStatus UnknownError(const c_string &msg)

        @staticmethod
        CRayStatus NotImplemented(const c_string &msg)

        @staticmethod
        CRayStatus ObjectStoreFull(const c_string &msg)

        @staticmethod
        CRayStatus RedisError(const c_string &msg)

        @staticmethod
        CRayStatus TimedOut(const c_string &msg)

        @staticmethod
        CRayStatus InvalidArgument(const c_string &msg)

        @staticmethod
        CRayStatus Interrupted(const c_string &msg)

        @staticmethod
        CRayStatus IntentionalSystemExit(const c_string &msg)

        @staticmethod
        CRayStatus UnexpectedSystemExit(const c_string &msg)

        @staticmethod
        CRayStatus CreationTaskError(const c_string &msg)

        @staticmethod
        CRayStatus NotFound()

        @staticmethod
        CRayStatus ObjectRefEndOfStream()

        # Category predicates.
        c_bool ok()
        c_bool IsOutOfMemory()
        c_bool IsKeyError()
        c_bool IsInvalid()
        c_bool IsIOError()
        c_bool IsTypeError()
        c_bool IsUnknownError()
        c_bool IsNotImplemented()
        c_bool IsObjectStoreFull()
        c_bool IsAlreadyExists()
        c_bool IsOutOfDisk()
        c_bool IsRedisError()
        c_bool IsTimedOut()
        c_bool IsInvalidArgument()
        c_bool IsInterrupted()
        c_bool ShouldExitWorker()
        c_bool IsObjectNotFound()
        c_bool IsNotFound()
        c_bool IsObjectUnknownOwner()
        c_bool IsRpcError()
        c_bool IsOutOfResource()
        c_bool IsObjectRefEndOfStream()
        c_bool IsIntentionalSystemExit()
        c_bool IsUnexpectedSystemExit()
        c_bool IsChannelError()
        c_bool IsChannelTimeoutError()

        # Accessors.
        c_string ToString()
        c_string CodeAsString()
        CStatusCode code()
        c_string message()
        int rpc_code()

    # We can later add more of the common status factory methods as needed
    cdef CRayStatus RayStatus_OK "Status::OK"()
    cdef CRayStatus RayStatus_Invalid "Status::Invalid"()
    cdef CRayStatus RayStatus_NotImplemented "Status::NotImplemented"()
+ cdef CRayStatus RayStatus_NotImplemented "Status::NotImplemented"()
150
+
151
+
152
# Deterministic task-id generation from job id, parent task id and counter.
cdef extern from "ray/common/id.h" namespace "ray" nogil:
    const CTaskID GenerateTaskId(const CJobID &job_id,
                                 const CTaskID &parent_task_id,
                                 int parent_task_counter)
+ int parent_task_counter)
156
+
157
+
158
# Protobuf message and enum wrapper declarations from common.pb.h:
# opaque enum types, scheduling-strategy messages and their setters,
# address/object-reference messages, and label-matching expressions.
cdef extern from "src/ray/protobuf/common.pb.h" nogil:
    cdef cppclass CLanguage "Language":
        pass
    cdef cppclass CWorkerType "ray::core::WorkerType":
        pass
    cdef cppclass CWorkerExitType "ray::rpc::WorkerExitType":
        pass
    cdef cppclass CTaskType "ray::TaskType":
        pass
    cdef cppclass CPlacementStrategy "ray::core::PlacementStrategy":
        pass
    cdef cppclass CDefaultSchedulingStrategy "ray::rpc::DefaultSchedulingStrategy":  # noqa: E501
        CDefaultSchedulingStrategy()
    cdef cppclass CSpreadSchedulingStrategy "ray::rpc::SpreadSchedulingStrategy":  # noqa: E501
        CSpreadSchedulingStrategy()
    cdef cppclass CPlacementGroupSchedulingStrategy "ray::rpc::PlacementGroupSchedulingStrategy":  # noqa: E501
        CPlacementGroupSchedulingStrategy()
        void set_placement_group_id(const c_string& placement_group_id)
        void set_placement_group_bundle_index(int64_t placement_group_bundle_index)  # noqa: E501
        void set_placement_group_capture_child_tasks(c_bool placement_group_capture_child_tasks)  # noqa: E501
    cdef cppclass CNodeAffinitySchedulingStrategy "ray::rpc::NodeAffinitySchedulingStrategy":  # noqa: E501
        CNodeAffinitySchedulingStrategy()
        void set_node_id(const c_string& node_id)
        void set_soft(c_bool soft)
        void set_spill_on_unavailable(c_bool spill_on_unavailable)
        void set_fail_on_unavailable(c_bool fail_on_unavailable)
    cdef cppclass CSchedulingStrategy "ray::rpc::SchedulingStrategy":
        CSchedulingStrategy()
        void clear_scheduling_strategy()
        CSpreadSchedulingStrategy* mutable_spread_scheduling_strategy()
        CDefaultSchedulingStrategy* mutable_default_scheduling_strategy()
        CPlacementGroupSchedulingStrategy* mutable_placement_group_scheduling_strategy()  # noqa: E501
        CNodeAffinitySchedulingStrategy* mutable_node_affinity_scheduling_strategy()
        CNodeLabelSchedulingStrategy* mutable_node_label_scheduling_strategy()
    cdef cppclass CAddress "ray::rpc::Address":
        CAddress()
        const c_string &SerializeAsString() const
        void ParseFromString(const c_string &serialized)
        void CopyFrom(const CAddress& address)
        const c_string &worker_id()
    cdef cppclass CObjectReference "ray::rpc::ObjectReference":
        CObjectReference()
        CAddress owner_address() const
        const c_string &object_id() const
        const c_string &call_site() const
    cdef cppclass CNodeLabelSchedulingStrategy "ray::rpc::NodeLabelSchedulingStrategy":  # noqa: E501
        CNodeLabelSchedulingStrategy()
        CLabelMatchExpressions* mutable_hard()
        CLabelMatchExpressions* mutable_soft()
    cdef cppclass CLabelMatchExpressions "ray::rpc::LabelMatchExpressions":  # noqa: E501
        CLabelMatchExpressions()
        CLabelMatchExpression* add_expressions()
    cdef cppclass CLabelMatchExpression "ray::rpc::LabelMatchExpression":  # noqa: E501
        CLabelMatchExpression()
        void set_key(const c_string &key)
        CLabelOperator* mutable_operator_()
    cdef cppclass CLabelIn "ray::rpc::LabelIn":  # noqa: E501
        CLabelIn()
        void add_values(const c_string &value)
    cdef cppclass CLabelNotIn "ray::rpc::LabelNotIn":  # noqa: E501
        CLabelNotIn()
        void add_values(const c_string &value)
    cdef cppclass CLabelExists "ray::rpc::LabelExists":  # noqa: E501
        CLabelExists()
    cdef cppclass CLabelDoesNotExist "ray::rpc::LabelDoesNotExist":  # noqa: E501
        CLabelDoesNotExist()
    # NOTE(review): CLabelNotIn is declared twice (also above) — harmless
    # redeclaration, but a candidate for cleanup upstream.
    cdef cppclass CLabelNotIn "ray::rpc::LabelNotIn":  # noqa: E501
        CLabelNotIn()
        void add_values(const c_string &value)
    cdef cppclass CLabelOperator "ray::rpc::LabelOperator":  # noqa: E501
        CLabelOperator()
        CLabelIn* mutable_label_in()
        CLabelNotIn* mutable_label_not_in()
        CLabelExists* mutable_label_exists()
        CLabelDoesNotExist* mutable_label_does_not_exist()
    cdef cppclass CLineageReconstructionTask "ray::rpc::LineageReconstructionTask":
        CLineageReconstructionTask()
        const c_string &SerializeAsString() const
+ const c_string &SerializeAsString() const
236
+
237
+
238
# This is a workaround for C++ enum class since Cython has no corresponding
# representation.
cdef extern from "src/ray/protobuf/common.pb.h" nogil:
    cdef CLanguage LANGUAGE_PYTHON "Language::PYTHON"
    cdef CLanguage LANGUAGE_CPP "Language::CPP"
    cdef CLanguage LANGUAGE_JAVA "Language::JAVA"
244
+
245
# Worker-type and worker-exit-type enum values exposed as named constants.
cdef extern from "src/ray/protobuf/common.pb.h" nogil:
    cdef CWorkerType WORKER_TYPE_WORKER "ray::core::WorkerType::WORKER"
    cdef CWorkerType WORKER_TYPE_DRIVER "ray::core::WorkerType::DRIVER"
    cdef CWorkerType WORKER_TYPE_SPILL_WORKER "ray::core::WorkerType::SPILL_WORKER"  # noqa: E501
    cdef CWorkerType WORKER_TYPE_RESTORE_WORKER "ray::core::WorkerType::RESTORE_WORKER"  # noqa: E501
    cdef CWorkerType WORKER_TYPE_UTIL_WORKER "ray::core::WorkerType::UTIL_WORKER"  # noqa: E501
    cdef CWorkerExitType WORKER_EXIT_TYPE_USER_ERROR "ray::rpc::WorkerExitType::USER_ERROR"  # noqa: E501
    cdef CWorkerExitType WORKER_EXIT_TYPE_SYSTEM_ERROR "ray::rpc::WorkerExitType::SYSTEM_ERROR"  # noqa: E501
    cdef CWorkerExitType WORKER_EXIT_TYPE_INTENTIONAL_SYSTEM_ERROR "ray::rpc::WorkerExitType::INTENDED_SYSTEM_EXIT"  # noqa: E501
254
+
255
# Task-type enum values exposed as named constants.
cdef extern from "src/ray/protobuf/common.pb.h" nogil:
    cdef CTaskType TASK_TYPE_NORMAL_TASK "ray::TaskType::NORMAL_TASK"
    cdef CTaskType TASK_TYPE_ACTOR_CREATION_TASK "ray::TaskType::ACTOR_CREATION_TASK"  # noqa: E501
    cdef CTaskType TASK_TYPE_ACTOR_TASK "ray::TaskType::ACTOR_TASK"
259
+
260
# Placement-group strategy enum values exposed as named constants.
cdef extern from "src/ray/protobuf/common.pb.h" nogil:
    cdef CPlacementStrategy PLACEMENT_STRATEGY_PACK \
        "ray::core::PlacementStrategy::PACK"
    cdef CPlacementStrategy PLACEMENT_STRATEGY_SPREAD \
        "ray::core::PlacementStrategy::SPREAD"
    cdef CPlacementStrategy PLACEMENT_STRATEGY_STRICT_PACK \
        "ray::core::PlacementStrategy::STRICT_PACK"
    cdef CPlacementStrategy PLACEMENT_STRATEGY_STRICT_SPREAD \
        "ray::core::PlacementStrategy::STRICT_SPREAD"
269
+
270
# Buffer abstractions: the CBuffer base plus local-memory and
# shared-memory (plasma-backed slice) implementations.
cdef extern from "ray/common/buffer.h" namespace "ray" nogil:
    cdef cppclass CBuffer "ray::Buffer":
        uint8_t *Data() const
        size_t Size() const
        c_bool IsPlasmaBuffer() const

    cdef cppclass LocalMemoryBuffer(CBuffer):
        LocalMemoryBuffer(uint8_t *data, size_t size, c_bool copy_data)
        LocalMemoryBuffer(size_t size)

    cdef cppclass SharedMemoryBuffer(CBuffer):
        SharedMemoryBuffer(
            const shared_ptr[CBuffer] &buffer,
            int64_t offset,
            int64_t size)
        c_bool IsPlasmaBuffer() const
286
+
287
# ray::RayObject: a (data, metadata, nested object refs) triple with
# accessors for the underlying buffers.
cdef extern from "ray/common/ray_object.h" nogil:
    cdef cppclass CRayObject "ray::RayObject":
        CRayObject(const shared_ptr[CBuffer] &data,
                   const shared_ptr[CBuffer] &metadata,
                   const c_vector[CObjectReference] &nested_refs)
        c_bool HasData() const
        c_bool HasMetadata() const
        const size_t DataSize() const
        const shared_ptr[CBuffer] &GetData()
        const shared_ptr[CBuffer] &GetMetadata() const
        c_bool IsInPlasmaError() const
298
+
299
# Core-worker option/argument types: RayFunction, task arguments (by
# reference or by value), task/actor/placement-group creation options,
# and object-location metadata.
cdef extern from "ray/core_worker/common.h" nogil:
    cdef cppclass CRayFunction "ray::core::RayFunction":
        CRayFunction()
        CRayFunction(CLanguage language,
                     const CFunctionDescriptor &function_descriptor)
        CLanguage GetLanguage()
        const CFunctionDescriptor GetFunctionDescriptor()

    cdef cppclass CTaskArg "ray::TaskArg":
        pass

    cdef cppclass CTaskArgByReference "ray::TaskArgByReference":
        CTaskArgByReference(const CObjectID &object_id,
                            const CAddress &owner_address,
                            const c_string &call_site)

    cdef cppclass CTaskArgByValue "ray::TaskArgByValue":
        CTaskArgByValue(const shared_ptr[CRayObject] &data)

    # Overloaded constructors mirror the optional runtime-env / task-event /
    # label parameters on the C++ side.
    cdef cppclass CTaskOptions "ray::core::TaskOptions":
        CTaskOptions()
        CTaskOptions(c_string name, int num_returns,
                     unordered_map[c_string, double] &resources,
                     c_string concurrency_group_name,
                     int64_t generator_backpressure_num_objects)
        CTaskOptions(c_string name, int num_returns,
                     unordered_map[c_string, double] &resources,
                     c_string concurrency_group_name,
                     int64_t generator_backpressure_num_objects,
                     c_string serialized_runtime_env)
        CTaskOptions(c_string name, int num_returns,
                     unordered_map[c_string, double] &resources,
                     c_string concurrency_group_name,
                     int64_t generator_backpressure_num_objects,
                     c_string serialized_runtime_env, c_bool enable_task_events,
                     const unordered_map[c_string, c_string] &labels)

    cdef cppclass CActorCreationOptions "ray::core::ActorCreationOptions":
        CActorCreationOptions()
        CActorCreationOptions(
            int64_t max_restarts,
            int64_t max_task_retries,
            int32_t max_concurrency,
            const unordered_map[c_string, double] &resources,
            const unordered_map[c_string, double] &placement_resources,
            const c_vector[c_string] &dynamic_worker_options,
            optional[c_bool] is_detached, c_string &name, c_string &ray_namespace,
            c_bool is_asyncio,
            const CSchedulingStrategy &scheduling_strategy,
            c_string serialized_runtime_env,
            const c_vector[CConcurrencyGroup] &concurrency_groups,
            c_bool execute_out_of_order,
            int32_t max_pending_calls,
            c_bool enable_task_events,
            const unordered_map[c_string, c_string] &labels)

    cdef cppclass CPlacementGroupCreationOptions \
            "ray::core::PlacementGroupCreationOptions":
        CPlacementGroupCreationOptions()
        CPlacementGroupCreationOptions(
            const c_string &name,
            CPlacementStrategy strategy,
            const c_vector[unordered_map[c_string, double]] &bundles,
            c_bool is_detached,
            double max_cpu_fraction_per_node,
            CNodeID soft_target_node_id,
        )

    cdef cppclass CObjectLocation "ray::core::ObjectLocation":
        const CNodeID &GetPrimaryNodeID() const
        const int64_t GetObjectSize() const
        const c_vector[CNodeID] &GetNodeIDs() const
        c_bool IsSpilled() const
        const c_string &GetSpilledURL() const
        const CNodeID &GetSpilledNodeID() const
        const c_bool GetDidSpill() const
375
+
376
+ cdef extern from "ray/gcs/gcs_client/python_callbacks.h" namespace "ray::gcs":
377
+ cdef cppclass MultiItemPyCallback[T]:
378
+ MultiItemPyCallback(
379
+ object (*)(CRayStatus, c_vector[T] &&) nogil,
380
+ void (object, object) nogil,
381
+ object) nogil
382
+
383
+ cdef cppclass OptionalItemPyCallback[T]:
384
+ OptionalItemPyCallback(
385
+ object (*)(CRayStatus, const optional[T]&) nogil,
386
+ void (object, object) nogil,
387
+ object) nogil
388
+
389
+ cdef cppclass StatusPyCallback:
390
+ StatusPyCallback(
391
+ object (*)(CRayStatus) nogil,
392
+ void (object, object) nogil,
393
+ object) nogil
394
+
395
+ cdef extern from "ray/gcs/gcs_client/accessor.h" nogil:
396
+ cdef cppclass CActorInfoAccessor "ray::gcs::ActorInfoAccessor":
397
+ CRayStatus AsyncGetAllByFilter(
398
+ const optional[CActorID] &actor_id,
399
+ const optional[CJobID] &job_id,
400
+ const optional[c_string] &actor_state_name,
401
+ const MultiItemPyCallback[CActorTableData] &callback,
402
+ int64_t timeout_ms)
403
+
404
+ CRayStatus AsyncKillActor(const CActorID &actor_id,
405
+ c_bool force_kill,
406
+ c_bool no_restart,
407
+ const StatusPyCallback &callback,
408
+ int64_t timeout_ms)
409
+
410
+ cdef cppclass CJobInfoAccessor "ray::gcs::JobInfoAccessor":
411
+ CRayStatus GetAll(
412
+ const optional[c_string] &job_or_submission_id,
413
+ c_bool skip_submission_job_info_field,
414
+ c_bool skip_is_running_tasks_field,
415
+ c_vector[CJobTableData] &result,
416
+ int64_t timeout_ms)
417
+
418
+ CRayStatus AsyncGetAll(
419
+ const optional[c_string] &job_or_submission_id,
420
+ c_bool skip_submission_job_info_field,
421
+ c_bool skip_is_running_tasks_field,
422
+ const MultiItemPyCallback[CJobTableData] &callback,
423
+ int64_t timeout_ms)
424
+
425
+ cdef cppclass CNodeInfoAccessor "ray::gcs::NodeInfoAccessor":
426
+ CRayStatus CheckAlive(
427
+ const c_vector[c_string] &raylet_addresses,
428
+ int64_t timeout_ms,
429
+ c_vector[c_bool] &result)
430
+
431
+ CRayStatus AsyncCheckAlive(
432
+ const c_vector[c_string] &raylet_addresses,
433
+ int64_t timeout_ms,
434
+ const MultiItemPyCallback[c_bool] &callback)
435
+
436
+ CRayStatus DrainNodes(
437
+ const c_vector[CNodeID] &node_ids,
438
+ int64_t timeout_ms,
439
+ c_vector[c_string] &drained_node_ids)
440
+
441
+ CRayStatus GetAllNoCache(
442
+ int64_t timeout_ms,
443
+ c_vector[CGcsNodeInfo] &result)
444
+
445
+ CRayStatus AsyncGetAll(
446
+ const MultiItemPyCallback[CGcsNodeInfo] &callback,
447
+ int64_t timeout_ms,
448
+ optional[CNodeID] node_id)
449
+
450
+ cdef cppclass CNodeResourceInfoAccessor "ray::gcs::NodeResourceInfoAccessor":
451
+ CRayStatus GetAllResourceUsage(
452
+ int64_t timeout_ms,
453
+ CGetAllResourceUsageReply &serialized_reply)
454
+
455
+ cdef cppclass CInternalKVAccessor "ray::gcs::InternalKVAccessor":
456
+ CRayStatus Keys(
457
+ const c_string &ns,
458
+ const c_string &prefix,
459
+ int64_t timeout_ms,
460
+ c_vector[c_string] &value)
461
+
462
+ CRayStatus Put(
463
+ const c_string &ns,
464
+ const c_string &key,
465
+ const c_string &value,
466
+ c_bool overwrite,
467
+ int64_t timeout_ms,
468
+ c_bool &added)
469
+
470
+ CRayStatus Get(
471
+ const c_string &ns,
472
+ const c_string &key,
473
+ int64_t timeout_ms,
474
+ c_string &value)
475
+
476
+ CRayStatus MultiGet(
477
+ const c_string &ns,
478
+ const c_vector[c_string] &keys,
479
+ int64_t timeout_ms,
480
+ unordered_map[c_string, c_string] &values)
481
+
482
+ CRayStatus Del(
483
+ const c_string &ns,
484
+ const c_string &key,
485
+ c_bool del_by_prefix,
486
+ int64_t timeout_ms,
487
+ int& num_deleted)
488
+
489
+ CRayStatus Exists(
490
+ const c_string &ns,
491
+ const c_string &key,
492
+ int64_t timeout_ms,
493
+ c_bool &exists)
494
+
495
+ CRayStatus AsyncInternalKVKeys(
496
+ const c_string &ns,
497
+ const c_string &prefix,
498
+ int64_t timeout_ms,
499
+ const OptionalItemPyCallback[c_vector[c_string]] &callback)
500
+
501
+ CRayStatus AsyncInternalKVGet(
502
+ const c_string &ns,
503
+ const c_string &key,
504
+ int64_t timeout_ms,
505
+ const OptionalItemPyCallback[c_string] &callback)
506
+
507
+ CRayStatus AsyncInternalKVMultiGet(
508
+ const c_string &ns,
509
+ const c_vector[c_string] &keys,
510
+ int64_t timeout_ms,
511
+ const OptionalItemPyCallback[unordered_map[c_string, c_string]] &callback)
512
+
513
+ CRayStatus AsyncInternalKVPut(
514
+ const c_string &ns,
515
+ const c_string &key,
516
+ const c_string &value,
517
+ c_bool overwrite,
518
+ int64_t timeout_ms,
519
+ const OptionalItemPyCallback[c_bool] &callback)
520
+
521
+ CRayStatus AsyncInternalKVExists(
522
+ const c_string &ns,
523
+ const c_string &key,
524
+ int64_t timeout_ms,
525
+ const OptionalItemPyCallback[c_bool] &callback)
526
+
527
+ CRayStatus AsyncInternalKVDel(
528
+ const c_string &ns,
529
+ const c_string &key,
530
+ c_bool del_by_prefix,
531
+ int64_t timeout_ms,
532
+ const OptionalItemPyCallback[int] &callback)
533
+
534
+ cdef cppclass CRuntimeEnvAccessor "ray::gcs::RuntimeEnvAccessor":
535
+ CRayStatus PinRuntimeEnvUri(
536
+ const c_string &uri,
537
+ int expiration_s,
538
+ int64_t timeout_ms)
539
+
540
+ cdef cppclass CAutoscalerStateAccessor "ray::gcs::AutoscalerStateAccessor":
541
+
542
+ CRayStatus RequestClusterResourceConstraint(
543
+ int64_t timeout_ms,
544
+ const c_vector[unordered_map[c_string, double]] &bundles,
545
+ const c_vector[int64_t] &count_array
546
+ )
547
+
548
+ CRayStatus GetClusterResourceState(
549
+ int64_t timeout_ms,
550
+ c_string &serialized_reply
551
+ )
552
+
553
+ CRayStatus GetClusterStatus(
554
+ int64_t timeout_ms,
555
+ c_string &serialized_reply
556
+ )
557
+
558
+ CRayStatus ReportAutoscalingState(
559
+ int64_t timeout_ms,
560
+ const c_string &serialized_state
561
+ )
562
+
563
+ CRayStatus ReportClusterConfig(
564
+ int64_t timeout_ms,
565
+ const c_string &serialized_cluster_config
566
+ )
567
+
568
+ CRayStatus DrainNode(
569
+ const c_string &node_id,
570
+ int32_t reason,
571
+ const c_string &reason_message,
572
+ int64_t deadline_timestamp_ms,
573
+ int64_t timeout_ms,
574
+ c_bool &is_accepted,
575
+ c_string &rejection_reason_message
576
+ )
577
+
578
+
579
+ cdef extern from "ray/gcs/gcs_client/gcs_client.h" nogil:
580
+ cdef enum CGrpcStatusCode "grpc::StatusCode":
581
+ UNAVAILABLE "grpc::StatusCode::UNAVAILABLE",
582
+ UNKNOWN "grpc::StatusCode::UNKNOWN",
583
+ DEADLINE_EXCEEDED "grpc::StatusCode::DEADLINE_EXCEEDED",
584
+ RESOURCE_EXHAUSTED "grpc::StatusCode::RESOURCE_EXHAUSTED",
585
+ UNIMPLEMENTED "grpc::StatusCode::UNIMPLEMENTED",
586
+
587
+ cdef cppclass CGcsClientOptions "ray::gcs::GcsClientOptions":
588
+ CGcsClientOptions(
589
+ const c_string &gcs_address, int port, CClusterID cluster_id,
590
+ c_bool allow_cluster_id_nil, c_bool fetch_cluster_id_if_nil)
591
+
592
+ cdef cppclass CGcsClient "ray::gcs::GcsClient":
593
+ CGcsClient(const CGcsClientOptions &options)
594
+
595
+ c_pair[c_string, int] GetGcsServerAddress() const
596
+ CClusterID GetClusterId() const
597
+
598
+ CActorInfoAccessor& Actors()
599
+ CJobInfoAccessor& Jobs()
600
+ CInternalKVAccessor& InternalKV()
601
+ CNodeInfoAccessor& Nodes()
602
+ CNodeResourceInfoAccessor& NodeResources()
603
+ CRuntimeEnvAccessor& RuntimeEnvs()
604
+ CAutoscalerStateAccessor& Autoscaler()
605
+
606
+ cdef CRayStatus ConnectOnSingletonIoContext(CGcsClient &gcs_client, int timeout_ms)
607
+
608
+ cdef extern from "ray/gcs/gcs_client/gcs_client.h" namespace "ray::gcs" nogil:
609
+ unordered_map[c_string, double] PythonGetResourcesTotal(
610
+ const CGcsNodeInfo& node_info)
611
+
612
+ cdef extern from "ray/gcs/pubsub/gcs_pub_sub.h" nogil:
613
+
614
+ cdef cppclass CPythonGcsPublisher "ray::gcs::PythonGcsPublisher":
615
+
616
+ CPythonGcsPublisher(const c_string& gcs_address)
617
+
618
+ CRayStatus Connect()
619
+
620
+ CRayStatus PublishError(
621
+ const c_string &key_id, const CErrorTableData &data, int64_t num_retries)
622
+
623
+ CRayStatus PublishLogs(const c_string &key_id, const CLogBatch &data)
624
+
625
+ cdef cppclass CPythonGcsSubscriber "ray::gcs::PythonGcsSubscriber":
626
+
627
+ CPythonGcsSubscriber(
628
+ const c_string& gcs_address, int gcs_port, CChannelType channel_type,
629
+ const c_string& subscriber_id, const c_string& worker_id)
630
+
631
+ CRayStatus Subscribe()
632
+
633
+ int64_t last_batch_size()
634
+
635
+ CRayStatus PollError(
636
+ c_string* key_id, int64_t timeout_ms, CErrorTableData* data)
637
+
638
+ CRayStatus PollLogs(
639
+ c_string* key_id, int64_t timeout_ms, CLogBatch* data)
640
+
641
+ CRayStatus PollActor(
642
+ c_string* key_id, int64_t timeout_ms, CActorTableData* data)
643
+
644
+ CRayStatus Close()
645
+
646
+ cdef extern from "ray/gcs/pubsub/gcs_pub_sub.h" namespace "ray::gcs" nogil:
647
+ c_vector[c_string] PythonGetLogBatchLines(const CLogBatch& log_batch)
648
+
649
+ cdef extern from "ray/gcs/gcs_client/gcs_client.h" namespace "ray::gcs" nogil:
650
+ unordered_map[c_string, c_string] PythonGetNodeLabels(
651
+ const CGcsNodeInfo& node_info)
652
+
653
+ cdef extern from "src/ray/protobuf/gcs.pb.h" nogil:
654
+ cdef enum CChannelType "ray::rpc::ChannelType":
655
+ RAY_ERROR_INFO_CHANNEL "ray::rpc::ChannelType::RAY_ERROR_INFO_CHANNEL",
656
+ RAY_LOG_CHANNEL "ray::rpc::ChannelType::RAY_LOG_CHANNEL",
657
+ GCS_ACTOR_CHANNEL "ray::rpc::ChannelType::GCS_ACTOR_CHANNEL",
658
+
659
+ cdef cppclass CJobConfig "ray::rpc::JobConfig":
660
+ c_string ray_namespace() const
661
+ const c_string &SerializeAsString() const
662
+
663
+ cdef cppclass CNodeDeathInfo "ray::rpc::NodeDeathInfo":
664
+ int reason() const
665
+ c_string reason_message() const
666
+
667
+ cdef cppclass CGcsNodeInfo "ray::rpc::GcsNodeInfo":
668
+ c_string node_id() const
669
+ c_string node_name() const
670
+ int state() const
671
+ c_string node_manager_address() const
672
+ c_string node_manager_hostname() const
673
+ int node_manager_port() const
674
+ int object_manager_port() const
675
+ c_string object_store_socket_name() const
676
+ c_string raylet_socket_name() const
677
+ int metrics_export_port() const
678
+ int runtime_env_agent_port() const
679
+ CNodeDeathInfo death_info() const
680
+ void ParseFromString(const c_string &serialized)
681
+ const c_string& SerializeAsString() const
682
+
683
+ cdef enum CGcsNodeState "ray::rpc::GcsNodeInfo_GcsNodeState":
684
+ ALIVE "ray::rpc::GcsNodeInfo_GcsNodeState_ALIVE",
685
+
686
+ cdef cppclass CJobTableData "ray::rpc::JobTableData":
687
+ c_string job_id() const
688
+ c_bool is_dead() const
689
+ CJobConfig config() const
690
+ const c_string &SerializeAsString() const
691
+
692
+ cdef cppclass CGetAllResourceUsageReply "ray::rpc::GetAllResourceUsageReply":
693
+ const c_string& SerializeAsString() const
694
+
695
+ cdef cppclass CPythonFunction "ray::rpc::PythonFunction":
696
+ void set_key(const c_string &key)
697
+ c_string key() const
698
+
699
+ cdef cppclass CErrorTableData "ray::rpc::ErrorTableData":
700
+ c_string job_id() const
701
+ c_string type() const
702
+ c_string error_message() const
703
+ double timestamp() const
704
+
705
+ void set_job_id(const c_string &job_id)
706
+ void set_type(const c_string &type)
707
+ void set_error_message(const c_string &error_message)
708
+ void set_timestamp(double timestamp)
709
+
710
+ cdef cppclass CLogBatch "ray::rpc::LogBatch":
711
+ c_string ip() const
712
+ c_string pid() const
713
+ c_string job_id() const
714
+ c_bool is_error() const
715
+ c_string actor_name() const
716
+ c_string task_name() const
717
+
718
+ void set_ip(const c_string &ip)
719
+ void set_pid(const c_string &pid)
720
+ void set_job_id(const c_string &job_id)
721
+ void set_is_error(c_bool is_error)
722
+ void add_lines(const c_string &line)
723
+ void set_actor_name(const c_string &actor_name)
724
+ void set_task_name(const c_string &task_name)
725
+
726
+ cdef cppclass CActorTableData "ray::rpc::ActorTableData":
727
+ CAddress address() const
728
+ void ParseFromString(const c_string &serialized)
729
+ const c_string &SerializeAsString() const
730
+
731
+ cdef extern from "ray/common/task/task_spec.h" nogil:
732
+ cdef cppclass CConcurrencyGroup "ray::ConcurrencyGroup":
733
+ CConcurrencyGroup(
734
+ const c_string &name,
735
+ uint32_t max_concurrency,
736
+ const c_vector[CFunctionDescriptor] &c_fds)
737
+ CConcurrencyGroup()
738
+ c_string GetName() const
739
+ uint32_t GetMaxConcurrency() const
740
+ c_vector[CFunctionDescriptor] GetFunctionDescriptors() const
741
+
742
+ cdef extern from "ray/common/constants.h" nogil:
743
+ cdef const char[] kWorkerSetupHookKeyName
744
+ cdef int kResourceUnitScaling
745
+ cdef const char[] kImplicitResourcePrefix
746
+ cdef int kStreamingGeneratorReturn
747
+ cdef const char[] kGcsAutoscalerStateNamespace
748
+ cdef const char[] kGcsAutoscalerV2EnabledKey
749
+ cdef const char[] kGcsAutoscalerClusterConfigKey
.venv/lib/python3.11/site-packages/ray/includes/function_descriptor.pxd ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from libc.stdint cimport uint8_t, uint64_t
2
+ from libcpp cimport bool as c_bool
3
+ from libcpp.memory cimport unique_ptr, shared_ptr
4
+ from libcpp.string cimport string as c_string
5
+ from libcpp.unordered_map cimport unordered_map
6
+ from libcpp.vector cimport vector as c_vector
7
+
8
+ from ray.includes.common cimport (
9
+ CLanguage,
10
+ )
11
+ from ray.includes.unique_ids cimport (
12
+ CActorID,
13
+ CJobID,
14
+ CObjectID,
15
+ CTaskID,
16
+ )
17
+
18
+ cdef extern from "src/ray/protobuf/common.pb.h" nogil:
19
+ cdef cppclass CFunctionDescriptorType \
20
+ "ray::FunctionDescriptorType":
21
+ pass
22
+
23
+ cdef CFunctionDescriptorType EmptyFunctionDescriptorType \
24
+ "ray::FunctionDescriptorType::FUNCTION_DESCRIPTOR_NOT_SET"
25
+ cdef CFunctionDescriptorType JavaFunctionDescriptorType \
26
+ "ray::FunctionDescriptorType::kJavaFunctionDescriptor"
27
+ cdef CFunctionDescriptorType PythonFunctionDescriptorType \
28
+ "ray::FunctionDescriptorType::kPythonFunctionDescriptor"
29
+ cdef CFunctionDescriptorType CppFunctionDescriptorType \
30
+ "ray::FunctionDescriptorType::kCppFunctionDescriptor"
31
+
32
+
33
+ cdef extern from "ray/common/function_descriptor.h" nogil:
34
+ cdef cppclass CFunctionDescriptorInterface \
35
+ "ray::FunctionDescriptorInterface":
36
+ CFunctionDescriptorType Type()
37
+ c_string ToString()
38
+ c_string Serialize()
39
+
40
+ ctypedef shared_ptr[CFunctionDescriptorInterface] CFunctionDescriptor \
41
+ "ray::FunctionDescriptor"
42
+
43
+ cdef cppclass CFunctionDescriptorBuilder "ray::FunctionDescriptorBuilder":
44
+ @staticmethod
45
+ CFunctionDescriptor Empty()
46
+
47
+ @staticmethod
48
+ CFunctionDescriptor BuildJava(const c_string &class_name,
49
+ const c_string &function_name,
50
+ const c_string &signature)
51
+
52
+ @staticmethod
53
+ CFunctionDescriptor BuildPython(const c_string &module_name,
54
+ const c_string &class_name,
55
+ const c_string &function_name,
56
+ const c_string &function_source_hash)
57
+
58
+ @staticmethod
59
+ CFunctionDescriptor BuildCpp(const c_string &function_name,
60
+ const c_string &caller,
61
+ const c_string &class_name)
62
+
63
+ @staticmethod
64
+ CFunctionDescriptor Deserialize(const c_string &serialized_binary)
65
+
66
+ cdef cppclass CJavaFunctionDescriptor "ray::JavaFunctionDescriptor":
67
+ c_string ClassName()
68
+ c_string FunctionName()
69
+ c_string Signature()
70
+
71
+ cdef cppclass CPythonFunctionDescriptor "ray::PythonFunctionDescriptor":
72
+ c_string ModuleName()
73
+ c_string ClassName()
74
+ c_string FunctionName()
75
+ c_string FunctionHash()
76
+
77
+ cdef cppclass CCppFunctionDescriptor "ray::CppFunctionDescriptor":
78
+ c_string FunctionName()
79
+ c_string Caller()
80
+ c_string ClassName()
.venv/lib/python3.11/site-packages/ray/includes/global_state_accessor.pxd ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from libcpp.string cimport string as c_string
2
+ from libcpp cimport bool as c_bool
3
+ from libcpp.vector cimport vector as c_vector
4
+ from libcpp.unordered_map cimport unordered_map
5
+ from libcpp.memory cimport unique_ptr
6
+ from libc.stdint cimport (
7
+ int32_t as c_int32_t,
8
+ uint32_t as c_uint32_t,
9
+ int64_t as c_int64_t,
10
+ )
11
+ from ray.includes.unique_ids cimport (
12
+ CActorID,
13
+ CJobID,
14
+ CNodeID,
15
+ CObjectID,
16
+ CWorkerID,
17
+ CPlacementGroupID,
18
+ )
19
+ from ray.includes.common cimport (
20
+ CRayStatus,
21
+ CGcsClientOptions,
22
+ )
23
+ from ray.includes.optional cimport (
24
+ optional
25
+ )
26
+
27
+ cdef extern from "ray/gcs/gcs_client/global_state_accessor.h" nogil:
28
+ cdef cppclass CGlobalStateAccessor "ray::gcs::GlobalStateAccessor":
29
+ CGlobalStateAccessor(const CGcsClientOptions&)
30
+ c_bool Connect()
31
+ void Disconnect()
32
+ c_vector[c_string] GetAllJobInfo(
33
+ c_bool skip_submission_job_info_field, c_bool skip_is_running_tasks_field)
34
+ CJobID GetNextJobID()
35
+ c_vector[c_string] GetAllNodeInfo()
36
+ c_vector[c_string] GetAllAvailableResources()
37
+ c_vector[c_string] GetAllTotalResources()
38
+ unordered_map[CNodeID, c_int64_t] GetDrainingNodes()
39
+ unique_ptr[c_string] GetInternalKV(
40
+ const c_string &namespace, const c_string &key)
41
+ c_vector[c_string] GetAllTaskEvents()
42
+ unique_ptr[c_string] GetObjectInfo(const CObjectID &object_id)
43
+ unique_ptr[c_string] GetAllResourceUsage()
44
+ c_vector[c_string] GetAllActorInfo(
45
+ optional[CActorID], optional[CJobID], optional[c_string])
46
+ unique_ptr[c_string] GetActorInfo(const CActorID &actor_id)
47
+ unique_ptr[c_string] GetWorkerInfo(const CWorkerID &worker_id)
48
+ c_vector[c_string] GetAllWorkerInfo()
49
+ c_bool AddWorkerInfo(const c_string &serialized_string)
50
+ c_bool UpdateWorkerDebuggerPort(const CWorkerID &worker_id,
51
+ const c_uint32_t debuger_port)
52
+ c_bool UpdateWorkerNumPausedThreads(const CWorkerID &worker_id,
53
+ const c_int32_t num_paused_threads_delta)
54
+ c_uint32_t GetWorkerDebuggerPort(const CWorkerID &worker_id)
55
+ unique_ptr[c_string] GetPlacementGroupInfo(
56
+ const CPlacementGroupID &placement_group_id)
57
+ unique_ptr[c_string] GetPlacementGroupByName(
58
+ const c_string &placement_group_name,
59
+ const c_string &ray_namespace,
60
+ )
61
+ c_vector[c_string] GetAllPlacementGroupInfo()
62
+ c_string GetSystemConfig()
63
+ CRayStatus GetNodeToConnectForDriver(
64
+ const c_string &node_ip_address,
65
+ c_string *node_to_connect)
66
+ CRayStatus GetNode(
67
+ const c_string &node_id_hex_str,
68
+ c_string *node_info)
69
+
70
+ cdef extern from * namespace "ray::gcs" nogil:
71
+ """
72
+ #include <thread>
73
+ #include "ray/gcs/gcs_server/store_client_kv.h"
74
+ namespace ray {
75
+ namespace gcs {
76
+
77
+ bool RedisGetKeySync(const std::string& host,
78
+ int32_t port,
79
+ const std::string& username,
80
+ const std::string& password,
81
+ bool use_ssl,
82
+ const std::string& config,
83
+ const std::string& key,
84
+ std::string* data) {
85
+ // Logging default value see class `RayLog`.
86
+ InitShutdownRAII ray_log_shutdown_raii(ray::RayLog::StartRayLog,
87
+ ray::RayLog::ShutDownRayLog,
88
+ "ray_init",
89
+ ray::RayLogLevel::WARNING,
90
+ /*log_filepath=*/"",
91
+ /*log_rotation_max_size=*/1ULL << 29,
92
+ /*log_rotation_file_num=*/10);
93
+
94
+ RedisClientOptions options(host, port, username, password, use_ssl);
95
+
96
+ std::string config_list;
97
+ RAY_CHECK(absl::Base64Unescape(config, &config_list));
98
+ RayConfig::instance().initialize(config_list);
99
+
100
+ instrumented_io_context io_service;
101
+
102
+ auto redis_client = std::make_shared<RedisClient>(options);
103
+ auto status = redis_client->Connect(io_service);
104
+ RAY_CHECK_OK(status) << "Failed to connect to redis.";
105
+
106
+ auto cli = std::make_unique<StoreClientInternalKV>(
107
+ std::make_unique<RedisStoreClient>(std::move(redis_client)));
108
+
109
+ bool ret_val = false;
110
+ cli->Get("session", key, {[&](std::optional<std::string> result) {
111
+ if (result.has_value()) {
112
+ *data = result.value();
113
+ ret_val = true;
114
+ } else {
115
+ RAY_LOG(INFO) << "Failed to retrieve the key " << key
116
+ << " from persistent storage.";
117
+ ret_val = false;
118
+ }
119
+ }, io_service});
120
+ io_service.run_for(std::chrono::milliseconds(1000));
121
+
122
+ return ret_val;
123
+ }
124
+
125
+ }
126
+ }
127
+ """
128
+ c_bool RedisGetKeySync(const c_string& host,
129
+ c_int32_t port,
130
+ const c_string& username,
131
+ const c_string& password,
132
+ c_bool use_ssl,
133
+ const c_string& config,
134
+ const c_string& key,
135
+ c_string* data)
136
+
137
+
138
+ cdef extern from * namespace "ray::gcs" nogil:
139
+ c_bool RedisDelKeyPrefixSync(const c_string& host,
140
+ c_int32_t port,
141
+ const c_string& username,
142
+ const c_string& password,
143
+ c_bool use_ssl,
144
+ const c_string& key_prefix)
.venv/lib/python3.11/site-packages/ray/includes/libcoreworker.pxd ADDED
@@ -0,0 +1,457 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # cython: profile = False
2
+ # distutils: language = c++
3
+ # cython: embedsignature = True
4
+
5
+ from libc.stdint cimport int64_t, uint64_t
6
+ from libcpp cimport bool as c_bool
7
+ from libcpp.memory cimport shared_ptr, unique_ptr
8
+ from libcpp.pair cimport pair as c_pair
9
+ from libcpp.string cimport string as c_string
10
+ from libcpp.unordered_map cimport unordered_map
11
+ from libcpp.utility cimport pair
12
+ from libcpp.vector cimport vector as c_vector
13
+
14
+ from ray.includes.unique_ids cimport (
15
+ CActorID,
16
+ CClusterID,
17
+ CNodeID,
18
+ CJobID,
19
+ CTaskID,
20
+ CObjectID,
21
+ CPlacementGroupID,
22
+ CWorkerID,
23
+ ObjectIDIndexType,
24
+ )
25
+
26
+ from ray.includes.common cimport (
27
+ CAddress,
28
+ CObjectReference,
29
+ CActorCreationOptions,
30
+ CBuffer,
31
+ CPlacementGroupCreationOptions,
32
+ CObjectLocation,
33
+ CObjectReference,
34
+ CRayFunction,
35
+ CRayObject,
36
+ CRayStatus,
37
+ CTaskArg,
38
+ CTaskOptions,
39
+ CTaskType,
40
+ CWorkerType,
41
+ CLanguage,
42
+ CGcsClientOptions,
43
+ LocalMemoryBuffer,
44
+ CJobConfig,
45
+ CConcurrencyGroup,
46
+ CSchedulingStrategy,
47
+ CWorkerExitType,
48
+ CLineageReconstructionTask,
49
+ )
50
+ from ray.includes.function_descriptor cimport (
51
+ CFunctionDescriptor,
52
+ )
53
+
54
+ from ray.includes.optional cimport (
55
+ optional,
56
+ )
57
+
58
+ ctypedef unordered_map[c_string, c_vector[pair[int64_t, double]]] \
59
+ ResourceMappingType
60
+
61
+ ctypedef void (*ray_callback_function) \
62
+ (shared_ptr[CRayObject] result_object,
63
+ CObjectID object_id, void* user_data)
64
+
65
+ ctypedef void (*plasma_callback_function) \
66
+ (CObjectID object_id, int64_t data_size, int64_t metadata_size)
67
+
68
+ # NOTE: This ctypedef is needed, because Cython doesn't compile
69
+ # "pair[shared_ptr[const CActorHandle], CRayStatus]".
70
+ # This is a bug of cython: https://github.com/cython/cython/issues/3967.
71
+ ctypedef shared_ptr[const CActorHandle] ActorHandleSharedPtr
72
+
73
+
74
+ cdef extern from "ray/core_worker/profile_event.h" nogil:
75
+ cdef cppclass CProfileEvent "ray::core::worker::ProfileEvent":
76
+ void SetExtraData(const c_string &extra_data)
77
+
78
+ cdef extern from "ray/core_worker/fiber.h" nogil:
79
+ cdef cppclass CFiberEvent "ray::core::FiberEvent":
80
+ CFiberEvent()
81
+ void Wait()
82
+ void Notify()
83
+
84
+ cdef extern from "ray/core_worker/experimental_mutable_object_manager.h" nogil:
85
+ cdef cppclass CReaderRefInfo "ray::experimental::ReaderRefInfo":
86
+ CReaderRefInfo()
87
+ CObjectID reader_ref_id
88
+ CActorID owner_reader_actor_id
89
+ int64_t num_reader_actors
90
+
91
+
92
+ cdef extern from "ray/core_worker/context.h" nogil:
93
+ cdef cppclass CWorkerContext "ray::core::WorkerContext":
94
+ c_bool CurrentActorIsAsync()
95
+ const c_string &GetCurrentSerializedRuntimeEnv()
96
+ int CurrentActorMaxConcurrency()
97
+ const CActorID &GetRootDetachedActorID()
98
+
99
+ cdef extern from "ray/core_worker/generator_waiter.h" nogil:
100
+ cdef cppclass CGeneratorBackpressureWaiter "ray::core::GeneratorBackpressureWaiter": # noqa
101
+ CGeneratorBackpressureWaiter(
102
+ int64_t generator_backpressure_num_objects,
103
+ (CRayStatus() nogil) check_signals)
104
+ CRayStatus WaitAllObjectsReported()
105
+
106
+ cdef extern from "ray/core_worker/core_worker.h" nogil:
107
+ cdef cppclass CActorHandle "ray::core::ActorHandle":
108
+ CActorID GetActorID() const
109
+ CJobID CreationJobID() const
110
+ CLanguage ActorLanguage() const
111
+ CFunctionDescriptor ActorCreationTaskFunctionDescriptor() const
112
+ c_string ExtensionData() const
113
+ int MaxPendingCalls() const
114
+ int MaxTaskRetries() const
115
+ c_bool EnableTaskEvents() const
116
+
117
+ cdef cppclass CCoreWorker "ray::core::CoreWorker":
118
+ CWorkerType GetWorkerType()
119
+ CLanguage GetLanguage()
120
+
121
+ c_vector[CObjectReference] SubmitTask(
122
+ const CRayFunction &function,
123
+ const c_vector[unique_ptr[CTaskArg]] &args,
124
+ const CTaskOptions &options,
125
+ int max_retries,
126
+ c_bool retry_exceptions,
127
+ const CSchedulingStrategy &scheduling_strategy,
128
+ c_string debugger_breakpoint,
129
+ c_string serialized_retry_exception_allowlist,
130
+ c_string call_site,
131
+ const CTaskID current_task_id)
132
+ CRayStatus CreateActor(
133
+ const CRayFunction &function,
134
+ const c_vector[unique_ptr[CTaskArg]] &args,
135
+ const CActorCreationOptions &options,
136
+ const c_string &extension_data,
137
+ c_string call_site,
138
+ CActorID *actor_id)
139
+ CRayStatus CreatePlacementGroup(
140
+ const CPlacementGroupCreationOptions &options,
141
+ CPlacementGroupID *placement_group_id)
142
+ CRayStatus RemovePlacementGroup(
143
+ const CPlacementGroupID &placement_group_id)
144
+ CRayStatus WaitPlacementGroupReady(
145
+ const CPlacementGroupID &placement_group_id, int64_t timeout_seconds)
146
+ CRayStatus SubmitActorTask(
147
+ const CActorID &actor_id, const CRayFunction &function,
148
+ const c_vector[unique_ptr[CTaskArg]] &args,
149
+ const CTaskOptions &options,
150
+ int max_retries,
151
+ c_bool retry_exceptions,
152
+ c_string serialized_retry_exception_allowlist,
153
+ c_string call_site,
154
+ c_vector[CObjectReference] &task_returns,
155
+ const CTaskID current_task_id)
156
+ CRayStatus KillActor(
157
+ const CActorID &actor_id, c_bool force_kill,
158
+ c_bool no_restart)
159
+ CRayStatus CancelTask(const CObjectID &object_id, c_bool force_kill,
160
+ c_bool recursive)
161
+
162
+ unique_ptr[CProfileEvent] CreateProfileEvent(
163
+ const c_string &event_type)
164
+ CRayStatus AllocateReturnObject(
165
+ const CObjectID &object_id,
166
+ const size_t &data_size,
167
+ const shared_ptr[CBuffer] &metadata,
168
+ const c_vector[CObjectID] &contained_object_id,
169
+ const CAddress &caller_address,
170
+ int64_t *task_output_inlined_bytes,
171
+ shared_ptr[CRayObject] *return_object)
172
+ CRayStatus SealReturnObject(
173
+ const CObjectID &return_id,
174
+ const shared_ptr[CRayObject] &return_object,
175
+ const CObjectID &generator_id,
176
+ const CAddress &caller_address
177
+ )
178
+ c_bool PinExistingReturnObject(
179
+ const CObjectID &return_id,
180
+ shared_ptr[CRayObject] *return_object,
181
+ const CObjectID &generator_id,
182
+ const CAddress &caller_address)
183
+ void AsyncDelObjectRefStream(const CObjectID &generator_id)
184
+ CRayStatus TryReadObjectRefStream(
185
+ const CObjectID &generator_id,
186
+ CObjectReference *object_ref_out)
187
+ c_bool StreamingGeneratorIsFinished(const CObjectID &generator_id) const
188
+ pair[CObjectReference, c_bool] PeekObjectRefStream(
189
+ const CObjectID &generator_id)
190
+ CObjectID AllocateDynamicReturnId(
191
+ const CAddress &owner_address,
192
+ const CTaskID &task_id,
193
+ optional[ObjectIDIndexType] put_index)
194
+
195
+ CJobID GetCurrentJobId()
196
+ CTaskID GetCurrentTaskId()
197
+ const c_string GetCurrentTaskName()
198
+ const c_string GetCurrentTaskFunctionName()
199
+ void UpdateTaskIsDebuggerPaused(
200
+ const CTaskID &task_id,
201
+ const c_bool is_debugger_paused)
202
+ int64_t GetCurrentTaskAttemptNumber()
203
+ CNodeID GetCurrentNodeId()
204
+ int64_t GetTaskDepth()
205
+ c_bool GetCurrentTaskRetryExceptions()
206
+ CPlacementGroupID GetCurrentPlacementGroupId()
207
+ CWorkerID GetWorkerID()
208
+ c_bool ShouldCaptureChildTasksInPlacementGroup()
209
+ const CActorID &GetActorId()
210
+ const c_string GetActorName()
211
+ void SetActorTitle(const c_string &title)
212
+ void SetActorReprName(const c_string &repr_name)
213
+ void SetWebuiDisplay(const c_string &key, const c_string &message)
214
+ CTaskID GetCallerId()
215
+ const ResourceMappingType &GetResourceIDs() const
216
+ void RemoveActorHandleReference(const CActorID &actor_id)
217
+ optional[int] GetLocalActorState(const CActorID &actor_id) const
218
+ CActorID DeserializeAndRegisterActorHandle(const c_string &bytes, const
219
+ CObjectID &outer_object_id,
220
+ c_bool add_local_ref)
221
+ CRayStatus SerializeActorHandle(const CActorID &actor_id, c_string
222
+ *bytes,
223
+ CObjectID *c_actor_handle_id)
224
+ ActorHandleSharedPtr GetActorHandle(const CActorID &actor_id) const
225
+ pair[ActorHandleSharedPtr, CRayStatus] GetNamedActorHandle(
226
+ const c_string &name, const c_string &ray_namespace)
227
+ pair[c_vector[c_pair[c_string, c_string]], CRayStatus] ListNamedActors(
228
+ c_bool all_namespaces)
229
+ void AddLocalReference(const CObjectID &object_id)
230
+ void RemoveLocalReference(const CObjectID &object_id)
231
+ void PutObjectIntoPlasma(const CRayObject &object,
232
+ const CObjectID &object_id)
233
+ const CAddress &GetRpcAddress() const
234
+ CRayStatus GetOwnerAddress(const CObjectID &object_id,
235
+ CAddress *owner_address) const
236
+ c_vector[CObjectReference] GetObjectRefs(
237
+ const c_vector[CObjectID] &object_ids) const
238
+
239
+ CRayStatus GetOwnershipInfo(const CObjectID &object_id,
240
+ CAddress *owner_address,
241
+ c_string *object_status)
242
+ void RegisterOwnershipInfoAndResolveFuture(
243
+ const CObjectID &object_id,
244
+ const CObjectID &outer_object_id,
245
+ const CAddress &owner_address,
246
+ const c_string &object_status)
247
+
248
+ CRayStatus Put(const CRayObject &object,
249
+ const c_vector[CObjectID] &contained_object_ids,
250
+ CObjectID *object_id)
251
+ CRayStatus Put(const CRayObject &object,
252
+ const c_vector[CObjectID] &contained_object_ids,
253
+ const CObjectID &object_id)
254
+ CRayStatus CreateOwnedAndIncrementLocalRef(
255
+ c_bool is_mutable,
256
+ const shared_ptr[CBuffer] &metadata,
257
+ const size_t data_size,
258
+ const c_vector[CObjectID] &contained_object_ids,
259
+ CObjectID *object_id, shared_ptr[CBuffer] *data,
260
+ c_bool created_by_worker,
261
+ const unique_ptr[CAddress] &owner_address,
262
+ c_bool inline_small_object)
263
+ CRayStatus CreateExisting(const shared_ptr[CBuffer] &metadata,
264
+ const size_t data_size,
265
+ const CObjectID &object_id,
266
+ const CAddress &owner_address,
267
+ shared_ptr[CBuffer] *data,
268
+ c_bool created_by_worker)
269
+ CRayStatus ExperimentalChannelWriteAcquire(
270
+ const CObjectID &object_id,
271
+ const shared_ptr[CBuffer] &metadata,
272
+ uint64_t data_size,
273
+ int64_t num_readers,
274
+ int64_t timeout_ms,
275
+ shared_ptr[CBuffer] *data)
276
+ CRayStatus ExperimentalChannelWriteRelease(
277
+ const CObjectID &object_id)
278
+ CRayStatus ExperimentalChannelSetError(
279
+ const CObjectID &object_id)
280
+ CRayStatus ExperimentalRegisterMutableObjectWriter(
281
+ const CObjectID &writer_object_id,
282
+ const c_vector[CNodeID] &remote_reader_node_ids)
283
+ CRayStatus ExperimentalRegisterMutableObjectReader(const CObjectID &object_id)
284
+ CRayStatus ExperimentalRegisterMutableObjectReaderRemote(
285
+ const CObjectID &object_id,
286
+ const c_vector[CReaderRefInfo] &remote_reader_ref_info)
287
+ CRayStatus SealOwned(const CObjectID &object_id, c_bool pin_object,
288
+ const unique_ptr[CAddress] &owner_address)
289
+ CRayStatus SealExisting(const CObjectID &object_id, c_bool pin_object,
290
+ const CObjectID &generator_id,
291
+ const unique_ptr[CAddress] &owner_address)
292
+ CRayStatus Get(const c_vector[CObjectID] &ids, int64_t timeout_ms,
293
+ c_vector[shared_ptr[CRayObject]] results)
294
+ CRayStatus GetIfLocal(
295
+ const c_vector[CObjectID] &ids,
296
+ c_vector[shared_ptr[CRayObject]] *results)
297
+ CRayStatus Contains(const CObjectID &object_id, c_bool *has_object,
298
+ c_bool *is_in_plasma)
299
+ CRayStatus Wait(const c_vector[CObjectID] &object_ids, int num_objects,
300
+ int64_t timeout_ms, c_vector[c_bool] *results,
301
+ c_bool fetch_local)
302
+ CRayStatus Delete(const c_vector[CObjectID] &object_ids,
303
+ c_bool local_only)
304
+ CRayStatus GetLocalObjectLocations(
305
+ const c_vector[CObjectID] &object_ids,
306
+ c_vector[optional[CObjectLocation]] *results)
307
+ CRayStatus GetLocationFromOwner(
308
+ const c_vector[CObjectID] &object_ids,
309
+ int64_t timeout_ms,
310
+ c_vector[shared_ptr[CObjectLocation]] *results)
311
+ CRayStatus TriggerGlobalGC()
312
+ CRayStatus ReportGeneratorItemReturns(
313
+ const pair[CObjectID, shared_ptr[CRayObject]] &dynamic_return_object,
314
+ const CObjectID &generator_id,
315
+ const CAddress &caller_address,
316
+ int64_t item_index,
317
+ uint64_t attempt_number,
318
+ shared_ptr[CGeneratorBackpressureWaiter] waiter)
319
+ c_string MemoryUsageString()
320
+ int GetMemoryStoreSize()
321
+
322
+ CWorkerContext &GetWorkerContext()
323
+ void YieldCurrentFiber(CFiberEvent &coroutine_done)
324
+
325
+ unordered_map[CObjectID, pair[size_t, size_t]] GetAllReferenceCounts()
326
+ c_vector[CTaskID] GetPendingChildrenTasks(const CTaskID &task_id) const
327
+
328
+ void GetAsync(const CObjectID &object_id,
329
+ ray_callback_function success_callback,
330
+ void* python_user_callback)
331
+
332
+ CRayStatus PushError(const CJobID &job_id, const c_string &type,
333
+ const c_string &error_message, double timestamp)
334
+ CRayStatus SetResource(const c_string &resource_name,
335
+ const double capacity,
336
+ const CNodeID &client_Id)
337
+
338
+ CJobConfig GetJobConfig()
339
+
340
+ int64_t GetNumTasksSubmitted() const
341
+
342
+ int64_t GetNumLeasesRequested() const
343
+
344
+ int64_t GetLocalMemoryStoreBytesUsed() const
345
+
346
+ void RecordTaskLogStart(
347
+ const CTaskID &task_id,
348
+ int attempt_number,
349
+ const c_string& stdout_path,
350
+ const c_string& stderr_path,
351
+ int64_t stdout_start_offset,
352
+ int64_t stderr_start_offset) const
353
+
354
+ void RecordTaskLogEnd(
355
+ const CTaskID &task_id,
356
+ int attempt_number,
357
+ int64_t stdout_end_offset,
358
+ int64_t stderr_end_offset) const
359
+
360
+ void Exit(const CWorkerExitType exit_type,
361
+ const c_string &detail,
362
+ const shared_ptr[LocalMemoryBuffer] &creation_task_exception_pb_bytes)
363
+
364
+ unordered_map[CLineageReconstructionTask, uint64_t] \
365
+ GetLocalOngoingLineageReconstructionTasks() const
366
+
367
+ cdef cppclass CCoreWorkerOptions "ray::core::CoreWorkerOptions":
368
+ CWorkerType worker_type
369
+ CLanguage language
370
+ c_string store_socket
371
+ c_string raylet_socket
372
+ CJobID job_id
373
+ CGcsClientOptions gcs_options
374
+ c_bool enable_logging
375
+ c_string log_dir
376
+ c_bool install_failure_signal_handler
377
+ c_bool interactive
378
+ c_string node_ip_address
379
+ int node_manager_port
380
+ c_string raylet_ip_address
381
+ c_string driver_name
382
+ c_string stdout_file
383
+ c_string stderr_file
384
+ (CRayStatus(
385
+ const CAddress &caller_address,
386
+ CTaskType task_type,
387
+ const c_string name,
388
+ const CRayFunction &ray_function,
389
+ const unordered_map[c_string, double] &resources,
390
+ const c_vector[shared_ptr[CRayObject]] &args,
391
+ const c_vector[CObjectReference] &arg_refs,
392
+ const c_string debugger_breakpoint,
393
+ const c_string serialized_retry_exception_allowlist,
394
+ c_vector[c_pair[CObjectID, shared_ptr[CRayObject]]] *returns,
395
+ c_vector[c_pair[CObjectID, shared_ptr[CRayObject]]] *dynamic_returns,
396
+ c_vector[c_pair[CObjectID, c_bool]] *streaming_generator_returns,
397
+ shared_ptr[LocalMemoryBuffer]
398
+ &creation_task_exception_pb_bytes,
399
+ c_bool *is_retryable_error,
400
+ c_string *application_error,
401
+ const c_vector[CConcurrencyGroup] &defined_concurrency_groups,
402
+ const c_string name_of_concurrency_group_to_execute,
403
+ c_bool is_reattempt,
404
+ c_bool is_streaming_generator,
405
+ c_bool should_retry_exceptions,
406
+ int64_t generator_backpressure_num_objects
407
+ ) nogil) task_execution_callback
408
+ (void(const CWorkerID &) nogil) on_worker_shutdown
409
+ (CRayStatus() nogil) check_signals
410
+ (void(c_bool) nogil) gc_collect
411
+ (c_vector[c_string](
412
+ const c_vector[CObjectReference] &) nogil) spill_objects
413
+ (int64_t(
414
+ const c_vector[CObjectReference] &,
415
+ const c_vector[c_string] &) nogil) restore_spilled_objects
416
+ (void(
417
+ const c_vector[c_string]&,
418
+ CWorkerType) nogil) delete_spilled_objects
419
+ (void(
420
+ const c_string&,
421
+ const c_vector[c_string]&) nogil) run_on_util_worker_handler
422
+ (void(const CRayObject&) nogil) unhandled_exception_handler
423
+ (void(
424
+ const CTaskID &c_task_id,
425
+ const CRayFunction &ray_function,
426
+ const c_string c_name_of_concurrency_group_to_execute
427
+ ) nogil) cancel_async_task
428
+ (void(c_string *stack_out) nogil) get_lang_stack
429
+ c_bool is_local_mode
430
+ int num_workers
431
+ (c_bool(const CTaskID &) nogil) kill_main
432
+ CCoreWorkerOptions()
433
+ (void() nogil) terminate_asyncio_thread
434
+ c_string serialized_job_config
435
+ int metrics_agent_port
436
+ int runtime_env_hash
437
+ int startup_token
438
+ CClusterID cluster_id
439
+ c_string session_name
440
+ c_string entrypoint
441
+ int64_t worker_launch_time_ms
442
+ int64_t worker_launched_time_ms
443
+
444
+ cdef cppclass CCoreWorkerProcess "ray::core::CoreWorkerProcess":
445
+ @staticmethod
446
+ void Initialize(const CCoreWorkerOptions &options)
447
+ # Only call this in CoreWorker.__cinit__,
448
+ # use CoreWorker.core_worker to access C++ CoreWorker.
449
+
450
+ @staticmethod
451
+ CCoreWorker &GetCoreWorker()
452
+
453
+ @staticmethod
454
+ void Shutdown()
455
+
456
+ @staticmethod
457
+ void RunTaskExecutionLoop()
.venv/lib/python3.11/site-packages/ray/includes/metric.pxd ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from libcpp.string cimport string as c_string
2
+ from libcpp.unordered_map cimport unordered_map
3
+ from libcpp.vector cimport vector as c_vector
4
+
5
+ cdef extern from "opencensus/tags/tag_key.h" nogil:
6
+ cdef cppclass CTagKey "opencensus::tags::TagKey":
7
+ @staticmethod
8
+ CTagKey Register(c_string &name)
9
+ const c_string &name() const
10
+
11
+ cdef extern from "ray/stats/metric.h" nogil:
12
+ cdef cppclass CMetric "ray::stats::Metric":
13
+ CMetric(const c_string &name,
14
+ const c_string &description,
15
+ const c_string &unit,
16
+ const c_vector[c_string] &tag_keys)
17
+ c_string GetName() const
18
+ void Record(double value)
19
+ void Record(double value,
20
+ unordered_map[c_string, c_string] &tags)
21
+
22
+ cdef cppclass CGauge "ray::stats::Gauge":
23
+ CGauge(const c_string &name,
24
+ const c_string &description,
25
+ const c_string &unit,
26
+ const c_vector[c_string] &tag_keys)
27
+
28
+ cdef cppclass CCount "ray::stats::Count":
29
+ CCount(const c_string &name,
30
+ const c_string &description,
31
+ const c_string &unit,
32
+ const c_vector[c_string] &tag_keys)
33
+
34
+ cdef cppclass CSum "ray::stats::Sum":
35
+ CSum(const c_string &name,
36
+ const c_string &description,
37
+ const c_string &unit,
38
+ const c_vector[c_string] &tag_keys)
39
+
40
+ cdef cppclass CHistogram "ray::stats::Histogram":
41
+ CHistogram(const c_string &name,
42
+ const c_string &description,
43
+ const c_string &unit,
44
+ const c_vector[double] &boundaries,
45
+ const c_vector[c_string] &tag_keys)
.venv/lib/python3.11/site-packages/ray/includes/optional.pxd ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Currently Cython does not support std::optional.
2
+ # See: https://github.com/cython/cython/pull/3294
3
+ from libcpp cimport bool
4
+
5
+ cdef extern from "<optional>" namespace "std" nogil:
6
+ cdef cppclass nullopt_t:
7
+ nullopt_t()
8
+
9
+ cdef nullopt_t nullopt
10
+
11
+ cdef cppclass optional[T]:
12
+ ctypedef T value_type
13
+ optional()
14
+ optional(nullopt_t)
15
+ optional(optional&) except +
16
+ optional(T&) except +
17
+ bool has_value()
18
+ T& value()
19
+ T& value_or[U](U& default_value)
20
+ void swap(optional&)
21
+ void reset()
22
+ T& emplace(...)
23
+ T& operator*()
24
+ # T* operator->() # Not Supported
25
+ optional& operator=(optional&)
26
+ optional& operator=[U](U&)
27
+ bool operator bool()
28
+ bool operator!()
29
+ bool operator==[U](optional&, U&)
30
+ bool operator!=[U](optional&, U&)
31
+ bool operator<[U](optional&, U&)
32
+ bool operator>[U](optional&, U&)
33
+ bool operator<=[U](optional&, U&)
34
+ bool operator>=[U](optional&, U&)
35
+
36
+ optional[T] make_optional[T](...) except +
.venv/lib/python3.11/site-packages/ray/includes/ray_config.pxd ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from libcpp cimport bool as c_bool
2
+ from libc.stdint cimport int64_t, uint64_t, uint32_t
3
+ from libcpp.string cimport string as c_string
4
+ from libcpp.unordered_map cimport unordered_map
5
+
6
+
7
+ cdef extern from "ray/common/ray_config.h" nogil:
8
+ cdef cppclass RayConfig "RayConfig":
9
+ @staticmethod
10
+ RayConfig &instance()
11
+
12
+ void initialize(const c_string& config_list)
13
+
14
+ int64_t ray_cookie() const
15
+
16
+ int64_t handler_warning_timeout_ms() const
17
+
18
+ int64_t debug_dump_period_milliseconds() const
19
+
20
+ int64_t object_timeout_milliseconds() const
21
+
22
+ int64_t raylet_client_num_connect_attempts() const
23
+
24
+ int64_t raylet_client_connect_timeout_milliseconds() const
25
+
26
+ int64_t raylet_fetch_timeout_milliseconds() const
27
+
28
+ int64_t kill_worker_timeout_milliseconds() const
29
+
30
+ int64_t worker_register_timeout_seconds() const
31
+
32
+ int64_t redis_db_connect_retries()
33
+
34
+ int64_t redis_db_connect_wait_milliseconds() const
35
+
36
+ int object_manager_pull_timeout_ms() const
37
+
38
+ int object_manager_push_timeout_ms() const
39
+
40
+ uint64_t object_manager_default_chunk_size() const
41
+
42
+ uint32_t maximum_gcs_deletion_batch_size() const
43
+
44
+ int64_t max_direct_call_object_size() const
45
+
46
+ int64_t task_rpc_inlined_bytes_limit() const
47
+
48
+ uint64_t metrics_report_interval_ms() const
49
+
50
+ c_bool enable_timeline() const
51
+
52
+ uint32_t max_grpc_message_size() const
53
+
54
+ c_bool record_ref_creation_sites() const
55
+
56
+ c_string REDIS_CA_CERT() const
57
+
58
+ c_string REDIS_CA_PATH() const
59
+
60
+ c_string REDIS_CLIENT_CERT() const
61
+
62
+ c_string REDIS_CLIENT_KEY() const
63
+
64
+ c_string REDIS_SERVER_NAME() const
65
+
66
+ int64_t health_check_initial_delay_ms() const
67
+
68
+ int64_t health_check_period_ms() const
69
+
70
+ int64_t health_check_timeout_ms() const
71
+
72
+ int64_t health_check_failure_threshold() const
73
+
74
+ uint64_t memory_monitor_refresh_ms() const
75
+
76
+ int64_t grpc_keepalive_time_ms() const
77
+
78
+ int64_t grpc_keepalive_timeout_ms() const
79
+
80
+ int64_t grpc_client_keepalive_time_ms() const
81
+
82
+ int64_t grpc_client_keepalive_timeout_ms() const
83
+
84
+ c_bool enable_autoscaler_v2() const
85
+
86
+ c_string predefined_unit_instance_resources() const
87
+
88
+ c_string custom_unit_instance_resources() const
89
+
90
+ int64_t nums_py_gcs_reconnect_retry() const
91
+
92
+ int64_t py_gcs_connect_timeout_s() const
93
+
94
+ int gcs_rpc_server_reconnect_timeout_s() const
95
+
96
+ int maximum_gcs_destroyed_actor_cached_count() const
97
+
98
+ c_bool record_task_actor_creation_sites() const
.venv/lib/python3.11/site-packages/ray/includes/unique_ids.pxd ADDED
@@ -0,0 +1,218 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from libcpp cimport bool as c_bool
2
+ from libcpp.string cimport string as c_string
3
+ from libc.stdint cimport uint8_t, uint32_t, int64_t
4
+
5
+ cdef extern from "ray/common/id.h" namespace "ray" nogil:
6
+ cdef cppclass CBaseID[T]:
7
+ @staticmethod
8
+ T FromBinary(const c_string &binary)
9
+
10
+ @staticmethod
11
+ T FromHex(const c_string &hex_str)
12
+
13
+ @staticmethod
14
+ const T Nil()
15
+
16
+ @staticmethod
17
+ size_t Size()
18
+
19
+ size_t Hash() const
20
+ c_bool IsNil() const
21
+ c_bool operator==(const CBaseID &rhs) const
22
+ c_bool operator!=(const CBaseID &rhs) const
23
+ const uint8_t *data() const
24
+
25
+ c_string Binary() const
26
+ c_string Hex() const
27
+
28
+ cdef cppclass CUniqueID "ray::UniqueID"(CBaseID):
29
+ CUniqueID()
30
+
31
+ @staticmethod
32
+ size_t Size()
33
+
34
+ @staticmethod
35
+ CUniqueID FromRandom()
36
+
37
+ @staticmethod
38
+ CUniqueID FromBinary(const c_string &binary)
39
+
40
+ @staticmethod
41
+ const CUniqueID Nil()
42
+
43
+ @staticmethod
44
+ size_t Size()
45
+
46
+ cdef cppclass CActorClassID "ray::ActorClassID"(CUniqueID):
47
+
48
+ @staticmethod
49
+ CActorClassID FromBinary(const c_string &binary)
50
+
51
+ @staticmethod
52
+ CActorClassID FromHex(const c_string &hex_str)
53
+
54
+ cdef cppclass CActorID "ray::ActorID"(CBaseID[CActorID]):
55
+
56
+ @staticmethod
57
+ CActorID FromBinary(const c_string &binary)
58
+
59
+ @staticmethod
60
+ CActorID FromHex(const c_string &hex_str)
61
+
62
+ @staticmethod
63
+ const CActorID Nil()
64
+
65
+ @staticmethod
66
+ size_t Size()
67
+
68
+ @staticmethod
69
+ CActorID Of(CJobID job_id, CTaskID parent_task_id,
70
+ int64_t parent_task_counter)
71
+
72
+ CJobID JobId()
73
+
74
+ cdef cppclass CNodeID "ray::NodeID"(CUniqueID):
75
+
76
+ @staticmethod
77
+ CNodeID FromBinary(const c_string &binary)
78
+
79
+ @staticmethod
80
+ CNodeID FromHex(const c_string &hex_str)
81
+
82
+ @staticmethod
83
+ const CNodeID Nil()
84
+
85
+ cdef cppclass CConfigID "ray::ConfigID"(CUniqueID):
86
+
87
+ @staticmethod
88
+ CConfigID FromBinary(const c_string &binary)
89
+
90
+ cdef cppclass CFunctionID "ray::FunctionID"(CUniqueID):
91
+
92
+ @staticmethod
93
+ CFunctionID FromBinary(const c_string &binary)
94
+
95
+ @staticmethod
96
+ CFunctionID FromHex(const c_string &hex_str)
97
+
98
+ cdef cppclass CJobID "ray::JobID"(CBaseID[CJobID]):
99
+
100
+ @staticmethod
101
+ CJobID FromBinary(const c_string &binary)
102
+
103
+ @staticmethod
104
+ CJobID FromHex(const c_string &hex_str)
105
+
106
+ @staticmethod
107
+ const CJobID Nil()
108
+
109
+ @staticmethod
110
+ size_t Size()
111
+
112
+ @staticmethod
113
+ CJobID FromInt(uint32_t value)
114
+
115
+ uint32_t ToInt()
116
+
117
+ cdef cppclass CTaskID "ray::TaskID"(CBaseID[CTaskID]):
118
+
119
+ @staticmethod
120
+ CTaskID FromBinary(const c_string &binary)
121
+
122
+ @staticmethod
123
+ CTaskID FromHex(const c_string &hex_str)
124
+
125
+ @staticmethod
126
+ const CTaskID Nil()
127
+
128
+ @staticmethod
129
+ size_t Size()
130
+
131
+ @staticmethod
132
+ CTaskID ForDriverTask(const CJobID &job_id)
133
+
134
+ @staticmethod
135
+ CTaskID FromRandom(const CJobID &job_id)
136
+
137
+ @staticmethod
138
+ CTaskID ForActorCreationTask(CActorID actor_id)
139
+
140
+ @staticmethod
141
+ CTaskID ForActorTask(CJobID job_id, CTaskID parent_task_id,
142
+ int64_t parent_task_counter, CActorID actor_id)
143
+
144
+ @staticmethod
145
+ CTaskID ForNormalTask(CJobID job_id, CTaskID parent_task_id,
146
+ int64_t parent_task_counter)
147
+
148
+ CActorID ActorId() const
149
+
150
+ CJobID JobId() const
151
+
152
+ cdef cppclass CObjectID" ray::ObjectID"(CBaseID[CObjectID]):
153
+
154
+ @staticmethod
155
+ int64_t MaxObjectIndex()
156
+
157
+ @staticmethod
158
+ CObjectID FromBinary(const c_string &binary)
159
+
160
+ @staticmethod
161
+ CObjectID FromRandom()
162
+
163
+ @staticmethod
164
+ const CObjectID Nil()
165
+
166
+ @staticmethod
167
+ CObjectID FromIndex(const CTaskID &task_id, int64_t index)
168
+
169
+ @staticmethod
170
+ size_t Size()
171
+
172
+ c_bool is_put()
173
+
174
+ int64_t ObjectIndex() const
175
+
176
+ CTaskID TaskId() const
177
+
178
+ cdef cppclass CClusterID "ray::ClusterID"(CUniqueID):
179
+
180
+ @staticmethod
181
+ CClusterID FromBinary(const c_string &binary)
182
+
183
+ @staticmethod
184
+ CClusterID FromHex(const c_string &hex_str)
185
+
186
+ @staticmethod
187
+ CClusterID FromRandom()
188
+
189
+ @staticmethod
190
+ const CClusterID Nil()
191
+
192
+ cdef cppclass CWorkerID "ray::WorkerID"(CUniqueID):
193
+
194
+ @staticmethod
195
+ CWorkerID FromBinary(const c_string &binary)
196
+
197
+ @staticmethod
198
+ CWorkerID FromHex(const c_string &hex_str)
199
+
200
+ cdef cppclass CPlacementGroupID "ray::PlacementGroupID" \
201
+ (CBaseID[CPlacementGroupID]):
202
+
203
+ @staticmethod
204
+ CPlacementGroupID FromBinary(const c_string &binary)
205
+
206
+ @staticmethod
207
+ CPlacementGroupID FromHex(const c_string &hex_str)
208
+
209
+ @staticmethod
210
+ const CPlacementGroupID Nil()
211
+
212
+ @staticmethod
213
+ size_t Size()
214
+
215
+ @staticmethod
216
+ CPlacementGroupID Of(CJobID job_id)
217
+
218
+ ctypedef uint32_t ObjectIDIndexType
.venv/lib/python3.11/site-packages/ray/runtime_env/__init__.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ from ray._private.runtime_env.mpi import mpi_init # noqa: E402,F401
2
+ from ray.runtime_env.runtime_env import RuntimeEnv, RuntimeEnvConfig # noqa: E402,F401
3
+
4
+ __all__ = [
5
+ "RuntimeEnvConfig",
6
+ "RuntimeEnv",
7
+ "mpi_init",
8
+ ]
.venv/lib/python3.11/site-packages/ray/runtime_env/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (423 Bytes). View file
 
.venv/lib/python3.11/site-packages/ray/runtime_env/__pycache__/runtime_env.cpython-311.pyc ADDED
Binary file (31.9 kB). View file
 
.venv/lib/python3.11/site-packages/ray/runtime_env/runtime_env.py ADDED
@@ -0,0 +1,662 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import logging
3
+ import os
4
+ from copy import deepcopy
5
+ from dataclasses import asdict, is_dataclass
6
+ from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
7
+
8
+ import ray
9
+ from ray._private.ray_constants import DEFAULT_RUNTIME_ENV_TIMEOUT_SECONDS
10
+ from ray._private.runtime_env.conda import get_uri as get_conda_uri
11
+ from ray._private.runtime_env.default_impl import get_image_uri_plugin_cls
12
+ from ray._private.runtime_env.pip import get_uri as get_pip_uri
13
+ from ray._private.runtime_env.plugin_schema_manager import RuntimeEnvPluginSchemaManager
14
+ from ray._private.runtime_env.uv import get_uri as get_uv_uri
15
+ from ray._private.runtime_env.validation import OPTION_TO_VALIDATION_FN
16
+ from ray._private.thirdparty.dacite import from_dict
17
+ from ray.core.generated.runtime_env_common_pb2 import (
18
+ RuntimeEnvConfig as ProtoRuntimeEnvConfig,
19
+ )
20
+ from ray.util.annotations import PublicAPI
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
+ @PublicAPI(stability="stable")
26
+ class RuntimeEnvConfig(dict):
27
+ """Used to specify configuration options for a runtime environment.
28
+
29
+ The config is not included when calculating the runtime_env hash,
30
+ which means that two runtime_envs with the same options but different
31
+ configs are considered the same for caching purposes.
32
+
33
+ Args:
34
+ setup_timeout_seconds: The timeout of runtime environment
35
+ creation, timeout is in seconds. The value `-1` means disable
36
+ timeout logic, except `-1`, `setup_timeout_seconds` cannot be
37
+ less than or equal to 0. The default value of `setup_timeout_seconds`
38
+ is 600 seconds.
39
+ eager_install: Indicates whether to install the runtime environment
40
+ on the cluster at `ray.init()` time, before the workers are leased.
41
+ This flag is set to `True` by default.
42
+ """
43
+
44
+ known_fields: Set[str] = {"setup_timeout_seconds", "eager_install", "log_files"}
45
+
46
+ _default_config: Dict = {
47
+ "setup_timeout_seconds": DEFAULT_RUNTIME_ENV_TIMEOUT_SECONDS,
48
+ "eager_install": True,
49
+ "log_files": [],
50
+ }
51
+
52
+ def __init__(
53
+ self,
54
+ setup_timeout_seconds: int = DEFAULT_RUNTIME_ENV_TIMEOUT_SECONDS,
55
+ eager_install: bool = True,
56
+ log_files: Optional[List[str]] = None,
57
+ ):
58
+ super().__init__()
59
+ if not isinstance(setup_timeout_seconds, int):
60
+ raise TypeError(
61
+ "setup_timeout_seconds must be of type int, "
62
+ f"got: {type(setup_timeout_seconds)}"
63
+ )
64
+ elif setup_timeout_seconds <= 0 and setup_timeout_seconds != -1:
65
+ raise ValueError(
66
+ "setup_timeout_seconds must be greater than zero "
67
+ f"or equals to -1, got: {setup_timeout_seconds}"
68
+ )
69
+ self["setup_timeout_seconds"] = setup_timeout_seconds
70
+
71
+ if not isinstance(eager_install, bool):
72
+ raise TypeError(
73
+ f"eager_install must be a boolean. got {type(eager_install)}"
74
+ )
75
+ self["eager_install"] = eager_install
76
+
77
+ if log_files is not None:
78
+ if not isinstance(log_files, list):
79
+ raise TypeError(
80
+ "log_files must be a list of strings or None, got "
81
+ f"{log_files} with type {type(log_files)}."
82
+ )
83
+ for file_name in log_files:
84
+ if not isinstance(file_name, str):
85
+ raise TypeError("Each item in log_files must be a string.")
86
+ else:
87
+ log_files = self._default_config["log_files"]
88
+
89
+ self["log_files"] = log_files
90
+
91
+ @staticmethod
92
+ def parse_and_validate_runtime_env_config(
93
+ config: Union[Dict, "RuntimeEnvConfig"]
94
+ ) -> "RuntimeEnvConfig":
95
+ if isinstance(config, RuntimeEnvConfig):
96
+ return config
97
+ elif isinstance(config, Dict):
98
+ unknown_fields = set(config.keys()) - RuntimeEnvConfig.known_fields
99
+ if len(unknown_fields):
100
+ logger.warning(
101
+ "The following unknown entries in the runtime_env_config "
102
+ f"dictionary will be ignored: {unknown_fields}."
103
+ )
104
+ config_dict = dict()
105
+ for field in RuntimeEnvConfig.known_fields:
106
+ if field in config:
107
+ config_dict[field] = config[field]
108
+ return RuntimeEnvConfig(**config_dict)
109
+ else:
110
+ raise TypeError(
111
+ "runtime_env['config'] must be of type dict or RuntimeEnvConfig, "
112
+ f"got: {type(config)}"
113
+ )
114
+
115
+ @classmethod
116
+ def default_config(cls):
117
+ return RuntimeEnvConfig(**cls._default_config)
118
+
119
+ def build_proto_runtime_env_config(self) -> ProtoRuntimeEnvConfig:
120
+ runtime_env_config = ProtoRuntimeEnvConfig()
121
+ runtime_env_config.setup_timeout_seconds = self["setup_timeout_seconds"]
122
+ runtime_env_config.eager_install = self["eager_install"]
123
+ if self["log_files"] is not None:
124
+ runtime_env_config.log_files.extend(self["log_files"])
125
+ return runtime_env_config
126
+
127
+ @classmethod
128
+ def from_proto(cls, runtime_env_config: ProtoRuntimeEnvConfig):
129
+ setup_timeout_seconds = runtime_env_config.setup_timeout_seconds
130
+ # Cause python class RuntimeEnvConfig has validate to avoid
131
+ # setup_timeout_seconds equals zero, so setup_timeout_seconds
132
+ # on RuntimeEnvConfig is zero means other Language(except python)
133
+ # dosn't assign value to setup_timeout_seconds. So runtime_env_agent
134
+ # assign the default value to setup_timeout_seconds.
135
+ if setup_timeout_seconds == 0:
136
+ setup_timeout_seconds = cls._default_config["setup_timeout_seconds"]
137
+ return cls(
138
+ setup_timeout_seconds=setup_timeout_seconds,
139
+ eager_install=runtime_env_config.eager_install,
140
+ log_files=list(runtime_env_config.log_files),
141
+ )
142
+
143
+ def to_dict(self) -> Dict:
144
+ return dict(deepcopy(self))
145
+
146
+
147
+ # Due to circular reference, field config can only be assigned a value here
148
+ OPTION_TO_VALIDATION_FN[
149
+ "config"
150
+ ] = RuntimeEnvConfig.parse_and_validate_runtime_env_config
151
+
152
+
153
+ @PublicAPI
154
+ class RuntimeEnv(dict):
155
+ """This class is used to define a runtime environment for a job, task,
156
+ or actor.
157
+
158
+ See :ref:`runtime-environments` for detailed documentation.
159
+
160
+ This class can be used interchangeably with an unstructured dictionary
161
+ in the relevant API calls.
162
+
163
+ Can specify a runtime environment whole job, whether running a script
164
+ directly on the cluster, using Ray Job submission, or using Ray Client:
165
+
166
+ .. code-block:: python
167
+
168
+ from ray.runtime_env import RuntimeEnv
169
+ # Starting a single-node local Ray cluster
170
+ ray.init(runtime_env=RuntimeEnv(...))
171
+
172
+ .. code-block:: python
173
+
174
+ from ray.runtime_env import RuntimeEnv
175
+ # Connecting to remote cluster using Ray Client
176
+ ray.init("ray://123.456.7.89:10001", runtime_env=RuntimeEnv(...))
177
+
178
+ Can specify different runtime environments per-actor or per-task using
179
+ ``.options()`` or the ``@ray.remote`` decorator:
180
+
181
+ .. code-block:: python
182
+
183
+ from ray.runtime_env import RuntimeEnv
184
+ # Invoke a remote task that runs in a specified runtime environment.
185
+ f.options(runtime_env=RuntimeEnv(...)).remote()
186
+
187
+ # Instantiate an actor that runs in a specified runtime environment.
188
+ actor = SomeClass.options(runtime_env=RuntimeEnv(...)).remote()
189
+
190
+ # Specify a runtime environment in the task definition. Future invocations via
191
+ # `g.remote()` use this runtime environment unless overridden by using
192
+ # `.options()` as above.
193
+ @ray.remote(runtime_env=RuntimeEnv(...))
194
+ def g():
195
+ pass
196
+
197
+ # Specify a runtime environment in the actor definition. Future instantiations
198
+ # via `MyClass.remote()` use this runtime environment unless overridden by
199
+ # using `.options()` as above.
200
+ @ray.remote(runtime_env=RuntimeEnv(...))
201
+ class MyClass:
202
+ pass
203
+
204
+ Here are some examples of RuntimeEnv initialization:
205
+
206
+ .. code-block:: python
207
+
208
+ # Example for using conda
209
+ RuntimeEnv(conda={
210
+ "channels": ["defaults"], "dependencies": ["codecov"]})
211
+ RuntimeEnv(conda="pytorch_p36") # Found on DLAMIs
212
+
213
+ # Example for using container
214
+ RuntimeEnv(
215
+ container={"image": "anyscale/ray-ml:nightly-py38-cpu",
216
+ "run_options": ["--cap-drop SYS_ADMIN","--log-level=debug"]})
217
+
218
+ # Example for set env_vars
219
+ RuntimeEnv(env_vars={"OMP_NUM_THREADS": "32", "TF_WARNINGS": "none"})
220
+
221
+ # Example for set pip
222
+ RuntimeEnv(
223
+ pip={"packages":["tensorflow", "requests"], "pip_check": False,
224
+ "pip_version": "==22.0.2;python_version=='3.8.11'"})
225
+
226
+ # Example for using image_uri
227
+ RuntimeEnv(
228
+ image_uri="rayproject/ray:2.39.0-py312-cu123")
229
+
230
+ Args:
231
+ py_modules: List of URIs (either in the GCS or external
232
+ storage), each of which is a zip file that Ray unpacks and
233
+ inserts into the PYTHONPATH of the workers.
234
+ working_dir: URI (either in the GCS or external storage) of a zip
235
+ file that Ray unpacks in the directory of each task/actor.
236
+ pip: Either a list of pip packages, a string
237
+ containing the path to a pip requirements.txt file, or a Python
238
+ dictionary that has three fields: 1) ``packages`` (required, List[str]): a
239
+ list of pip packages, 2) ``pip_check`` (optional, bool): whether enable
240
+ pip check at the end of pip install, defaults to False.
241
+ 3) ``pip_version`` (optional, str): the version of pip, Ray prepends
242
+ the package name "pip" in front of the ``pip_version`` to form the final
243
+ requirement string, the syntax of a requirement specifier is defined in
244
+ full in PEP 508.
245
+ uv: Either a list of pip packages, or a Python dictionary that has one field:
246
+ 1) ``packages`` (required, List[str]).
247
+ conda: Either the conda YAML config, the name of a
248
+ local conda env (e.g., "pytorch_p36"), or the path to a conda
249
+ environment.yaml file.
250
+ Ray automatically injects the dependency into the conda
251
+ env to ensure compatibility with the cluster Ray. Ray may automatically
252
+ mangle the conda name to avoid conflicts between runtime envs.
253
+ This field can't be specified at the same time as the 'pip' field.
254
+ To use pip with conda, specify your pip dependencies within
255
+ the conda YAML config:
256
+ https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html#create-env-file-manually
257
+ container: Require a given (Docker) container image,
258
+ The Ray worker process runs in a container with this image.
259
+ This parameter only works alone, or with the ``config`` or
260
+ ``env_vars`` parameters.
261
+ The `run_options` list spec is here:
262
+ https://docs.docker.com/engine/reference/run/
263
+ env_vars: Environment variables to set.
264
+ worker_process_setup_hook: (Experimental) The setup hook that's
265
+ called after workers start and before Tasks and Actors are scheduled.
266
+ A module name (string type) or callable (function) can be passed.
267
+ When a module name is passed, Ray worker should be able to access the
268
+ module name. When a callable is passed, callable should be serializable.
269
+ When a runtime env is specified by job submission API,
270
+ only a module name (string) is allowed.
271
+ nsight: Dictionary mapping nsight profile option name to it's value.
272
+ config: config for runtime environment. Either
273
+ a dict or a RuntimeEnvConfig. Field: (1) setup_timeout_seconds, the
274
+ timeout of runtime environment creation, timeout is in seconds.
275
+ image_uri: URI to a container image. The Ray worker process runs
276
+ in a container with this image. This parameter only works alone,
277
+ or with the ``config`` or ``env_vars`` parameters.
278
+ """
279
+
280
    # Every field name that RuntimeEnv treats as a first-class option.
    # Keys outside this set are treated as custom plugin entries and are
    # surfaced via `plugins()`.
    known_fields: Set[str] = {
        "py_modules",
        "java_jars",
        "working_dir",
        "conda",
        "pip",
        "uv",
        "container",
        "excludes",
        "env_vars",
        "_ray_release",
        "_ray_commit",
        "_inject_current_ray",
        "config",
        # TODO(SongGuyang): We add this because the test
        # `test_experimental_package_github` set a `docker`
        # field which is not supported. We should remove it
        # with the test.
        "docker",
        "worker_process_setup_hook",
        "_nsight",
        "mpi",
        "image_uri",
    }

    # Internal/experimental fields; readable only through `get_extension()`.
    extensions_fields: Set[str] = {
        "_ray_release",
        "_ray_commit",
        "_inject_current_ray",
    }
310
+
311
    def __init__(
        self,
        *,
        py_modules: Optional[List[str]] = None,
        working_dir: Optional[str] = None,
        pip: Optional[List[str]] = None,
        conda: Optional[Union[Dict[str, str], str]] = None,
        container: Optional[Dict[str, str]] = None,
        env_vars: Optional[Dict[str, str]] = None,
        worker_process_setup_hook: Optional[Union[Callable, str]] = None,
        nsight: Optional[Union[str, Dict[str, str]]] = None,
        config: Optional[Union[Dict, RuntimeEnvConfig]] = None,
        _validate: bool = True,
        mpi: Optional[Dict] = None,
        image_uri: Optional[str] = None,
        uv: Optional[List[str]] = None,
        **kwargs,
    ):
        super().__init__()

        # Gather every explicitly-passed option (plus unknown/plugin kwargs)
        # into one dict before committing it to `self`.
        runtime_env = kwargs
        if py_modules is not None:
            runtime_env["py_modules"] = py_modules
        if working_dir is not None:
            runtime_env["working_dir"] = working_dir
        if pip is not None:
            runtime_env["pip"] = pip
        if uv is not None:
            runtime_env["uv"] = uv
        if conda is not None:
            runtime_env["conda"] = conda
        if nsight is not None:
            # Stored under the internal "_nsight" key.
            runtime_env["_nsight"] = nsight
        if container is not None:
            runtime_env["container"] = container
        if env_vars is not None:
            runtime_env["env_vars"] = env_vars
        if config is not None:
            runtime_env["config"] = config
        if worker_process_setup_hook is not None:
            runtime_env["worker_process_setup_hook"] = worker_process_setup_hook
        if mpi is not None:
            runtime_env["mpi"] = mpi
        if image_uri is not None:
            runtime_env["image_uri"] = image_uri
        # "java_jars" can only arrive through **kwargs; re-assign it so the
        # key is present explicitly when truthy.
        if runtime_env.get("java_jars"):
            runtime_env["java_jars"] = runtime_env.get("java_jars")

        # NOTE: dict.update bypasses `__setitem__`; per-field validation is
        # re-applied in the OPTION_TO_VALIDATION_FN loop below.
        self.update(runtime_env)

        # Blindly trust that the runtime_env has already been validated.
        # This is dangerous and should only be used internally (e.g., on the
        # deserialization codepath.
        if not _validate:
            return

        # Cross-field constraint: at most one of conda / pip / uv may be set.
        # (bool values are summed as 0/1 here.)
        if (self.get("conda") is not None) + (self.get("pip") is not None) + (
            self.get("uv") is not None
        ) > 1:
            raise ValueError(
                "The 'pip' field, 'uv' field, and 'conda' field of "
                "runtime_env cannot be specified at the same time.\n"
                f"specified pip field: {self.get('pip')}\n"
                f"specified conda field: {self.get('conda')}\n"
                f"specified uv field: {self.get('uv')}\n"
                "To use pip with conda, please only set the 'conda'"
                "field, and specify your pip dependencies within the conda YAML "
                "config dict: see https://conda.io/projects/conda/en/latest/"
                "user-guide/tasks/manage-environments.html"
                "#create-env-file-manually"
            )

        # "container" is only compatible with "config" and "env_vars".
        if self.get("container"):
            invalid_keys = set(runtime_env.keys()) - {"container", "config", "env_vars"}
            if len(invalid_keys):
                raise ValueError(
                    "The 'container' field currently cannot be used "
                    "together with other fields of runtime_env. "
                    f"Specified fields: {invalid_keys}"
                )

            logger.warning(
                "The `container` runtime environment field is DEPRECATED and will be "
                "removed after July 31, 2025. Use `image_uri` instead. See "
                "https://docs.ray.io/en/latest/serve/advanced-guides/multi-app-container.html."  # noqa
            )

        # "image_uri" compatibility is delegated to the image-uri plugin.
        if self.get("image_uri"):
            image_uri_plugin_cls = get_image_uri_plugin_cls()
            invalid_keys = (
                set(runtime_env.keys()) - image_uri_plugin_cls.get_compatible_keys()
            )
            if len(invalid_keys):
                raise ValueError(
                    "The 'image_uri' field currently cannot be used "
                    "together with other fields of runtime_env. "
                    f"Specified fields: {invalid_keys}"
                )

        # Delete-and-reassign each validatable option so the value passes
        # through `__setitem__`, which applies OPTION_TO_VALIDATION_FN.
        for option, validate_fn in OPTION_TO_VALIDATION_FN.items():
            option_val = self.get(option)
            if option_val is not None:
                del self[option]
                self[option] = option_val

        # Pin the Ray commit when an isolated pip/conda env will be built.
        if "_ray_commit" not in self:
            if self.get("pip") or self.get("conda"):
                self["_ray_commit"] = ray.__commit__

        # Used for testing wheels that have not yet been merged into master.
        # If this is set to True, then we do not inject Ray into the conda
        # or pip dependencies.
        if "_inject_current_ray" not in self:
            if "RAY_RUNTIME_ENV_LOCAL_DEV_MODE" in os.environ:
                self["_inject_current_ray"] = True

        # NOTE(architkulkarni): This allows worker caching code in C++ to check
        # if a runtime env is empty without deserializing it. This is a catch-
        # all; for validated inputs we won't set the key if the value is None.
        if all(val is None for val in self.values()):
            self.clear()
432
+
433
+ def __setitem__(self, key: str, value: Any) -> None:
434
+ if is_dataclass(value):
435
+ jsonable_type = asdict(value)
436
+ else:
437
+ jsonable_type = value
438
+ RuntimeEnvPluginSchemaManager.validate(key, jsonable_type)
439
+ res_value = jsonable_type
440
+ if key in RuntimeEnv.known_fields and key in OPTION_TO_VALIDATION_FN:
441
+ res_value = OPTION_TO_VALIDATION_FN[key](jsonable_type)
442
+ if res_value is None:
443
+ return
444
+ return super().__setitem__(key, res_value)
445
+
446
+ def set(self, name: str, value: Any) -> None:
447
+ self.__setitem__(name, value)
448
+
449
+ def get(self, name, default=None, data_class=None):
450
+ if name not in self:
451
+ return default
452
+ if not data_class:
453
+ return self.__getitem__(name)
454
+ else:
455
+ return from_dict(data_class=data_class, data=self.__getitem__(name))
456
+
457
+ @classmethod
458
+ def deserialize(cls, serialized_runtime_env: str) -> "RuntimeEnv": # noqa: F821
459
+ return cls(_validate=False, **json.loads(serialized_runtime_env))
460
+
461
+ def serialize(self) -> str:
462
+ # To ensure the accuracy of Proto, `__setitem__` can only guarantee the
463
+ # accuracy of a certain field, not the overall accuracy
464
+ runtime_env = type(self)(_validate=True, **self)
465
+ return json.dumps(
466
+ runtime_env,
467
+ sort_keys=True,
468
+ )
469
+
470
+ def to_dict(self) -> Dict:
471
+ runtime_env_dict = dict(deepcopy(self))
472
+
473
+ # Replace strongly-typed RuntimeEnvConfig with a dict to allow the returned
474
+ # dict to work properly as a field in a dataclass. Details in issue #26986
475
+ if runtime_env_dict.get("config"):
476
+ runtime_env_dict["config"] = runtime_env_dict["config"].to_dict()
477
+
478
+ return runtime_env_dict
479
+
480
    def has_working_dir(self) -> bool:
        # True even for an empty-string working_dir (only None counts as unset).
        return self.get("working_dir") is not None

    def working_dir_uri(self) -> Optional[str]:
        # URI of the working_dir archive, or None when unset.
        return self.get("working_dir")

    def py_modules_uris(self) -> List[str]:
        # Copy of the py_modules URI list; empty list when unset.
        if "py_modules" in self:
            return list(self["py_modules"])
        return []

    def conda_uri(self) -> Optional[str]:
        # Cache URI derived from the conda field, or None when unset.
        if "conda" in self:
            return get_conda_uri(self)
        return None

    def pip_uri(self) -> Optional[str]:
        # Cache URI derived from the pip field, or None when unset.
        if "pip" in self:
            return get_pip_uri(self)
        return None

    def uv_uri(self) -> Optional[str]:
        # Cache URI derived from the uv field, or None when unset.
        if "uv" in self:
            return get_uv_uri(self)
        return None

    def plugin_uris(self) -> List[str]:
        """Not implemented yet, always returns an empty list."""
        return []

    def working_dir(self) -> str:
        # Unlike `working_dir_uri`, an unset field maps to "" here.
        return self.get("working_dir", "")

    def py_modules(self) -> List[str]:
        # Copy of the py_modules list; empty list when unset.
        if "py_modules" in self:
            return list(self["py_modules"])
        return []

    def java_jars(self) -> List[str]:
        # Copy of the java_jars list; empty list when unset.
        if "java_jars" in self:
            return list(self["java_jars"])
        return []

    def mpi(self) -> Optional[Union[str, Dict[str, str]]]:
        return self.get("mpi", None)

    def nsight(self) -> Optional[Union[str, Dict[str, str]]]:
        # nsight options are stored under the internal "_nsight" key.
        return self.get("_nsight", None)

    def env_vars(self) -> Dict:
        return self.get("env_vars", {})
531
+
532
+ def has_conda(self) -> str:
533
+ if self.get("conda"):
534
+ return True
535
+ return False
536
+
537
    def conda_env_name(self) -> Optional[str]:
        # A string-valued "conda" field names a preexisting conda env;
        # any other shape (e.g. inline YAML dict) yields None.
        if not self.has_conda() or not isinstance(self["conda"], str):
            return None
        return self["conda"]

    def conda_config(self) -> Optional[str]:
        # A dict-valued "conda" field is an inline environment config;
        # serialize it with sorted keys so equal configs compare equal.
        if not self.has_conda() or not isinstance(self["conda"], dict):
            return None
        return json.dumps(self["conda"], sort_keys=True)
546
+
547
    def has_pip(self) -> bool:
        # Truthiness check: an empty pip spec counts as "no pip field".
        if self.get("pip"):
            return True
        return False

    def has_uv(self) -> bool:
        # Truthiness check: an empty uv spec counts as "no uv field".
        if self.get("uv"):
            return True
        return False
556
+
557
    def virtualenv_name(self) -> Optional[str]:
        # A string-valued "pip" field names a preexisting virtualenv.
        if not self.has_pip() or not isinstance(self["pip"], str):
            return None
        return self["pip"]

    def pip_config(self) -> Dict:
        if not self.has_pip() or isinstance(self["pip"], str):
            return {}
        # Re-assigning routes the value through `__setitem__`, which parses
        # and validates the pip field into its canonical dict form.
        self["pip"] = self["pip"]
        return self["pip"]

    def uv_config(self) -> Dict:
        if not self.has_uv() or isinstance(self["uv"], str):
            return {}
        # Re-assigning routes the value through `__setitem__`, which parses
        # and validates the uv field into its canonical dict form.
        self["uv"] = self["uv"]
        return self["uv"]
575
+
576
+ def get_extension(self, key) -> Optional[str]:
577
+ if key not in RuntimeEnv.extensions_fields:
578
+ raise ValueError(
579
+ f"Extension key must be one of {RuntimeEnv.extensions_fields}, "
580
+ f"got: {key}"
581
+ )
582
+ return self.get(key)
583
+
584
    def has_py_container(self) -> bool:
        # Truthiness check on the (deprecated) "container" field.
        if self.get("container"):
            return True
        return False

    def py_container_image(self) -> Optional[str]:
        # Container image name; "" when the container dict omits "image".
        if not self.has_py_container():
            return None
        return self["container"].get("image", "")

    def py_container_worker_path(self) -> Optional[str]:
        # Worker entrypoint path inside the container; "" when omitted.
        if not self.has_py_container():
            return None
        return self["container"].get("worker_path", "")

    def py_container_run_options(self) -> Optional[List]:
        # Docker run options list; None when no container is configured.
        # (Annotation corrected from `List`: the no-container path returns None.)
        if not self.has_py_container():
            return None
        return self["container"].get("run_options", [])
603
+
604
    def image_uri(self) -> Optional[str]:
        # Container image URI set via the `image_uri` option, if any.
        return self.get("image_uri")

    def plugins(self) -> List[Tuple[str, Any]]:
        # Every key outside `known_fields` is treated as a custom plugin
        # entry and returned as a (name, config) pair.
        result = list()
        for key, value in self.items():
            if key not in self.known_fields:
                result.append((key, value))
        return result
613
+
614
+
615
+ def _merge_runtime_env(
616
+ parent: Optional[RuntimeEnv],
617
+ child: Optional[RuntimeEnv],
618
+ override: bool = False,
619
+ ) -> Optional[RuntimeEnv]:
620
+ """Merge the parent and child runtime environments.
621
+
622
+ If override = True, the child's runtime env overrides the parent's
623
+ runtime env in the event of a conflict.
624
+
625
+ Merging happens per key (i.e., "conda", "pip", ...), but
626
+ "env_vars" are merged per env var key.
627
+
628
+ It returns None if Ray fails to merge runtime environments because
629
+ of a conflict and `override = False`.
630
+
631
+ Args:
632
+ parent: Parent runtime env.
633
+ child: Child runtime env.
634
+ override: If True, the child's runtime env overrides
635
+ conflicting fields.
636
+ Returns:
637
+ The merged runtime env's if Ray successfully merges them.
638
+ None if the runtime env's conflict. Empty dict if
639
+ parent and child are both None.
640
+ """
641
+ if parent is None:
642
+ parent = {}
643
+ if child is None:
644
+ child = {}
645
+
646
+ parent = deepcopy(parent)
647
+ child = deepcopy(child)
648
+ parent_env_vars = parent.pop("env_vars", {})
649
+ child_env_vars = child.pop("env_vars", {})
650
+
651
+ if not override:
652
+ if set(parent.keys()).intersection(set(child.keys())):
653
+ return None
654
+ if set(parent_env_vars.keys()).intersection(set(child_env_vars.keys())): # noqa
655
+ return None
656
+
657
+ parent.update(child)
658
+ parent_env_vars.update(child_env_vars)
659
+ if parent_env_vars:
660
+ parent["env_vars"] = parent_env_vars
661
+
662
+ return parent
.venv/lib/python3.11/site-packages/ray/widgets/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ from ray.widgets.render import Template
2
+ from ray.widgets.util import make_table_html_repr
3
+
4
+ __all__ = ["Template", "make_table_html_repr"]
.venv/lib/python3.11/site-packages/ray/widgets/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (362 Bytes). View file
 
.venv/lib/python3.11/site-packages/ray/widgets/__pycache__/render.cpython-311.pyc ADDED
Binary file (2.69 kB). View file
 
.venv/lib/python3.11/site-packages/ray/widgets/__pycache__/util.cpython-311.pyc ADDED
Binary file (9.29 kB). View file
 
.venv/lib/python3.11/site-packages/ray/widgets/render.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pathlib
2
+ from typing import List
3
+
4
+ from ray.util.annotations import DeveloperAPI
5
+
6
+
7
@DeveloperAPI
class Template:
    """Minimal HTML templating backed by files in ./templates."""

    def __init__(self, file: str):
        template_path = pathlib.Path(__file__).parent / "templates" / file
        with open(template_path, "r") as f:
            self.template = f.read()

    def render(self, **kwargs) -> str:
        """Substitute `{{ key }}` placeholders with the supplied values.

        List values are concatenated into a single string first; falsy
        values render as the empty string.

        Returns:
            The template text with each `{{ key }}` marker replaced.
        """
        rendered = self.template
        for key, value in kwargs.items():
            if isinstance(value, List):
                value = "".join(value)
            rendered = rendered.replace("{{ " + key + " }}", value if value else "")
        return rendered

    @staticmethod
    def list_templates() -> List[pathlib.Path]:
        """Return the paths of all *.html.j2 files under ../templates/."""
        return (pathlib.Path(__file__).parent / "templates").glob("*.html.j2")
.venv/lib/python3.11/site-packages/ray/widgets/templates/context.html.j2 ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ <div class="lm-Widget p-Widget lm-Panel p-Panel jp-Cell-outputWrapper">
2
+ <div style="margin-left: 50px;display: flex;flex-direction: row;align-items: center">
3
+ {{ context_logo }}
4
+ {{ context_table }}
5
+ </div>
6
+ </div>
.venv/lib/python3.11/site-packages/ray/widgets/templates/context_dashrow.html.j2 ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ <tr>
2
+ <td style="text-align: left"><b>Dashboard:</b></td>
3
+ <td style="text-align: left"><b><a href="{{ dashboard_url }}" target="_blank">{{ dashboard_url }}</a></b></td>
4
+ </tr>
.venv/lib/python3.11/site-packages/ray/widgets/templates/context_logo.html.j2 ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <div class="jp-RenderedHTMLCommon" style="display: flex; flex-direction: row;">
2
+ <svg viewBox="0 0 567 224" fill="none" xmlns="http://www.w3.org/2000/svg" style="height: 3em;">
3
+ <g clip-path="url(#clip0_4338_178347)">
4
+ <path d="M341.29 165.561H355.29L330.13 129.051C345.63 123.991 354.21 112.051 354.21 94.2307C354.21 71.3707 338.72 58.1807 311.88 58.1807H271V165.561H283.27V131.661H311.8C314.25 131.661 316.71 131.501 319.01 131.351L341.25 165.561H341.29ZM283.29 119.851V70.0007H311.82C331.3 70.0007 342.34 78.2907 342.34 94.5507C342.34 111.271 331.34 119.861 311.82 119.861L283.29 119.851ZM451.4 138.411L463.4 165.561H476.74L428.74 58.1807H416L367.83 165.561H380.83L392.83 138.411H451.4ZM446.19 126.601H398L422 72.1407L446.24 126.601H446.19ZM526.11 128.741L566.91 58.1807H554.35L519.99 114.181L485.17 58.1807H472.44L514.01 129.181V165.541H526.13V128.741H526.11Z" fill="var(--jp-ui-font-color0)"/>
5
+ <path d="M82.35 104.44C84.0187 97.8827 87.8248 92.0678 93.1671 87.9146C98.5094 83.7614 105.083 81.5067 111.85 81.5067C118.617 81.5067 125.191 83.7614 130.533 87.9146C135.875 92.0678 139.681 97.8827 141.35 104.44H163.75C164.476 101.562 165.622 98.8057 167.15 96.2605L127.45 56.5605C121.071 60.3522 113.526 61.6823 106.235 60.3005C98.9443 58.9187 92.4094 54.9203 87.8602 49.0574C83.3109 43.1946 81.0609 35.8714 81.5332 28.4656C82.0056 21.0599 85.1679 14.0819 90.4252 8.8446C95.6824 3.60726 102.672 0.471508 110.08 0.0272655C117.487 -0.416977 124.802 1.86091 130.647 6.4324C136.493 11.0039 140.467 17.5539 141.821 24.8501C143.175 32.1463 141.816 39.6859 138 46.0505L177.69 85.7505C182.31 82.9877 187.58 81.4995 192.962 81.4375C198.345 81.3755 203.648 82.742 208.33 85.3976C213.012 88.0532 216.907 91.9029 219.616 96.5544C222.326 101.206 223.753 106.492 223.753 111.875C223.753 117.258 222.326 122.545 219.616 127.197C216.907 131.848 213.012 135.698 208.33 138.353C203.648 141.009 198.345 142.375 192.962 142.313C187.58 142.251 182.31 140.763 177.69 138L138 177.7C141.808 184.071 143.155 191.614 141.79 198.91C140.424 206.205 136.44 212.75 130.585 217.313C124.731 221.875 117.412 224.141 110.004 223.683C102.596 223.226 95.6103 220.077 90.3621 214.828C85.1139 209.58 81.9647 202.595 81.5072 195.187C81.0497 187.779 83.3154 180.459 87.878 174.605C92.4405 168.751 98.9853 164.766 106.281 163.401C113.576 162.035 121.119 163.383 127.49 167.19L167.19 127.49C165.664 124.941 164.518 122.182 163.79 119.3H141.39C139.721 125.858 135.915 131.673 130.573 135.826C125.231 139.98 118.657 142.234 111.89 142.234C105.123 142.234 98.5494 139.98 93.2071 135.826C87.8648 131.673 84.0587 125.858 82.39 119.3H60C58.1878 126.495 53.8086 132.78 47.6863 136.971C41.5641 141.163 34.1211 142.972 26.7579 142.059C19.3947 141.146 12.6191 137.574 7.70605 132.014C2.79302 126.454 0.0813599 119.29 0.0813599 111.87C0.0813599 104.451 2.79302 97.2871 7.70605 91.7272C12.6191 86.1673 19.3947 82.5947 26.7579 81.6817C34.1211 80.7686 
41.5641 82.5781 47.6863 86.7696C53.8086 90.9611 58.1878 97.2456 60 104.44H82.35ZM100.86 204.32C103.407 206.868 106.759 208.453 110.345 208.806C113.93 209.159 117.527 208.258 120.522 206.256C123.517 204.254 125.725 201.276 126.771 197.828C127.816 194.38 127.633 190.677 126.253 187.349C124.874 184.021 122.383 181.274 119.205 179.577C116.027 177.88 112.359 177.337 108.826 178.042C105.293 178.746 102.113 180.654 99.8291 183.44C97.5451 186.226 96.2979 189.718 96.3 193.32C96.2985 195.364 96.7006 197.388 97.4831 199.275C98.2656 201.163 99.4132 202.877 100.86 204.32ZM204.32 122.88C206.868 120.333 208.453 116.981 208.806 113.396C209.159 109.811 208.258 106.214 206.256 103.219C204.254 100.223 201.275 98.0151 197.827 96.97C194.38 95.9249 190.676 96.1077 187.348 97.4873C184.02 98.8669 181.274 101.358 179.577 104.536C177.879 107.714 177.337 111.382 178.041 114.915C178.746 118.448 180.653 121.627 183.439 123.911C186.226 126.195 189.717 127.443 193.32 127.44C195.364 127.443 197.388 127.042 199.275 126.259C201.163 125.476 202.878 124.328 204.32 122.88ZM122.88 19.4205C120.333 16.8729 116.981 15.2876 113.395 14.9347C109.81 14.5817 106.213 15.483 103.218 17.4849C100.223 19.4868 98.0146 22.4654 96.9696 25.9131C95.9245 29.3608 96.1073 33.0642 97.4869 36.3922C98.8665 39.7202 101.358 42.4668 104.535 44.1639C107.713 45.861 111.381 46.4036 114.914 45.6992C118.447 44.9949 121.627 43.0871 123.911 40.301C126.195 37.515 127.442 34.0231 127.44 30.4205C127.44 28.3772 127.038 26.3539 126.255 24.4664C125.473 22.5788 124.326 20.8642 122.88 19.4205ZM19.42 100.86C16.8725 103.408 15.2872 106.76 14.9342 110.345C14.5813 113.93 15.4826 117.527 17.4844 120.522C19.4863 123.518 22.4649 125.726 25.9127 126.771C29.3604 127.816 33.0638 127.633 36.3918 126.254C39.7198 124.874 42.4664 122.383 44.1635 119.205C45.8606 116.027 46.4032 112.359 45.6988 108.826C44.9944 105.293 43.0866 102.114 40.3006 99.8296C37.5145 97.5455 34.0227 96.2983 30.42 96.3005C26.2938 96.3018 22.337 97.9421 19.42 100.86ZM100.86 
100.86C98.3125 103.408 96.7272 106.76 96.3742 110.345C96.0213 113.93 96.9226 117.527 98.9244 120.522C100.926 123.518 103.905 125.726 107.353 126.771C110.8 127.816 114.504 127.633 117.832 126.254C121.16 124.874 123.906 122.383 125.604 119.205C127.301 116.027 127.843 112.359 127.139 108.826C126.434 105.293 124.527 102.114 121.741 99.8296C118.955 97.5455 115.463 96.2983 111.86 96.3005C109.817 96.299 107.793 96.701 105.905 97.4835C104.018 98.2661 102.303 99.4136 100.86 100.86Z" fill="#00AEEF"/>
6
+ </g>
7
+ <defs>
8
+ <clipPath id="clip0_4338_178347">
9
+ <rect width="566.93" height="223.75" fill="white"/>
10
+ </clipPath>
11
+ </defs>
12
+ </svg>
13
+ </div>
.venv/lib/python3.11/site-packages/ray/widgets/templates/context_table.html.j2 ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <table class="jp-RenderedHTMLCommon" style="border-collapse: collapse;color: var(--jp-ui-font-color1);font-size: var(--jp-ui-font-size1);">
2
+ <tr>
3
+ <td style="text-align: left"><b>Python version:</b></td>
4
+ <td style="text-align: left"><b>{{ python_version }}</b></td>
5
+ </tr>
6
+ <tr>
7
+ <td style="text-align: left"><b>Ray version:</b></td>
8
+ <td style="text-align: left"><b>{{ ray_version }}</b></td>
9
+ </tr>
10
+ {{ dashboard_row }}
11
+ </table>
.venv/lib/python3.11/site-packages/ray/widgets/templates/divider.html.j2 ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ <div class="vDivider"></div>
2
+ <style>
3
+ .vDivider {
4
+ border-left-width: var(--jp-border-width);
5
+ border-left-color: var(--jp-border-color0);
6
+ border-left-style: solid;
7
+ margin: 0.5em 1em 0.5em 1em;
8
+ }
9
+ </style>
.venv/lib/python3.11/site-packages/ray/widgets/templates/rendered_html_common.html.j2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ <div class='jp-RenderedHTMLCommon'>
2
+ {{ content }}
3
+ </div>