koichi12 commited on
Commit
c8ebe32
·
verified ·
1 Parent(s): 6f8c8ab

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes. See the raw diff for the full change set.
Files changed (50) hide show
  1. .gitattributes +1 -0
  2. .venv/lib/python3.11/site-packages/ray/air/__init__.py +22 -0
  3. .venv/lib/python3.11/site-packages/ray/air/config.py +766 -0
  4. .venv/lib/python3.11/site-packages/ray/air/constants.py +94 -0
  5. .venv/lib/python3.11/site-packages/ray/air/data_batch_type.py +11 -0
  6. .venv/lib/python3.11/site-packages/ray/air/execution/__init__.py +12 -0
  7. .venv/lib/python3.11/site-packages/ray/air/execution/__pycache__/__init__.cpython-311.pyc +0 -0
  8. .venv/lib/python3.11/site-packages/ray/air/execution/resources/__init__.py +12 -0
  9. .venv/lib/python3.11/site-packages/ray/air/execution/resources/__pycache__/fixed.cpython-311.pyc +0 -0
  10. .venv/lib/python3.11/site-packages/ray/air/execution/resources/__pycache__/placement_group.cpython-311.pyc +0 -0
  11. .venv/lib/python3.11/site-packages/ray/air/execution/resources/__pycache__/request.cpython-311.pyc +0 -0
  12. .venv/lib/python3.11/site-packages/ray/air/execution/resources/__pycache__/resource_manager.cpython-311.pyc +0 -0
  13. .venv/lib/python3.11/site-packages/ray/air/execution/resources/fixed.py +147 -0
  14. .venv/lib/python3.11/site-packages/ray/air/execution/resources/placement_group.py +214 -0
  15. .venv/lib/python3.11/site-packages/ray/air/execution/resources/request.py +255 -0
  16. .venv/lib/python3.11/site-packages/ray/air/execution/resources/resource_manager.py +155 -0
  17. .venv/lib/python3.11/site-packages/ray/air/result.py +283 -0
  18. .venv/lib/python3.11/site-packages/ray/air/session.py +1 -0
  19. .venv/lib/python3.11/site-packages/ray/air/util/__init__.py +0 -0
  20. .venv/lib/python3.11/site-packages/ray/air/util/tensor_extensions/pandas.py +1451 -0
  21. .venv/lib/python3.11/site-packages/ray/air/util/torch_dist.py +191 -0
  22. .venv/lib/python3.11/site-packages/ray/air/util/transform_pyarrow.py +39 -0
  23. .venv/lib/python3.11/site-packages/ray/serve/_private/__pycache__/deployment_state.cpython-311.pyc +3 -0
  24. .venv/lib/python3.11/site-packages/ray/serve/_private/benchmarks/__init__.py +0 -0
  25. .venv/lib/python3.11/site-packages/ray/serve/_private/benchmarks/__pycache__/__init__.cpython-311.pyc +0 -0
  26. .venv/lib/python3.11/site-packages/ray/serve/_private/benchmarks/__pycache__/common.cpython-311.pyc +0 -0
  27. .venv/lib/python3.11/site-packages/ray/serve/_private/benchmarks/__pycache__/handle_noop_latency.cpython-311.pyc +0 -0
  28. .venv/lib/python3.11/site-packages/ray/serve/_private/benchmarks/__pycache__/handle_throughput.cpython-311.pyc +0 -0
  29. .venv/lib/python3.11/site-packages/ray/serve/_private/benchmarks/__pycache__/http_noop_latency.cpython-311.pyc +0 -0
  30. .venv/lib/python3.11/site-packages/ray/serve/_private/benchmarks/__pycache__/microbenchmark.cpython-311.pyc +0 -0
  31. .venv/lib/python3.11/site-packages/ray/serve/_private/benchmarks/__pycache__/proxy_benchmark.cpython-311.pyc +0 -0
  32. .venv/lib/python3.11/site-packages/ray/serve/_private/benchmarks/common.py +276 -0
  33. .venv/lib/python3.11/site-packages/ray/serve/_private/benchmarks/handle_noop_latency.py +34 -0
  34. .venv/lib/python3.11/site-packages/ray/serve/_private/benchmarks/handle_throughput.py +62 -0
  35. .venv/lib/python3.11/site-packages/ray/serve/_private/benchmarks/http_noop_latency.py +32 -0
  36. .venv/lib/python3.11/site-packages/ray/serve/_private/benchmarks/microbenchmark.py +182 -0
  37. .venv/lib/python3.11/site-packages/ray/serve/_private/benchmarks/proxy_benchmark.py +294 -0
  38. .venv/lib/python3.11/site-packages/ray/serve/_private/benchmarks/serialization/__init__.py +0 -0
  39. .venv/lib/python3.11/site-packages/ray/serve/_private/benchmarks/serialization/__pycache__/__init__.cpython-311.pyc +0 -0
  40. .venv/lib/python3.11/site-packages/ray/serve/_private/benchmarks/serialization/__pycache__/common.cpython-311.pyc +0 -0
  41. .venv/lib/python3.11/site-packages/ray/serve/_private/benchmarks/serialization/__pycache__/serialization_benchmark.cpython-311.pyc +0 -0
  42. .venv/lib/python3.11/site-packages/ray/serve/_private/benchmarks/serialization/common.py +29 -0
  43. .venv/lib/python3.11/site-packages/ray/serve/_private/benchmarks/serialization/serialization_benchmark.py +163 -0
  44. .venv/lib/python3.11/site-packages/ray/serve/_private/benchmarks/streaming/__init__.py +0 -0
  45. .venv/lib/python3.11/site-packages/ray/serve/_private/benchmarks/streaming/__pycache__/__init__.cpython-311.pyc +0 -0
  46. .venv/lib/python3.11/site-packages/ray/serve/_private/benchmarks/streaming/__pycache__/common.cpython-311.pyc +0 -0
  47. .venv/lib/python3.11/site-packages/ray/serve/_private/benchmarks/streaming/__pycache__/streaming_core_throughput.cpython-311.pyc +0 -0
  48. .venv/lib/python3.11/site-packages/ray/serve/_private/benchmarks/streaming/__pycache__/streaming_grpc_throughput.cpython-311.pyc +0 -0
  49. .venv/lib/python3.11/site-packages/ray/serve/_private/benchmarks/streaming/__pycache__/streaming_handle_throughput.cpython-311.pyc +0 -0
  50. .venv/lib/python3.11/site-packages/ray/serve/_private/benchmarks/streaming/__pycache__/streaming_http_throughput.cpython-311.pyc +0 -0
.gitattributes CHANGED
@@ -155,3 +155,4 @@ tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/_
155
  .venv/lib/python3.11/site-packages/ray/data/__pycache__/dataset.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
156
  .venv/lib/python3.11/site-packages/ray/data/__pycache__/read_api.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
157
  .venv/lib/python3.11/site-packages/numpy/ma/tests/__pycache__/test_extras.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
 
 
155
  .venv/lib/python3.11/site-packages/ray/data/__pycache__/dataset.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
156
  .venv/lib/python3.11/site-packages/ray/data/__pycache__/read_api.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
157
  .venv/lib/python3.11/site-packages/numpy/ma/tests/__pycache__/test_extras.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
158
+ .venv/lib/python3.11/site-packages/ray/serve/_private/__pycache__/deployment_state.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
.venv/lib/python3.11/site-packages/ray/air/__init__.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Public re-exports for the `ray.air` package.
from ray.air.config import (
    CheckpointConfig,
    DatasetConfig,
    FailureConfig,
    RunConfig,
    ScalingConfig,
)
from ray.air.data_batch_type import DataBatchType
from ray.air.execution.resources.request import AcquiredResources, ResourceRequest
from ray.air.result import Result

# Names exported via `from ray.air import *`.
__all__ = [
    "DataBatchType",
    "RunConfig",
    "Result",
    "ScalingConfig",
    "DatasetConfig",
    "FailureConfig",
    "CheckpointConfig",
    "AcquiredResources",
    "ResourceRequest",
]
.venv/lib/python3.11/site-packages/ray/air/config.py ADDED
@@ -0,0 +1,766 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from collections import Counter, defaultdict
3
+ from dataclasses import _MISSING_TYPE, dataclass, fields
4
+ from pathlib import Path
5
+ from typing import (
6
+ TYPE_CHECKING,
7
+ Any,
8
+ Callable,
9
+ Dict,
10
+ List,
11
+ Mapping,
12
+ Optional,
13
+ Tuple,
14
+ Union,
15
+ )
16
+
17
+ import pyarrow.fs
18
+
19
+ from ray._private.ray_constants import RESOURCE_CONSTRAINT_PREFIX
20
+ from ray._private.storage import _get_storage_uri
21
+ from ray._private.thirdparty.tabulate.tabulate import tabulate
22
+ from ray.data.preprocessor import Preprocessor
23
+ from ray.util.annotations import Deprecated, PublicAPI
24
+ from ray.widgets import Template, make_table_html_repr
25
+
26
+ if TYPE_CHECKING:
27
+ from ray.train import SyncConfig
28
+ from ray.tune.callback import Callback
29
+ from ray.tune.execution.placement_groups import PlacementGroupFactory
30
+ from ray.tune.experimental.output import AirVerbosity
31
+ from ray.tune.search.sample import Domain
32
+ from ray.tune.stopper import Stopper
33
+ from ray.tune.utils.log import Verbosity
34
+
35
+
36
# Dict[str, List] is to support `tune.grid_search`:
# TODO(sumanthratna/matt): Upstream this to Tune.
SampleRange = Union["Domain", Dict[str, List]]


# Valid values for `CheckpointConfig.checkpoint_score_order` (see below).
MAX = "max"
MIN = "min"
# Sentinel default used to detect whether a caller explicitly passed a
# deprecated config field (compared against with `!=` in `__post_init__`).
_DEPRECATED_VALUE = "DEPRECATED"

DATASET_CONFIG_DEPRECATION_MSG = """
Use `ray.train.DataConfig` instead of DatasetConfig to configure data ingest for training. See https://docs.ray.io/en/releases-2.6.3/ray-air/check-ingest.html#migrating-from-the-legacy-datasetconfig-api for more details.
"""  # noqa: E501


logger = logging.getLogger(__name__)
51
+
52
+
53
def _repr_dataclass(obj, *, default_values: Optional[Dict[str, Any]] = None) -> str:
    """Represent a dataclass, showing only fields that differ from their defaults.

    Unlike the auto-generated dataclass ``__repr__``, which prints every
    field, this helper omits fields that still hold their default value.

    Args:
        obj: The dataclass instance to represent.
        default_values: Optional mapping from field name to default value.
            Use this for defaults that are produced dynamically (e.g., in
            ``__post_init__`` or by a ``default_factory``). Fields absent
            from this mapping fall back to the default declared on the
            dataclass field itself.

    Returns:
        A ``ClassName(field=value, ...)`` style representation.
    """
    overrides = {} if default_values is None else default_values

    def _same(lhs, rhs):
        # None must be special-cased because of a pyarrow bug:
        # https://github.com/apache/arrow/issues/38535
        if lhs is None or rhs is None:
            return lhs is rhs
        return lhs == rhs

    # Required fields (no declared default) are always shown; optional fields
    # are shown only when their current value differs from the default.
    shown = {
        f.name: getattr(obj, f.name)
        for f in fields(obj)
        if isinstance(f.default, _MISSING_TYPE)
        or not _same(getattr(obj, f.name), overrides.get(f.name, f.default))
    }

    rendered = ", ".join(f"{name}={value!r}" for name, value in shown.items())
    return f"{obj.__class__.__name__}({rendered})"
98
+
99
+
100
@dataclass
@PublicAPI(stability="stable")
class ScalingConfig:
    """Configuration for scaling training.

    For more details, see :ref:`train_scaling_config`.

    Args:
        trainer_resources: Resources to allocate for the training coordinator.
            The training coordinator launches the worker group and executes
            the training function per worker, and this process does NOT require
            GPUs. The coordinator is always scheduled on the same node as the
            rank 0 worker, so one example use case is to set a minimum amount
            of resources (e.g. CPU memory) required by the rank 0 node.
            By default, this assigns 1 CPU to the training coordinator.
        num_workers: The number of workers (Ray actors) to launch.
            Each worker will reserve 1 CPU by default. The number of CPUs
            reserved by each worker can be overridden with the
            ``resources_per_worker`` argument.
        use_gpu: If True, training will be done on GPUs (1 per worker).
            Defaults to False. The number of GPUs reserved by each
            worker can be overridden with the ``resources_per_worker``
            argument.
        resources_per_worker: If specified, the resources
            defined in this Dict is reserved for each worker.
            Define the ``"CPU"`` key (case-sensitive) to
            override the number of CPUs used by each worker.
            This can also be used to request :ref:`custom resources <custom-resources>`.
        placement_strategy: The placement strategy to use for the
            placement group of the Ray actors. See :ref:`Placement Group
            Strategies <pgroup-strategy>` for the possible options.
        accelerator_type: [Experimental] If specified, Ray Train will launch the
            training coordinator and workers on the nodes with the specified type
            of accelerators.
            See :ref:`the available accelerator types <accelerator_types>`.
            Ensure that your cluster has instances with the specified accelerator type
            or is able to autoscale to fulfill the request.

    Example:

        .. code-block:: python

            from ray.train import ScalingConfig
            scaling_config = ScalingConfig(
                # Number of distributed workers.
                num_workers=2,
                # Turn on/off GPU.
                use_gpu=True,
                # Assign extra CPU/GPU/custom resources per worker.
                resources_per_worker={"GPU": 1, "CPU": 1, "memory": 1e9, "custom": 1.0},
                # Try to schedule workers on different nodes.
                placement_strategy="SPREAD",
            )

    """

    trainer_resources: Optional[Union[Dict, SampleRange]] = None
    num_workers: Union[int, SampleRange] = 1
    use_gpu: Union[bool, SampleRange] = False
    resources_per_worker: Optional[Union[Dict, SampleRange]] = None
    placement_strategy: Union[str, SampleRange] = "PACK"
    accelerator_type: Optional[str] = None

    def __post_init__(self):
        # Validate that `use_gpu` and any explicit "GPU" entry in
        # `resources_per_worker` don't contradict each other.
        if self.resources_per_worker:
            if not self.use_gpu and self.num_gpus_per_worker > 0:
                raise ValueError(
                    "`use_gpu` is False but `GPU` was found in "
                    "`resources_per_worker`. Either set `use_gpu` to True or "
                    "remove `GPU` from `resources_per_worker."
                )

            if self.use_gpu and self.num_gpus_per_worker == 0:
                raise ValueError(
                    "`use_gpu` is True but `GPU` is set to 0 in "
                    "`resources_per_worker`. Either set `use_gpu` to False or "
                    "request a positive number of `GPU` in "
                    "`resources_per_worker."
                )

    def __repr__(self):
        return _repr_dataclass(self)

    def _repr_html_(self) -> str:
        return make_table_html_repr(obj=self, title=type(self).__name__)

    def __eq__(self, o: "ScalingConfig") -> bool:
        # Two configs are equal iff they resolve to the same placement group
        # request, not necessarily field-by-field equality.
        if not isinstance(o, type(self)):
            return False
        return self.as_placement_group_factory() == o.as_placement_group_factory()

    @property
    def _resources_per_worker_not_none(self):
        # Resolved per-worker resource dict with defaults applied and
        # zero-valued entries dropped.
        if self.resources_per_worker is None:
            if self.use_gpu:
                # Note that we don't request any CPUs, which avoids possible
                # scheduling contention. Generally nodes have many more CPUs than
                # GPUs, so not requesting a CPU does not lead to oversubscription.
                resources_per_worker = {"GPU": 1}
            else:
                resources_per_worker = {"CPU": 1}
        else:
            resources_per_worker = {
                k: v for k, v in self.resources_per_worker.items() if v != 0
            }

        if self.use_gpu:
            resources_per_worker.setdefault("GPU", 1)

        if self.accelerator_type:
            # Request a tiny fraction of the accelerator-type constraint
            # resource so workers land on nodes with that accelerator.
            accelerator = f"{RESOURCE_CONSTRAINT_PREFIX}{self.accelerator_type}"
            resources_per_worker.setdefault(accelerator, 0.001)
        return resources_per_worker

    @property
    def _trainer_resources_not_none(self):
        # Resolved trainer (coordinator) resource dict with defaults applied.
        if self.trainer_resources is None:
            if self.num_workers:
                # For Google Colab, don't allocate resources to the base Trainer.
                # Colab only has 2 CPUs, and because of this resource scarcity,
                # we have to be careful on where we allocate resources. Since Colab
                # is not distributed, the concern about many parallel Ray Tune trials
                # leading to all Trainers being scheduled on the head node if we set
                # `trainer_resources` to 0 is no longer applicable.
                try:
                    import google.colab  # noqa: F401

                    trainer_num_cpus = 0
                except ImportError:
                    trainer_num_cpus = 1
            else:
                # If there are no additional workers, then always reserve 1 CPU for
                # the Trainer.
                trainer_num_cpus = 1

            trainer_resources = {"CPU": trainer_num_cpus}
        else:
            trainer_resources = {
                k: v for k, v in self.trainer_resources.items() if v != 0
            }

        return trainer_resources

    @property
    def total_resources(self):
        """Map of total resources required for the trainer."""
        total_resource_map = defaultdict(float, self._trainer_resources_not_none)
        for k, value in self._resources_per_worker_not_none.items():
            total_resource_map[k] += value * self.num_workers
        return dict(total_resource_map)

    @property
    def num_cpus_per_worker(self):
        """The number of CPUs to set per worker."""
        return self._resources_per_worker_not_none.get("CPU", 0)

    @property
    def num_gpus_per_worker(self):
        """The number of GPUs to set per worker."""
        return self._resources_per_worker_not_none.get("GPU", 0)

    @property
    def additional_resources_per_worker(self):
        """Resources per worker, not including CPU or GPU resources."""
        return {
            k: v
            for k, v in self._resources_per_worker_not_none.items()
            if k not in ["CPU", "GPU"]
        }

    def as_placement_group_factory(self) -> "PlacementGroupFactory":
        """Returns a PlacementGroupFactory to specify resources for Tune."""
        from ray.tune.execution.placement_groups import PlacementGroupFactory

        trainer_bundle = self._trainer_resources_not_none
        worker_bundle = self._resources_per_worker_not_none

        # Colocate Trainer and rank0 worker by merging their bundles
        # Note: This empty bundle is required so that the Tune actor manager schedules
        # the Trainable onto the combined bundle while taking none of its resources,
        # rather than a non-empty head bundle.
        combined_bundle = dict(Counter(trainer_bundle) + Counter(worker_bundle))
        bundles = [{}, combined_bundle] + [worker_bundle] * (self.num_workers - 1)
        return PlacementGroupFactory(bundles, strategy=self.placement_strategy)

    @classmethod
    def from_placement_group_factory(
        cls, pgf: "PlacementGroupFactory"
    ) -> "ScalingConfig":
        """Create a ScalingConfig from a Tune's PlacementGroupFactory

        Note that this is only needed for ResourceChangingScheduler, which
        modifies a trial's PlacementGroupFactory but doesn't propagate
        the changes to ScalingConfig. TrainTrainable needs to reconstruct
        a ScalingConfig from on the trial's PlacementGroupFactory.
        """

        # pgf.bundles = [{trainer + worker}, {worker}, ..., {worker}]
        num_workers = len(pgf.bundles)
        combined_resources = pgf.bundles[0]
        resources_per_worker = pgf.bundles[-1]
        use_gpu = bool(resources_per_worker.get("GPU", False))
        placement_strategy = pgf.strategy

        # In `as_placement_group_factory`, we merged the trainer resource into the
        # first worker resources bundle. We need to calculate the resources diff to
        # get the trainer resources.
        # Note: If there's only one worker, we won't be able to calculate the diff.
        # We'll have empty trainer bundle and assign all resources to the worker.
        trainer_resources = dict(
            Counter(combined_resources) - Counter(resources_per_worker)
        )

        return ScalingConfig(
            trainer_resources=trainer_resources,
            num_workers=num_workers,
            use_gpu=use_gpu,
            resources_per_worker=resources_per_worker,
            placement_strategy=placement_strategy,
        )
320
+
321
+
322
@dataclass
@Deprecated(DATASET_CONFIG_DEPRECATION_MSG)
class DatasetConfig:
    """Configuration for ingest of a single Dataset.

    See :ref:`the AIR Dataset configuration guide <data-ingest-torch>` for
    usage examples.

    This config defines how the Dataset should be read into the DataParallelTrainer.
    It configures the preprocessing, splitting, and ingest strategy per-dataset.

    DataParallelTrainers declare default DatasetConfigs for each dataset passed in the
    ``datasets`` argument. Users have the opportunity to selectively override these
    configs by passing the ``dataset_config`` argument. Trainers can also define user
    customizable values (e.g., XGBoostTrainer doesn't support streaming ingest).

    Args:
        fit: Whether to fit preprocessors on this dataset. This can be set on at most
            one dataset at a time. True by default for the "train" dataset only.
        split: Whether the dataset should be split across multiple workers.
            True by default for the "train" dataset only.
        required: Whether to raise an error if the Dataset isn't provided by the user.
            False by default.
        transform: Whether to transform the dataset with the fitted preprocessor.
            This must be enabled at least for the dataset that is fit.
            True by default.
        max_object_store_memory_fraction [Experimental]: The maximum fraction
            of Ray's shared-memory object store to use for the dataset. The
            default value is -1, meaning that the preprocessed dataset should
            be cached, which may cause spilling if its size is larger than the
            object store's capacity. Pipelined ingest (all other values, 0 or
            higher) is experimental. Note that the absolute memory capacity
            used is based on the object store capacity at invocation time; this
            does not currently cover autoscaling cases where the size of the
            cluster may change.
        global_shuffle: Whether to enable global shuffle (per pipeline window
            in streaming mode). Note that this is an expensive all-to-all operation,
            and most likely you want to use local shuffle instead.
            See https://docs.ray.io/en/master/data/faq.html and
            https://docs.ray.io/en/master/ray-air/check-ingest.html.
            False by default.
        randomize_block_order: Whether to randomize the iteration order over blocks.
            The main purpose of this is to prevent data fetching hotspots in the
            cluster when running many parallel workers / trials on the same data.
            We recommend enabling it always. True by default.
        per_epoch_preprocessor [Experimental]: A preprocessor to re-apply on
            each pass of the dataset. The main use case for this is to apply a
            random transform on a training dataset on each epoch. The
            per-epoch preprocessor will be applied *after* all other
            preprocessors and in parallel with the dataset consumer.
        use_stream_api: Deprecated. Use max_object_store_memory_fraction instead.
        stream_window_size: Deprecated. Use max_object_store_memory_fraction instead.
    """

    # TODO(ekl) could we unify DataParallelTrainer and Trainer so the same data ingest
    # strategy applies to all Trainers?

    fit: Optional[bool] = None
    split: Optional[bool] = None
    required: Optional[bool] = None
    transform: Optional[bool] = None
    max_object_store_memory_fraction: Optional[float] = None
    global_shuffle: Optional[bool] = None
    randomize_block_order: Optional[bool] = None
    per_epoch_preprocessor: Optional["Preprocessor"] = None
    # Deprecated.
    use_stream_api: Optional[int] = None
    stream_window_size: Optional[int] = None

    def __post_init__(self):
        # The whole class is deprecated: instantiating it always raises,
        # pointing users at `ray.train.DataConfig` instead.
        raise DeprecationWarning(DATASET_CONFIG_DEPRECATION_MSG)
393
+
394
+
395
@dataclass
@PublicAPI(stability="stable")
class FailureConfig:
    """Configuration related to failure handling of each training/tuning run.

    Args:
        max_failures: Tries to recover a run at least this many times.
            Will recover from the latest checkpoint if present.
            Setting to -1 will lead to infinite recovery retries.
            Setting to 0 will disable retries. Defaults to 0.
        fail_fast: Whether to fail upon the first error.
            If fail_fast='raise' provided, the original error during training will be
            immediately raised. fail_fast='raise' can easily leak resources and
            should be used with caution.
    """

    max_failures: int = 0
    fail_fast: Union[bool, str] = False

    def __post_init__(self):
        # Same check as in TuneController
        # `fail_fast` must be a bool or the (case-insensitive) string "raise".
        if not (isinstance(self.fail_fast, bool) or self.fail_fast.upper() == "RAISE"):
            raise ValueError(
                "fail_fast must be one of {bool, 'raise'}. " f"Got {self.fail_fast}."
            )

        # Same check as in tune.run
        # Retrying is meaningless when the run is configured to fail fast.
        if self.fail_fast and self.max_failures != 0:
            raise ValueError(
                f"max_failures must be 0 if fail_fast={repr(self.fail_fast)}."
            )

    def __repr__(self):
        return _repr_dataclass(self)

    def _repr_html_(self):
        # Render the settings as a scrollable HTML table (used in notebooks).
        return Template("scrollableTable.html.j2").render(
            table=tabulate(
                {
                    "Setting": ["Max failures", "Fail fast"],
                    "Value": [self.max_failures, self.fail_fast],
                },
                tablefmt="html",
                showindex=False,
                headers="keys",
            ),
            max_height="none",
        )
443
+
444
+
445
@dataclass
@PublicAPI(stability="stable")
class CheckpointConfig:
    """Configurable parameters for defining the checkpointing strategy.

    Default behavior is to persist all checkpoints to disk. If
    ``num_to_keep`` is set, the default retention policy is to keep the
    checkpoints with maximum timestamp, i.e. the most recent checkpoints.

    Args:
        num_to_keep: The number of checkpoints to keep
            on disk for this run. If a checkpoint is persisted to disk after
            there are already this many checkpoints, then an existing
            checkpoint will be deleted. If this is ``None`` then checkpoints
            will not be deleted. Must be >= 1.
        checkpoint_score_attribute: The attribute that will be used to
            score checkpoints to determine which checkpoints should be kept
            on disk when there are greater than ``num_to_keep`` checkpoints.
            This attribute must be a key from the checkpoint
            dictionary which has a numerical value. Per default, the last
            checkpoints will be kept.
        checkpoint_score_order: Either "max" or "min".
            If "max", then checkpoints with highest values of
            ``checkpoint_score_attribute`` will be kept.
            If "min", then checkpoints with lowest values of
            ``checkpoint_score_attribute`` will be kept.
        checkpoint_frequency: Number of iterations between checkpoints. If 0
            this will disable checkpointing.
            Please note that most trainers will still save one checkpoint at
            the end of training.
            This attribute is only supported
            by trainers that don't take in custom training loops.
        checkpoint_at_end: If True, will save a checkpoint at the end of training.
            This attribute is only supported by trainers that don't take in
            custom training loops. Defaults to True for trainers that support it
            and False for generic function trainables.
        _checkpoint_keep_all_ranks: This experimental config is deprecated.
            This behavior is now controlled by reporting `checkpoint=None`
            in the workers that shouldn't persist a checkpoint.
            For example, if you only want the rank 0 worker to persist a checkpoint
            (e.g., in standard data parallel training), then you should save and
            report a checkpoint if `ray.train.get_context().get_world_rank() == 0`
            and `None` otherwise.
        _checkpoint_upload_from_workers: This experimental config is deprecated.
            Uploading checkpoint directly from the worker is now the default behavior.
    """

    num_to_keep: Optional[int] = None
    checkpoint_score_attribute: Optional[str] = None
    checkpoint_score_order: Optional[str] = MAX
    checkpoint_frequency: Optional[int] = 0
    checkpoint_at_end: Optional[bool] = None
    # Deprecated fields default to the `_DEPRECATED_VALUE` sentinel so that
    # `__post_init__` can detect when a caller explicitly set them.
    _checkpoint_keep_all_ranks: Optional[bool] = _DEPRECATED_VALUE
    _checkpoint_upload_from_workers: Optional[bool] = _DEPRECATED_VALUE

    def __post_init__(self):
        # Any value other than the sentinel means the deprecated flag was
        # explicitly passed, so surface the deprecation immediately.
        if self._checkpoint_keep_all_ranks != _DEPRECATED_VALUE:
            raise DeprecationWarning(
                "The experimental `_checkpoint_keep_all_ranks` config is deprecated. "
                "This behavior is now controlled by reporting `checkpoint=None` "
                "in the workers that shouldn't persist a checkpoint. "
                "For example, if you only want the rank 0 worker to persist a "
                "checkpoint (e.g., in standard data parallel training), "
                "then you should save and report a checkpoint if "
                "`ray.train.get_context().get_world_rank() == 0` "
                "and `None` otherwise."
            )

        if self._checkpoint_upload_from_workers != _DEPRECATED_VALUE:
            raise DeprecationWarning(
                "The experimental `_checkpoint_upload_from_workers` config is "
                "deprecated. Uploading checkpoint directly from the worker is "
                "now the default behavior."
            )

        if self.num_to_keep is not None and self.num_to_keep <= 0:
            raise ValueError(
                f"Received invalid num_to_keep: "
                f"{self.num_to_keep}. "
                f"Must be None or an integer >= 1."
            )
        if self.checkpoint_score_order not in (MAX, MIN):
            raise ValueError(
                f"checkpoint_score_order must be either " f'"{MAX}" or "{MIN}".'
            )

        if self.checkpoint_frequency < 0:
            raise ValueError(
                f"checkpoint_frequency must be >=0, got {self.checkpoint_frequency}"
            )

    def __repr__(self):
        return _repr_dataclass(self)

    def _repr_html_(self) -> str:
        # Render the settings as a scrollable HTML table (used in notebooks),
        # substituting human-readable placeholders for unset values.
        if self.num_to_keep is None:
            num_to_keep_repr = "All"
        else:
            num_to_keep_repr = self.num_to_keep

        if self.checkpoint_score_attribute is None:
            checkpoint_score_attribute_repr = "Most recent"
        else:
            checkpoint_score_attribute_repr = self.checkpoint_score_attribute

        if self.checkpoint_at_end is None:
            checkpoint_at_end_repr = ""
        else:
            checkpoint_at_end_repr = self.checkpoint_at_end

        return Template("scrollableTable.html.j2").render(
            table=tabulate(
                {
                    "Setting": [
                        "Number of checkpoints to keep",
                        "Checkpoint score attribute",
                        "Checkpoint score order",
                        "Checkpoint frequency",
                        "Checkpoint at end",
                    ],
                    "Value": [
                        num_to_keep_repr,
                        checkpoint_score_attribute_repr,
                        self.checkpoint_score_order,
                        self.checkpoint_frequency,
                        checkpoint_at_end_repr,
                    ],
                },
                tablefmt="html",
                showindex=False,
                headers="keys",
            ),
            max_height="none",
        )

    @property
    def _tune_legacy_checkpoint_score_attr(self) -> Optional[str]:
        """Same as ``checkpoint_score_attr`` in ``tune.run``.

        Only used for Legacy API compatibility.
        """
        if self.checkpoint_score_attribute is None:
            return self.checkpoint_score_attribute
        # Legacy Tune encodes "min" ordering as a "min-" prefix on the attribute.
        prefix = ""
        if self.checkpoint_score_order == MIN:
            prefix = "min-"
        return f"{prefix}{self.checkpoint_score_attribute}"
592
+
593
+
594
@dataclass
@PublicAPI(stability="stable")
class RunConfig:
    """Runtime configuration for training and tuning runs.

    Upon resuming from a training or tuning run checkpoint,
    Ray Train/Tune will automatically apply the RunConfig from
    the previously checkpointed run.

    Args:
        name: Name of the trial or experiment. If not provided, will be deduced
            from the Trainable.
        storage_path: [Beta] Path where all results and checkpoints are persisted.
            Can be a local directory or a destination on cloud storage.
            For multi-node training/tuning runs, this must be set to a
            shared storage location (e.g., S3, NFS).
            This defaults to the local ``~/ray_results`` directory.
        storage_filesystem: [Beta] A custom filesystem to use for storage.
            If this is provided, `storage_path` should be a path with its
            prefix stripped (e.g., `s3://bucket/path` -> `bucket/path`).
        failure_config: Failure mode configuration.
        checkpoint_config: Checkpointing configuration.
        sync_config: Configuration object for syncing. See train.SyncConfig.
        verbose: 0, 1, or 2. Verbosity mode.
            0 = silent, 1 = default, 2 = verbose. Defaults to 1.
            If the ``RAY_AIR_NEW_OUTPUT=1`` environment variable is set,
            uses the old verbosity settings:
            0 = silent, 1 = only status updates, 2 = status and brief
            results, 3 = status and detailed results.
        stop: Stop conditions to consider. Refer to ray.tune.stopper.Stopper
            for more info. Stoppers should be serializable.
        callbacks: [DeveloperAPI] Callbacks to invoke.
            Refer to ray.tune.callback.Callback for more info.
            Callbacks should be serializable.
            Currently only stateless callbacks are supported for resumed runs.
            (any state of the callback will not be checkpointed by Tune
            and thus will not take effect in resumed runs).
        progress_reporter: [DeveloperAPI] Progress reporter for reporting
            intermediate experiment progress. Defaults to CLIReporter if
            running in command-line, or JupyterNotebookReporter if running in
            a Jupyter notebook.
        log_to_file: [DeveloperAPI] Log stdout and stderr to files in
            trial directories. If this is `False` (default), no files
            are written. If `true`, outputs are written to `trialdir/stdout`
            and `trialdir/stderr`, respectively. If this is a single string,
            this is interpreted as a file relative to the trialdir, to which
            both streams are written. If this is a Sequence (e.g. a Tuple),
            it has to have length 2 and the elements indicate the files to
            which stdout and stderr are written, respectively.

    """

    name: Optional[str] = None
    storage_path: Optional[str] = None
    storage_filesystem: Optional[pyarrow.fs.FileSystem] = None
    failure_config: Optional[FailureConfig] = None
    checkpoint_config: Optional[CheckpointConfig] = None
    sync_config: Optional["SyncConfig"] = None
    verbose: Optional[Union[int, "AirVerbosity", "Verbosity"]] = None
    stop: Optional[Union[Mapping, "Stopper", Callable[[str, Mapping], bool]]] = None
    callbacks: Optional[List["Callback"]] = None
    progress_reporter: Optional[
        "ray.tune.progress_reporter.ProgressReporter"  # noqa: F821
    ] = None
    log_to_file: Union[bool, str, Tuple[str, str]] = False

    # Deprecated: use `storage_path` instead. Kept so that passing it still
    # triggers the explicit DeprecationWarning in `__post_init__`.
    local_dir: Optional[str] = None

    def __post_init__(self):
        # Imported here (not at module level) to avoid circular imports.
        from ray.train import SyncConfig
        from ray.train.constants import DEFAULT_STORAGE_PATH
        from ray.tune.experimental.output import AirVerbosity, get_air_verbosity

        if self.local_dir is not None:
            raise DeprecationWarning(
                "The `RunConfig(local_dir)` argument is deprecated. "
                "You should set the `RunConfig(storage_path)` instead."
                "See the docs: https://docs.ray.io/en/latest/train/user-guides/"
                "persistent-storage.html#setting-the-local-staging-directory"
            )

        if self.storage_path is None:
            # TODO(justinvyu): [Deprecated] Remove in 2.30
            self.storage_path = DEFAULT_STORAGE_PATH

            # If no remote path is set, try to get Ray Storage URI
            ray_storage_uri: Optional[str] = _get_storage_uri()
            if ray_storage_uri is not None:
                logger.info(
                    "Using configured Ray Storage URI as the `storage_path`: "
                    f"{ray_storage_uri}"
                )
                self.storage_path = ray_storage_uri

        # Fill in default sub-configs for anything the user left unset.
        if not self.failure_config:
            self.failure_config = FailureConfig()

        if not self.sync_config:
            self.sync_config = SyncConfig()

        if not self.checkpoint_config:
            self.checkpoint_config = CheckpointConfig()

        if self.verbose is None:
            # Default `verbose` value. For new output engine,
            # this is AirVerbosity.DEFAULT.
            # For old output engine, this is Verbosity.V3_TRIAL_DETAILS
            # Todo (krfricke): Currently uses number to pass test_configs::test_repr
            self.verbose = get_air_verbosity(AirVerbosity.DEFAULT) or 3

        # Normalize pathlib paths to plain POSIX strings.
        if isinstance(self.storage_path, Path):
            self.storage_path = self.storage_path.as_posix()

    def __repr__(self):
        # Imported here (not at module level) to avoid circular imports.
        from ray.train import SyncConfig

        return _repr_dataclass(
            self,
            default_values={
                "failure_config": FailureConfig(),
                "sync_config": SyncConfig(),
                "checkpoint_config": CheckpointConfig(),
            },
        )

    def _repr_html_(self) -> str:
        """Render this run config and its sub-configs as HTML for notebooks."""
        reprs = []
        if self.failure_config is not None:
            reprs.append(
                Template("title_data_mini.html.j2").render(
                    title="Failure Config", data=self.failure_config._repr_html_()
                )
            )
        if self.sync_config is not None:
            reprs.append(
                Template("title_data_mini.html.j2").render(
                    title="Sync Config", data=self.sync_config._repr_html_()
                )
            )
        if self.checkpoint_config is not None:
            reprs.append(
                Template("title_data_mini.html.j2").render(
                    title="Checkpoint Config", data=self.checkpoint_config._repr_html_()
                )
            )

        # Create a divider between each displayed repr
        subconfigs = [Template("divider.html.j2").render()] * (2 * len(reprs) - 1)
        subconfigs[::2] = reprs

        settings = Template("scrollableTable.html.j2").render(
            table=tabulate(
                {
                    "Name": self.name,
                    "Local results directory": self.local_dir,
                    "Verbosity": self.verbose,
                    "Log to file": self.log_to_file,
                }.items(),
                tablefmt="html",
                headers=["Setting", "Value"],
                showindex=False,
            ),
            max_height="300px",
        )

        return Template("title_data.html.j2").render(
            title="RunConfig",
            data=Template("run_config.html.j2").render(
                subconfigs=subconfigs,
                settings=settings,
            ),
        )
.venv/lib/python3.11/site-packages/ray/air/constants.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Constants shared across Ray AIR, Train, and Tune.

NOTE(review): several values below name on-disk files or reported result
keys; changing them changes persisted formats — verify consumers first.
"""

# Key to denote the preprocessor in the checkpoint dict.
PREPROCESSOR_KEY = "_preprocessor"

# Key to denote the model in the checkpoint dict.
MODEL_KEY = "model"

# Key to denote which dataset is the evaluation dataset.
# Only used in trainers which do not support multiple
# evaluation datasets.
EVALUATION_DATASET_KEY = "evaluation"

# Key to denote which dataset is the training dataset.
# This is the dataset that the preprocessor is fit on.
TRAIN_DATASET_KEY = "train"

# Name to use for the column when representing tensors in table format.
TENSOR_COLUMN_NAME = "__value__"

# The maximum length of strings returned by `__repr__` for AIR objects constructed with
# default values.
MAX_REPR_LENGTH = int(80 * 1.5)

# Timeout used when putting exceptions raised by runner thread into the queue.
_ERROR_REPORT_TIMEOUT = 10

# Timeout when fetching new results after signaling the training function to continue.
_RESULT_FETCH_TIMEOUT = 0.2

# Timeout for fetching exceptions raised by the training function.
_ERROR_FETCH_TIMEOUT = 1

# The key used to identify whether we have already warned about ray.air.session
# functions being used outside of the session
SESSION_MISUSE_LOG_ONCE_KEY = "air_warn_session_misuse"

# Name of attribute in Checkpoint storing current Tune ID for restoring
# training with Ray Train
CHECKPOINT_ID_ATTR = "_current_checkpoint_id"

# Name of the marker dropped by the Trainable. If a worker detects
# the presence of the marker in the trial dir, it will use lazy
# checkpointing.
LAZY_CHECKPOINT_MARKER_FILE = ".lazy_checkpoint_marker"


# The timestamp of when the result is generated.
# Default to when the result is processed by tune.
TIMESTAMP = "timestamp"

# (Auto-filled) Time in seconds this iteration took to run.
# This may be overridden to override the system-computed time difference.
TIME_THIS_ITER_S = "time_this_iter_s"

# (Auto-filled) The index of this training iteration.
TRAINING_ITERATION = "training_iteration"

# File that stores parameters of the trial.
EXPR_PARAM_FILE = "params.json"

# Pickle File that stores parameters of the trial.
EXPR_PARAM_PICKLE_FILE = "params.pkl"

# File that stores the progress of the trial.
EXPR_PROGRESS_FILE = "progress.csv"

# File that stores results of the trial.
EXPR_RESULT_FILE = "result.json"

# File that stores the pickled error file
EXPR_ERROR_PICKLE_FILE = "error.pkl"

# File that stores the error file
EXPR_ERROR_FILE = "error.txt"

# File that stores the checkpoint metadata
CHECKPOINT_TUNE_METADATA_FILE = ".tune_metadata"

# ==================================================
# Environment Variables
# ==================================================

# Integer value which if set will copy files in reported AIR directory
# checkpoints instead of moving them (if worker is on the same node as Trainable)
COPY_DIRECTORY_CHECKPOINTS_INSTEAD_OF_MOVING_ENV = (
    "TRAIN_COPY_DIRECTORY_CHECKPOINTS_INSTEAD_OF_MOVING"
)

# NOTE: When adding a new environment variable, please track it in this list.
# TODO(ml-team): Most env var constants should get moved here.
AIR_ENV_VARS = {
    COPY_DIRECTORY_CHECKPOINTS_INSTEAD_OF_MOVING_ENV,
    "RAY_AIR_FULL_TRACEBACKS",
    "RAY_AIR_NEW_OUTPUT",
}
.venv/lib/python3.11/site-packages/ray/air/data_batch_type.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
from typing import TYPE_CHECKING, Dict, Union

if TYPE_CHECKING:
    import numpy
    import pandas  # noqa: F401
    import pyarrow

# TODO de-dup with ray.data.block.DataBatch
# A user-facing data batch: a single tensor, an Arrow table, a pandas
# DataFrame, or a dict of named numpy arrays.
# Fix: the original read `"pyarrow.Table" "pandas.DataFrame"` (no comma), so
# implicit string-literal concatenation silently fused the two members into
# the bogus forward reference "pyarrow.Tablepandas.DataFrame".
DataBatchType = Union[
    "numpy.ndarray",
    "pyarrow.Table",
    "pandas.DataFrame",
    Dict[str, "numpy.ndarray"],
]
.venv/lib/python3.11/site-packages/ray/air/execution/__init__.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Public entry points of Ray AIR's execution resource management."""

from ray.air.execution.resources.fixed import FixedResourceManager
from ray.air.execution.resources.placement_group import PlacementGroupResourceManager
from ray.air.execution.resources.request import AcquiredResources, ResourceRequest
from ray.air.execution.resources.resource_manager import ResourceManager

__all__ = [
    "ResourceRequest",
    "AcquiredResources",
    "ResourceManager",
    "FixedResourceManager",
    "PlacementGroupResourceManager",
]
.venv/lib/python3.11/site-packages/ray/air/execution/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (679 Bytes). View file
 
.venv/lib/python3.11/site-packages/ray/air/execution/resources/__init__.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Resource backends and request primitives for AIR execution."""

from ray.air.execution.resources.fixed import FixedResourceManager
from ray.air.execution.resources.placement_group import PlacementGroupResourceManager
from ray.air.execution.resources.request import AcquiredResources, ResourceRequest
from ray.air.execution.resources.resource_manager import ResourceManager

__all__ = [
    "ResourceRequest",
    "AcquiredResources",
    "ResourceManager",
    "FixedResourceManager",
    "PlacementGroupResourceManager",
]
.venv/lib/python3.11/site-packages/ray/air/execution/resources/__pycache__/fixed.cpython-311.pyc ADDED
Binary file (7.71 kB). View file
 
.venv/lib/python3.11/site-packages/ray/air/execution/resources/__pycache__/placement_group.cpython-311.pyc ADDED
Binary file (10.7 kB). View file
 
.venv/lib/python3.11/site-packages/ray/air/execution/resources/__pycache__/request.cpython-311.pyc ADDED
Binary file (12.5 kB). View file
 
.venv/lib/python3.11/site-packages/ray/air/execution/resources/__pycache__/resource_manager.cpython-311.pyc ADDED
Binary file (7.74 kB). View file
 
.venv/lib/python3.11/site-packages/ray/air/execution/resources/fixed.py ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ from typing import Dict, List, Optional
3
+
4
+ import ray
5
+ from ray import LOCAL_MODE, SCRIPT_MODE
6
+ from ray.air.execution.resources.request import (
7
+ AcquiredResources,
8
+ RemoteRayEntity,
9
+ ResourceRequest,
10
+ )
11
+ from ray.air.execution.resources.resource_manager import ResourceManager
12
+ from ray.util.annotations import DeveloperAPI
13
+
14
# Avoid numerical errors by multiplying and subtracting with this number.
# Compare: 0.99 - 0.33 = 0.65999... vs (0.99 * 1000 - 0.33 * 1000) / 1000 = 0.66
# Used by FixedResourceManager when subtracting fractional resources.
_DIGITS = 100000
17
+
18
+
19
@DeveloperAPI
@dataclass
class FixedAcquiredResources(AcquiredResources):
    """Acquired resources handed out by a ``FixedResourceManager``."""

    bundles: List[Dict[str, float]]

    def _annotate_remote_entity(
        self, entity: RemoteRayEntity, bundle: Dict[str, float], bundle_index: int
    ) -> RemoteRayEntity:
        # CPU, GPU, and memory map onto dedicated `options()` keywords;
        # every remaining entry is passed through as a custom resource.
        custom_resources = {
            key: amount
            for key, amount in bundle.items()
            if key not in ("CPU", "GPU", "memory")
        }
        return entity.options(
            num_cpus=bundle.get("CPU", 0),
            num_gpus=bundle.get("GPU", 0),
            memory=bundle.get("memory", 0.0),
            resources=custom_resources,
        )
38
+
39
+
40
@DeveloperAPI
class FixedResourceManager(ResourceManager):
    """Fixed budget based resource manager.

    This resource manager keeps track of a fixed set of resources. When resources
    are acquired, they are subtracted from the budget. When resources are freed,
    they are added back to the budget.

    The resource manager still requires resources to be requested before they become
    available. However, because the resource requests are virtual, this will not
    trigger autoscaling.

    Additionally, resources are not reserved on request, only on acquisition. Thus,
    acquiring a resource can change the availability of other requests. Note that
    this behavior may be changed in future implementations.

    The fixed resource manager does not support placement strategies. Using
    ``STRICT_SPREAD`` will result in an error. ``STRICT_PACK`` will succeed only
    within a placement group bundle. All other placement group arguments will be
    ignored.

    Args:
        total_resources: Budget of resources to manage. Defaults to all available
            resources in the current task or all cluster resources (if outside a task).

    """

    # Wrapper class handed out by ``acquire_resources()``.
    _resource_cls: AcquiredResources = FixedAcquiredResources

    def __init__(self, total_resources: Optional[Dict[str, float]] = None):
        rtc = ray.get_runtime_context()

        if not total_resources:
            # Outside of a Ray task/actor the budget is the whole cluster;
            # inside one it is only the resources assigned to that task/actor.
            if rtc.worker.mode in {None, SCRIPT_MODE, LOCAL_MODE}:
                total_resources = ray.cluster_resources()
            else:
                total_resources = rtc.get_assigned_resources()

        # If we are in a placement group, all of our resources will be in a bundle
        # and thus fulfill requirements of STRICT_PACK - but only if child tasks
        # are captured by the pg.
        self._allow_strict_pack = (
            ray.util.get_current_placement_group() is not None
            and rtc.should_capture_child_tasks_in_placement_group
        )

        self._total_resources = total_resources
        # Requests are virtual: tracked, but not subtracted from the budget.
        self._requested_resources = []
        # Only acquired requests count against the budget.
        self._used_resources = []

    @property
    def _available_resources(self) -> Dict[str, float]:
        # Budget minus everything currently acquired.
        available_resources = self._total_resources.copy()

        for used_resources in self._used_resources:
            all_resources = used_resources.required_resources
            for k, v in all_resources.items():
                # Scaled arithmetic (see _DIGITS) so fractional resources do
                # not accumulate float rounding errors over many cycles.
                available_resources[k] = (
                    available_resources[k] * _DIGITS - v * _DIGITS
                ) / _DIGITS
        return available_resources

    def request_resources(self, resource_request: ResourceRequest):
        # STRICT_SPREAD can never be satisfied by a single fixed budget;
        # STRICT_PACK only when we already live inside a capturing PG bundle.
        if resource_request.strategy == "STRICT_SPREAD" or (
            not self._allow_strict_pack and resource_request.strategy == "STRICT_PACK"
        ):
            raise RuntimeError(
                f"Requested a resource with placement strategy "
                f"{resource_request.strategy}, but this cannot be fulfilled by a "
                f"FixedResourceManager. In a nested setting, please set the inner "
                f"placement strategy to be less restrictive (i.e. no STRICT_ strategy)."
            )

        self._requested_resources.append(resource_request)

    def cancel_resource_request(self, resource_request: ResourceRequest):
        # Raises ValueError if the request was never made (list.remove).
        self._requested_resources.remove(resource_request)

    def has_resources_ready(self, resource_request: ResourceRequest) -> bool:
        # Resources must have been requested first, even though requests
        # are virtual for this manager.
        if resource_request not in self._requested_resources:
            return False

        available_resources = self._available_resources
        all_resources = resource_request.required_resources
        for k, v in all_resources.items():
            if available_resources.get(k, 0.0) < v:
                return False
        return True

    def acquire_resources(
        self, resource_request: ResourceRequest
    ) -> Optional[AcquiredResources]:
        # Returns None (rather than raising) when the budget cannot cover
        # the request right now.
        if not self.has_resources_ready(resource_request):
            return None

        self._used_resources.append(resource_request)
        return self._resource_cls(
            bundles=resource_request.bundles, resource_request=resource_request
        )

    def free_resources(self, acquired_resource: AcquiredResources):
        # Removing the request from the used list implicitly restores the
        # budget (see `_available_resources`).
        resources = acquired_resource.resource_request
        self._used_resources.remove(resources)

    def clear(self):
        # Reset internal state
        self._requested_resources = []
        self._used_resources = []
.venv/lib/python3.11/site-packages/ray/air/execution/resources/placement_group.py ADDED
@@ -0,0 +1,214 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ from collections import defaultdict
3
+ from dataclasses import dataclass
4
+ from typing import Dict, List, Optional, Set
5
+
6
+ import ray
7
+ from ray.air.execution.resources.request import (
8
+ AcquiredResources,
9
+ RemoteRayEntity,
10
+ ResourceRequest,
11
+ )
12
+ from ray.air.execution.resources.resource_manager import ResourceManager
13
+ from ray.util.annotations import DeveloperAPI
14
+ from ray.util.placement_group import PlacementGroup, remove_placement_group
15
+ from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
16
+
17
+
18
@DeveloperAPI
@dataclass
class PlacementGroupAcquiredResources(AcquiredResources):
    """Acquired resources backed by a CREATED placement group."""

    placement_group: PlacementGroup

    def _annotate_remote_entity(
        self, entity: RemoteRayEntity, bundle: Dict[str, float], bundle_index: int
    ) -> RemoteRayEntity:
        # CPU, GPU, and memory map onto dedicated `options()` keywords;
        # every remaining entry is passed through as a custom resource.
        custom_resources = {
            key: amount
            for key, amount in bundle.items()
            if key not in ("CPU", "GPU", "memory")
        }
        # Pin the entity to the given bundle of this placement group and
        # capture its children so nested tasks share the reservation.
        strategy = PlacementGroupSchedulingStrategy(
            placement_group=self.placement_group,
            placement_group_bundle_index=bundle_index,
            placement_group_capture_child_tasks=True,
        )
        return entity.options(
            scheduling_strategy=strategy,
            num_cpus=bundle.get("CPU", 0),
            num_gpus=bundle.get("GPU", 0),
            memory=bundle.get("memory", 0.0),
            resources=custom_resources,
        )
42
+
43
+
44
@DeveloperAPI
class PlacementGroupResourceManager(ResourceManager):
    """Resource manager using placement groups as the resource backend.

    This manager will use placement groups to fulfill resource requests. Requesting
    a resource will schedule the placement group. Acquiring a resource will
    return a ``PlacementGroupAcquiredResources`` that can be used to schedule
    Ray tasks and actors on the placement group. Freeing an acquired resource
    will destroy the associated placement group.

    Ray core does not emit events when resources are available. Instead, the
    scheduling state has to be periodically updated.

    Per default, placement group scheduling state is refreshed every time when
    resource state is inquired, but not more often than once every ``update_interval_s``
    seconds. Alternatively, staging futures can be retrieved (and awaited) with
    ``get_resource_futures()`` and state update can be force with ``update_state()``.

    Args:
        update_interval_s: Minimum interval in seconds between updating scheduling
            state of placement groups.

    """

    # Wrapper class handed out by ``acquire_resources()``.
    _resource_cls: AcquiredResources = PlacementGroupAcquiredResources

    def __init__(self, update_interval_s: float = 0.1):
        # Internally, the placement group lifecycle is like this:
        # - Resources are requested with ``request_resources()``
        # - A placement group is scheduled ("staged")
        # - A ``PlacementGroup.ready()`` future is scheduled ("staging future")
        # - We update the scheduling state when we need to
        #   (e.g. when ``has_resources_ready()`` is called)
        # - When staging futures resolve, a placement group is moved from "staging"
        #   to "ready"
        # - When a resource request is canceled, we remove a placement group from
        #   "staging". If there are not staged placement groups
        #   (because they are already "ready"), we remove one from "ready" instead.
        # - When a resource is acquired, the pg is removed from "ready" and moved
        #   to "acquired"
        # - When a resource is freed, the pg is removed from "acquired" and destroyed

        # Mapping of placement group to request
        self._pg_to_request: Dict[PlacementGroup, ResourceRequest] = {}

        # PGs that are staged but not "ready", yet (i.e. not CREATED)
        self._request_to_staged_pgs: Dict[
            ResourceRequest, Set[PlacementGroup]
        ] = defaultdict(set)

        # PGs that are CREATED and can be used by tasks and actors
        self._request_to_ready_pgs: Dict[
            ResourceRequest, Set[PlacementGroup]
        ] = defaultdict(set)

        # Staging futures used to update internal state.
        # We keep a double mapping here for better lookup efficiency.
        self._staging_future_to_pg: Dict[ray.ObjectRef, PlacementGroup] = dict()
        self._pg_to_staging_future: Dict[PlacementGroup, ray.ObjectRef] = dict()

        # Set of acquired PGs. We keep track of these here to make sure we
        # only free PGs that this manager managed.
        self._acquired_pgs: Set[PlacementGroup] = set()

        # Minimum time between updates of the internal state
        self.update_interval_s = update_interval_s
        # Backdate so the very first state inquiry triggers an update.
        self._last_update = time.monotonic() - self.update_interval_s - 1

    def get_resource_futures(self) -> List[ray.ObjectRef]:
        """Return the pending ``pg.ready()`` futures of all staged PGs."""
        return list(self._staging_future_to_pg.keys())

    def _maybe_update_state(self):
        # Throttle: only refresh once per ``update_interval_s`` window.
        now = time.monotonic()
        if now > self._last_update + self.update_interval_s:
            self.update_state()

    def update_state(self):
        """Move placement groups whose futures resolved from staged to ready."""
        # timeout=0 makes this a non-blocking poll of the staging futures.
        ready, not_ready = ray.wait(
            list(self._staging_future_to_pg.keys()),
            num_returns=len(self._staging_future_to_pg),
            timeout=0,
        )
        for future in ready:
            # Remove staging future
            pg = self._staging_future_to_pg.pop(future)
            self._pg_to_staging_future.pop(pg)
            # Fetch resource request
            request = self._pg_to_request[pg]
            # Remove from staging, add to ready
            self._request_to_staged_pgs[request].remove(pg)
            self._request_to_ready_pgs[request].add(pg)
        self._last_update = time.monotonic()

    def request_resources(self, resource_request: ResourceRequest):
        """Schedule a placement group for the request and stage it."""
        pg = resource_request.to_placement_group()
        self._pg_to_request[pg] = resource_request
        self._request_to_staged_pgs[resource_request].add(pg)

        future = pg.ready()
        self._staging_future_to_pg[future] = pg
        self._pg_to_staging_future[pg] = future

    def cancel_resource_request(self, resource_request: ResourceRequest):
        """Cancel one staged (or ready) placement group for this request.

        Raises:
            RuntimeError: If no placement group was staged or is ready for
                this request (i.e. more cancellations than requests).
        """
        if self._request_to_staged_pgs[resource_request]:
            pg = self._request_to_staged_pgs[resource_request].pop()

            # PG was staging
            future = self._pg_to_staging_future.pop(pg)
            self._staging_future_to_pg.pop(future)

            # Cancel the pg.ready task.
            # Otherwise, it will be pending node assignment forever.
            ray.cancel(future)
        else:
            # PG might be ready.
            # Fix: ``set.pop()`` raises ``KeyError`` on an empty set, so the
            # previous ``if not pg`` check after the pop was unreachable dead
            # code (a PlacementGroup instance is also always truthy). Catch
            # the ``KeyError`` to raise the intended, informative error.
            try:
                pg = self._request_to_ready_pgs[resource_request].pop()
            except KeyError as exc:
                raise RuntimeError(
                    "Cannot cancel resource request: No placement group was "
                    f"staged or is ready. Make sure to not cancel more resource "
                    f"requests than you've created. Request: {resource_request}"
                ) from exc

        self._pg_to_request.pop(pg)
        # Use the module-level import for consistency with the rest of the
        # class (same function as ``ray.util.remove_placement_group``).
        remove_placement_group(pg)

    def has_resources_ready(self, resource_request: ResourceRequest) -> bool:
        """Return True if a CREATED placement group exists for the request."""
        if not self._request_to_ready_pgs[resource_request]:
            # Only update state if needed (throttled by update_interval_s).
            self._maybe_update_state()

        return bool(self._request_to_ready_pgs[resource_request])

    def acquire_resources(
        self, resource_request: ResourceRequest
    ) -> Optional[PlacementGroupAcquiredResources]:
        """Take ownership of a ready placement group, or return None."""
        if not self.has_resources_ready(resource_request):
            return None

        pg = self._request_to_ready_pgs[resource_request].pop()
        self._acquired_pgs.add(pg)

        return self._resource_cls(placement_group=pg, resource_request=resource_request)

    def free_resources(self, acquired_resource: PlacementGroupAcquiredResources):
        """Destroy the placement group backing ``acquired_resource``."""
        pg = acquired_resource.placement_group

        self._acquired_pgs.remove(pg)
        remove_placement_group(pg)
        self._pg_to_request.pop(pg)

    def clear(self):
        """Remove every managed placement group and reset internal state."""
        if not ray.is_initialized():
            # Nothing to clean up (e.g. during interpreter shutdown).
            return

        for staged_pgs in self._request_to_staged_pgs.values():
            for staged_pg in staged_pgs:
                remove_placement_group(staged_pg)

        for ready_pgs in self._request_to_ready_pgs.values():
            for ready_pg in ready_pgs:
                remove_placement_group(ready_pg)

        for acquired_pg in self._acquired_pgs:
            remove_placement_group(acquired_pg)

        # Reset internal state
        self.__init__(update_interval_s=self.update_interval_s)

    def __del__(self):
        # Best-effort cleanup of any placement groups still alive.
        self.clear()
.venv/lib/python3.11/site-packages/ray/air/execution/resources/request.py ADDED
@@ -0,0 +1,255 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import abc
2
+ import json
3
+ from copy import deepcopy
4
+ from dataclasses import dataclass
5
+ from inspect import signature
6
+ from typing import Dict, List, Union
7
+
8
+ import ray
9
+ from ray.util import placement_group
10
+ from ray.util.annotations import DeveloperAPI
11
+
12
+ RemoteRayEntity = Union[ray.remote_function.RemoteFunction, ray.actor.ActorClass]
13
+
14
+
15
+ def _sum_bundles(bundles: List[Dict[str, float]]) -> Dict[str, float]:
16
+ """Sum all resources in a list of resource bundles.
17
+
18
+ Args:
19
+ bundles: List of resource bundles.
20
+
21
+ Returns: Dict containing all resources summed up.
22
+ """
23
+ resources = {}
24
+ for bundle in bundles:
25
+ for k, v in bundle.items():
26
+ resources[k] = resources.get(k, 0) + v
27
+ return resources
28
+
29
+
30
+ @DeveloperAPI
31
+ class ResourceRequest:
32
+ """Request for resources.
33
+
34
+ This class is used to define a resource request. A resource request comprises one
35
+ or more bundles of resources and instructions on the scheduling behavior.
36
+
37
+ The resource request can be submitted to a resource manager, which will
38
+ schedule the resources. Depending on the resource backend, this may instruct
39
+ Ray to scale up (autoscaling).
40
+
41
+ Resource requests are compatible with the most fine-grained low-level resource
42
+ backend, which are Ray placement groups.
43
+
44
+ Args:
45
+ bundles: A list of bundles which represent the resources requirements.
46
+ E.g. ``[{"CPU": 1, "GPU": 1}]``.
47
+ strategy: The scheduling strategy to acquire the bundles.
48
+
49
+ - "PACK": Packs Bundles into as few nodes as possible.
50
+ - "SPREAD": Places Bundles across distinct nodes as even as possible.
51
+ - "STRICT_PACK": Packs Bundles into one node. The group is
52
+ not allowed to span multiple nodes.
53
+ - "STRICT_SPREAD": Packs Bundles across distinct nodes.
54
+ *args: Passed to the call of ``placement_group()``, if applicable.
55
+ **kwargs: Passed to the call of ``placement_group()``, if applicable.
56
+
57
+ """
58
+
59
    def __init__(
        self,
        bundles: List[Dict[str, Union[int, float]]],
        strategy: str = "PACK",
        *args,
        **kwargs,
    ):
        if not bundles:
            raise ValueError("Cannot initialize a ResourceRequest with zero bundles.")

        # Remove empty resource keys
        self._bundles = [
            {k: float(v) for k, v in bundle.items() if v != 0} for bundle in bundles
        ]

        # Check if the head bundle is empty (no resources defined or all resources
        # are 0 (and thus removed in the previous step)
        if not self._bundles[0]:
            # This is when the head bundle doesn't need resources.
            self._head_bundle_is_empty = True
            self._bundles.pop(0)

            if not self._bundles:
                raise ValueError(
                    "Cannot initialize a ResourceRequest with an empty head "
                    "and zero worker bundles."
                )
        else:
            self._head_bundle_is_empty = False

        self._strategy = strategy
        # Extra positional/keyword args are forwarded to `placement_group()`
        # and validated against its signature in `_bind()`.
        self._args = args
        self._kwargs = kwargs

        # Caches: lazily computed hash and the bound `placement_group()`
        # signature (set by `_bind()` below).
        self._hash = None
        self._bound = None

        self._bind()
97
+
98
    @property
    def head_bundle_is_empty(self) -> bool:
        """Returns True if head bundle is empty while child bundles
        need resources.

        This is considered an internal API within Tune.
        """
        return self._head_bundle_is_empty
106
+
107
+ @property
108
+ @DeveloperAPI
109
+ def head_cpus(self) -> float:
110
+ """Returns the number of cpus in the head bundle."""
111
+ return 0.0 if self._head_bundle_is_empty else self._bundles[0].get("CPU", 0.0)
112
+
113
    @property
    @DeveloperAPI
    def bundles(self) -> List[Dict[str, float]]:
        """Returns a deep copy of resource bundles"""
        # Copy so callers can mutate the returned bundles without
        # corrupting this request's internal state.
        return deepcopy(self._bundles)
118
+
119
    @property
    def required_resources(self) -> Dict[str, float]:
        """Returns a dict containing the sums of all resources"""
        # Delegates to the module-level helper to aggregate resource amounts
        # across all bundles (helper defined elsewhere in this module).
        return _sum_bundles(self._bundles)
123
+
124
    @property
    @DeveloperAPI
    def strategy(self) -> str:
        """Returns the placement strategy (e.g. "PACK" or "SPREAD")."""
        return self._strategy
129
+
130
+ def _bind(self):
131
+ """Bind the args and kwargs to the `placement_group()` signature.
132
+
133
+ We bind the args and kwargs, so we can compare equality of two resource
134
+ requests. The main reason for this is that the `placement_group()` API
135
+ can evolve independently from the ResourceRequest API (e.g. adding new
136
+ arguments). Then, `ResourceRequest(bundles, strategy, arg=arg)` should
137
+ be the same as `ResourceRequest(bundles, strategy, arg)`.
138
+ """
139
+ sig = signature(placement_group)
140
+ try:
141
+ self._bound = sig.bind(
142
+ self._bundles, self._strategy, *self._args, **self._kwargs
143
+ )
144
+ except Exception as exc:
145
+ raise RuntimeError(
146
+ "Invalid definition for resource request. Please check "
147
+ "that you passed valid arguments to the ResourceRequest "
148
+ "object."
149
+ ) from exc
150
+
151
    def to_placement_group(self):
        """Create a Ray placement group from the bound request arguments."""
        return placement_group(*self._bound.args, **self._bound.kwargs)
153
+
154
+ def __eq__(self, other: "ResourceRequest"):
155
+ return (
156
+ isinstance(other, ResourceRequest)
157
+ and self._bound == other._bound
158
+ and self.head_bundle_is_empty == other.head_bundle_is_empty
159
+ )
160
+
161
+ def __hash__(self):
162
+ if not self._hash:
163
+ # Cache hash
164
+ self._hash = hash(
165
+ json.dumps(
166
+ {"args": self._bound.args, "kwargs": self._bound.kwargs},
167
+ sort_keys=True,
168
+ indent=0,
169
+ ensure_ascii=True,
170
+ )
171
+ )
172
+ return self._hash
173
+
174
+ def __getstate__(self):
175
+ state = self.__dict__.copy()
176
+ state.pop("_hash", None)
177
+ state.pop("_bound", None)
178
+ return state
179
+
180
    def __setstate__(self, state):
        """Restore pickled state and rebuild derived attributes."""
        self.__dict__.update(state)
        # The cached hash and bound signature were dropped in __getstate__;
        # reset them and re-bind against the placement_group() signature.
        self._hash = None
        self._bound = None
        self._bind()
185
+
186
+ def __repr__(self) -> str:
187
+ return (
188
+ f"<ResourceRequest (_bound={self._bound}, "
189
+ f"head_bundle_is_empty={self.head_bundle_is_empty})>"
190
+ )
191
+
192
+
193
@DeveloperAPI
@dataclass
class AcquiredResources(abc.ABC):
    """Base class for resources that have been acquired.

    Acquired resources can be associated to Ray objects, which can then be
    scheduled using these resources.

    Internally this can point e.g. to a placement group, a placement
    group bundle index, or just raw resources.

    The main API is the `annotate_remote_entities` method. This will associate
    remote Ray objects (tasks and actors) with the acquired resources by setting
    the Ray remote options to use the acquired resources.
    """

    # The request these resources were acquired for.
    resource_request: ResourceRequest

    def annotate_remote_entities(
        self, entities: List[RemoteRayEntity]
    ) -> List[Union[RemoteRayEntity]]:
        """Return remote ray entities (tasks/actors) to use the acquired resources.

        Entities are matched to bundles positionally: the first entity uses
        the first bundle, the second entity the second bundle, and so on.

        Args:
            entities: Remote Ray entities to annotate with the acquired resources.

        Raises:
            RuntimeError: If more entities are passed than bundles are available.
        """
        bundles = self.resource_request.bundles
        has_empty_head = self.resource_request.head_bundle_is_empty

        # An empty head bundle still counts towards the bundle total.
        num_bundles = len(bundles) + int(has_empty_head)

        if len(entities) > num_bundles:
            raise RuntimeError(
                f"The number of callables to annotate ({len(entities)}) cannot "
                f"exceed the number of available bundles ({num_bundles})."
            )

        annotated = []

        if has_empty_head:
            # The empty head bundle occupies the first bundle index and
            # carries no resources.
            annotated.append(
                self._annotate_remote_entity(entities[0], {}, bundle_index=0)
            )
            # Remaining entities map onto the worker bundles.
            entities = entities[1:]

        annotated.extend(
            self._annotate_remote_entity(entity, bundle, bundle_index=index)
            for index, (entity, bundle) in enumerate(zip(entities, bundles))
        )

        return annotated

    def _annotate_remote_entity(
        self, entity: RemoteRayEntity, bundle: Dict[str, float], bundle_index: int
    ) -> RemoteRayEntity:
        # Backend-specific: subclasses bind the entity to the concrete
        # resource (e.g. a placement group bundle).
        raise NotImplementedError
.venv/lib/python3.11/site-packages/ray/air/execution/resources/resource_manager.py ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import abc
2
+ from typing import List, Optional
3
+
4
+ import ray
5
+ from ray.air.execution.resources.request import AcquiredResources, ResourceRequest
6
+ from ray.util.annotations import DeveloperAPI
7
+
8
+
9
@DeveloperAPI
class ResourceManager(abc.ABC):
    """Resource manager interface.

    A resource manager can be used to request resources from a Ray cluster and
    allocate them to remote Ray tasks or actors.

    Resources have to be requested before they can be acquired.

    Resources managed by the resource manager can be in three states:

    1. "Requested": The resources have been requested but are not yet available to
       schedule remote Ray objects. The resource request may trigger autoscaling,
       and can be cancelled if no longer needed.
    2. "Ready": The requested resources are now available to schedule remote Ray
       objects. They can be acquired and subsequently used by remote Ray objects.
       The resource request can still be cancelled if no longer needed.
    3. "Acquired": The resources have been acquired by a caller to use for scheduling
       remote Ray objects. Note that it is the responsibility of the caller to
       schedule the Ray objects with these resources.
       The associated resource request has been completed and can no longer be
       cancelled. The acquired resources can be freed by the resource manager when
       they are no longer used.

    The flow is as follows:

    .. code-block:: python

        # Create resource manager
        resource_manager = ResourceManager()

        # Create resource request
        resource_request = ResourceRequest([{"CPU": 4}])

        # Pass to resource manager
        resource_manager.request_resources(resource_request)

        # Wait until ready
        while not resource_manager.has_resources_ready(resource_request):
            time.sleep(1)

        # Once ready, acquire resources
        acquired_resource = resource_manager.acquire_resources(resource_request)

        # Bind to remote task or actor (returns one annotated entity per input)
        [annotated_remote_fn] = acquired_resource.annotate_remote_entities(
            [remote_fn])

        # Run remote function. This will use the acquired resources
        ray.get(annotated_remote_fn.remote())

        # After using the resources, free
        resource_manager.free_resources(acquired_resource)

    """

    def request_resources(self, resource_request: ResourceRequest) -> None:
        """Request resources.

        Depending on the backend, resources can trigger autoscaling. Requested
        resources can be ready or not ready. Once they are "ready", they can
        be acquired and used by remote Ray objects.

        Resource requests can be cancelled anytime using ``cancel_resource_request()``.
        Once acquired, the resource request is removed. Acquired resources can be
        freed with ``free_resources()``.
        """
        raise NotImplementedError

    def cancel_resource_request(self, resource_request: ResourceRequest) -> None:
        """Cancel resource request.

        Resource requests can be cancelled anytime before a resource is acquired.
        Acquiring a resource will remove the associated resource request.
        Acquired resources can be freed with ``free_resources()``.
        """
        raise NotImplementedError

    def has_resources_ready(self, resource_request: ResourceRequest) -> bool:
        """Returns True if resources for the given request are ready to be acquired."""
        raise NotImplementedError

    def acquire_resources(
        self, resource_request: ResourceRequest
    ) -> Optional[AcquiredResources]:
        """Acquire resources. Returns None if resources are not ready to be acquired.

        Acquiring resources will remove the associated resource request.
        Acquired resources can be returned with ``free_resources()``.
        """
        raise NotImplementedError

    def free_resources(self, acquired_resource: AcquiredResources) -> None:
        """Free acquired resources from usage and return them to the resource manager.

        Freeing resources will return the resources to the manager, but there are
        no guarantees about the tasks and actors scheduled on the resources. The caller
        should make sure that any references to tasks or actors scheduled on the
        resources have been removed before calling ``free_resources()``.
        """
        raise NotImplementedError

    def get_resource_futures(self) -> List[ray.ObjectRef]:
        """Return futures for resources to await.

        Depending on the backend, we use resource futures to determine availability
        of resources (e.g. placement groups) or resolution of requests.
        In this case, the futures can be awaited externally by the caller.

        When a resource future resolved, the caller may call ``update_state()``
        to force the resource manager to update its internal state immediately.
        """
        # Default: no futures. Backends that track futures override this.
        return []

    def update_state(self) -> None:
        """Update internal state of the resource manager.

        The resource manager may have internal state that needs periodic updating.
        For instance, depending on the backend, resource futures can be awaited
        externally (with ``get_resource_futures()``).

        If such a future resolved, the caller can instruct the resource
        manager to update its internal state immediately.
        """
        # Intentionally a no-op by default; not abstract so stateless
        # backends don't need to override it.
        pass

    def clear(self) -> None:
        """Reset internal state and clear all resources.

        Calling this method will reset the resource manager to its initialization state.
        All resources will be removed.

        Clearing the state will remove tracked resources from the manager, but there are
        no guarantees about the tasks and actors scheduled on the resources. The caller
        should make sure that any references to tasks or actors scheduled on the
        resources have been removed before calling ``clear()``.
        """
        raise NotImplementedError

    def __reduce__(self):
        """We disallow serialization.

        Shared resource managers should live on an actor.
        """
        raise ValueError(
            f"Resource managers cannot be serialized. Resource manager: {str(self)}"
        )
.venv/lib/python3.11/site-packages/ray/air/result.py ADDED
@@ -0,0 +1,283 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import io
2
+ import json
3
+ import logging
4
+ import os
5
+ from dataclasses import dataclass
6
+ from pathlib import Path
7
+ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
8
+
9
+ import pandas as pd
10
+ import pyarrow
11
+
12
+ import ray
13
+ from ray.air.constants import (
14
+ EXPR_ERROR_PICKLE_FILE,
15
+ EXPR_PROGRESS_FILE,
16
+ EXPR_RESULT_FILE,
17
+ )
18
+ from ray.util.annotations import PublicAPI
19
+
20
+ if TYPE_CHECKING:
21
+ from ray.train import Checkpoint
22
+
23
+ logger = logging.getLogger(__name__)
24
+
25
+
26
@PublicAPI(stability="stable")
@dataclass
class Result:
    """The final result of a ML training run or a Tune trial.

    This is the output produced by ``Trainer.fit``.
    ``Tuner.fit`` outputs a :class:`~ray.tune.ResultGrid` that is a collection
    of ``Result`` objects.

    This API is the recommended way to access the outputs such as:
    - checkpoints (``Result.checkpoint``)
    - the history of reported metrics (``Result.metrics_dataframe``, ``Result.metrics``)
    - errors encountered during a training run (``Result.error``)

    The constructor is a private API -- use ``Result.from_path`` to create a result
    object from a directory.

    Attributes:
        metrics: The latest set of reported metrics.
        checkpoint: The latest checkpoint.
        error: The execution error of the Trainable run, if the trial finishes in error.
        path: Path pointing to the result directory on persistent storage. This can
            point to a remote storage location (e.g. S3) or to a local location (path
            on the head node). The path is accessible via the result's associated
            `filesystem`. For instance, for a result stored in S3 at
            ``s3://bucket/location``, ``path`` will have the value ``bucket/location``.
        metrics_dataframe: The full result dataframe of the Trainable.
            The dataframe is indexed by iterations and contains reported
            metrics. Note that the dataframe columns are indexed with the
            *flattened* keys of reported metrics, so the format of this dataframe
            may be slightly different than ``Result.metrics``, which is an unflattened
            dict of the latest set of reported metrics.
        best_checkpoints: A list of tuples of the best checkpoints and
            their associated metrics. The number of
            saved checkpoints is determined by :class:`~ray.train.CheckpointConfig`
            (by default, all checkpoints will be saved).
    """

    metrics: Optional[Dict[str, Any]]
    checkpoint: Optional["Checkpoint"]
    error: Optional[Exception]
    path: str
    metrics_dataframe: Optional["pd.DataFrame"] = None
    best_checkpoints: Optional[List[Tuple["Checkpoint", Dict[str, Any]]]] = None
    # Filesystem used to access `path`; falls back to the local FS (see
    # `filesystem` property).
    _storage_filesystem: Optional[pyarrow.fs.FileSystem] = None
    # Attributes included in repr output, in display order.
    _items_to_repr = ["error", "metrics", "path", "filesystem", "checkpoint"]

    @property
    def config(self) -> Optional[Dict[str, Any]]:
        """The config associated with the result."""
        if not self.metrics:
            return None
        return self.metrics.get("config", None)

    @property
    def filesystem(self) -> pyarrow.fs.FileSystem:
        """Return the filesystem that can be used to access the result path.

        Returns:
            pyarrow.fs.FileSystem implementation.
        """
        # Default to the local filesystem if no custom filesystem was set.
        return self._storage_filesystem or pyarrow.fs.LocalFileSystem()

    def _repr(self, indent: int = 0) -> str:
        """Construct the representation with specified number of space indent."""
        from ray.tune.experimental.output import BLACKLISTED_KEYS
        from ray.tune.result import AUTO_RESULT_KEYS

        shown_attributes = {k: getattr(self, k) for k in self._items_to_repr}
        # Show only the error type name, not the whole exception; drop the
        # entry entirely when there is no error.
        if self.error:
            shown_attributes["error"] = type(self.error).__name__
        else:
            shown_attributes.pop("error")

        shown_attributes["filesystem"] = shown_attributes["filesystem"].type_name

        if self.metrics:
            # Hide auto-populated / blacklisted bookkeeping keys from the repr.
            exclude = set(AUTO_RESULT_KEYS)
            exclude.update(BLACKLISTED_KEYS)
            shown_attributes["metrics"] = {
                k: v for k, v in self.metrics.items() if k not in exclude
            }

        cls_indent = " " * indent
        kws_indent = " " * (indent + 2)

        kws = [
            f"{kws_indent}{key}={value!r}" for key, value in shown_attributes.items()
        ]
        kws_repr = ",\n".join(kws)
        return "{0}{1}(\n{2}\n{0})".format(cls_indent, type(self).__name__, kws_repr)

    def __repr__(self) -> str:
        return self._repr(indent=0)

    @staticmethod
    def _read_file_as_str(
        storage_filesystem: pyarrow.fs.FileSystem,
        storage_path: str,
    ) -> str:
        """Opens a file as an input stream reading all byte content sequentially and
        decoding read bytes as utf-8 string.

        Args:
            storage_filesystem: The filesystem to use.
            storage_path: The source to open for reading.
        """

        with storage_filesystem.open_input_stream(storage_path) as f:
            return f.readall().decode()

    @classmethod
    def from_path(
        cls,
        path: Union[str, os.PathLike],
        storage_filesystem: Optional[pyarrow.fs.FileSystem] = None,
    ) -> "Result":
        """Restore a Result object from local or remote trial directory.

        Args:
            path: A path of a trial directory on local or remote storage
                (ex: s3://bucket/path or /tmp/ray_results).
            storage_filesystem: A custom filesystem to use. If not provided,
                this will be auto-resolved by pyarrow. If provided, the path
                is assumed to be prefix-stripped already, and must be a valid path
                on the filesystem.

        Returns:
            A :py:class:`Result` object of that trial.

        Raises:
            RuntimeError: If the trial directory does not exist, or contains
                neither a result JSON nor a progress CSV file.
        """
        # TODO(justinvyu): Fix circular dependency.
        from ray.train import Checkpoint
        from ray.train._internal.storage import (
            _exists_at_fs_path,
            _list_at_fs_path,
            get_fs_and_path,
        )
        from ray.train.constants import CHECKPOINT_DIR_NAME

        fs, fs_path = get_fs_and_path(path, storage_filesystem)
        if not _exists_at_fs_path(fs, fs_path):
            raise RuntimeError(f"Trial folder {fs_path} doesn't exist!")

        # Restore metrics from result.json
        # (one JSON dict per line, i.e. JSON Lines format).
        result_json_file = Path(fs_path, EXPR_RESULT_FILE).as_posix()
        progress_csv_file = Path(fs_path, EXPR_PROGRESS_FILE).as_posix()
        if _exists_at_fs_path(fs, result_json_file):
            lines = cls._read_file_as_str(fs, result_json_file).split("\n")
            json_list = [json.loads(line) for line in lines if line]
            # Nested metric dicts are flattened into "/"-separated columns.
            metrics_df = pd.json_normalize(json_list, sep="/")
            latest_metrics = json_list[-1] if json_list else {}
        # Fallback to restore from progress.csv
        elif _exists_at_fs_path(fs, progress_csv_file):
            metrics_df = pd.read_csv(
                io.StringIO(cls._read_file_as_str(fs, progress_csv_file))
            )
            latest_metrics = (
                metrics_df.iloc[-1].to_dict() if not metrics_df.empty else {}
            )
        else:
            raise RuntimeError(
                f"Failed to restore the Result object: Neither {EXPR_RESULT_FILE}"
                f" nor {EXPR_PROGRESS_FILE} exists in the trial folder!"
            )

        # Restore all checkpoints from the checkpoint folders.
        # Sorted lexicographically, which orders "checkpoint_<padded index>"
        # directories by checkpoint index.
        checkpoint_dir_names = sorted(
            _list_at_fs_path(
                fs,
                fs_path,
                file_filter=lambda file_info: file_info.type
                == pyarrow.fs.FileType.Directory
                and file_info.base_name.startswith("checkpoint_"),
            )
        )

        if checkpoint_dir_names:
            checkpoints = [
                Checkpoint(
                    path=Path(fs_path, checkpoint_dir_name).as_posix(), filesystem=fs
                )
                for checkpoint_dir_name in checkpoint_dir_names
            ]

            # Pair each checkpoint with the last metrics row that reported it.
            metrics = []
            for checkpoint_dir_name in checkpoint_dir_names:
                metrics_corresponding_to_checkpoint = metrics_df[
                    metrics_df[CHECKPOINT_DIR_NAME] == checkpoint_dir_name
                ]
                if metrics_corresponding_to_checkpoint.empty:
                    logger.warning(
                        "Could not find metrics corresponding to "
                        f"{checkpoint_dir_name}. These will default to an empty dict."
                    )
                metrics.append(
                    {}
                    if metrics_corresponding_to_checkpoint.empty
                    else metrics_corresponding_to_checkpoint.iloc[-1].to_dict()
                )

            latest_checkpoint = checkpoints[-1]
            # TODO(justinvyu): These are ordered by checkpoint index, since we don't
            # know the metric to order these with.
            best_checkpoints = list(zip(checkpoints, metrics))
        else:
            best_checkpoints = latest_checkpoint = None

        # Restore the trial error if it exists
        error = None
        error_file_path = Path(fs_path, EXPR_ERROR_PICKLE_FILE).as_posix()
        if _exists_at_fs_path(fs, error_file_path):
            with fs.open_input_stream(error_file_path) as f:
                error = ray.cloudpickle.load(f)

        return Result(
            metrics=latest_metrics,
            checkpoint=latest_checkpoint,
            path=fs_path,
            _storage_filesystem=fs,
            metrics_dataframe=metrics_df,
            best_checkpoints=best_checkpoints,
            error=error,
        )

    @PublicAPI(stability="alpha")
    def get_best_checkpoint(self, metric: str, mode: str) -> Optional["Checkpoint"]:
        """Get the best checkpoint from this trial based on a specific metric.

        Any checkpoints without an associated metric value will be filtered out.

        Args:
            metric: The key for checkpoints to order on.
            mode: One of ["min", "max"].

        Returns:
            :class:`Checkpoint <ray.train.Checkpoint>` object.

        Raises:
            RuntimeError: If the trial has no checkpoints, or if no checkpoint
                has a value for ``metric``.
            ValueError: If ``mode`` is not "min" or "max".
        """
        if not self.best_checkpoints:
            raise RuntimeError("No checkpoint exists in the trial directory!")

        if mode not in ["max", "min"]:
            raise ValueError(
                f'Unsupported mode: {mode}. Please choose from ["min", "max"]!'
            )

        op = max if mode == "max" else min
        # Keep only (checkpoint, metrics) pairs that actually report `metric`.
        valid_checkpoints = [
            ckpt_info for ckpt_info in self.best_checkpoints if metric in ckpt_info[1]
        ]

        if not valid_checkpoints:
            raise RuntimeError(
                f"Invalid metric name {metric}! "
                f"You may choose from the following metrics: {self.metrics.keys()}."
            )

        return op(valid_checkpoints, key=lambda x: x[1][metric])[0]
.venv/lib/python3.11/site-packages/ray/air/session.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from ray.train._internal.session import * # noqa: F401,F403
.venv/lib/python3.11/site-packages/ray/air/util/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/ray/air/util/tensor_extensions/pandas.py ADDED
@@ -0,0 +1,1451 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Adapted from
2
+ # https://github.com/CODAIT/text-extensions-for-pandas/blob/dc03278689fe1c5f131573658ae19815ba25f33e/text_extensions_for_pandas/array/tensor.py
3
+ # and
4
+ # https://github.com/CODAIT/text-extensions-for-pandas/blob/dc03278689fe1c5f131573658ae19815ba25f33e/text_extensions_for_pandas/array/arrow_conversion.py
5
+
6
+ #
7
+ # Copyright (c) 2020 IBM Corp.
8
+ # Licensed under the Apache License, Version 2.0 (the "License");
9
+ # you may not use this file except in compliance with the License.
10
+ # You may obtain a copy of the License at
11
+ #
12
+ # http://www.apache.org/licenses/LICENSE-2.0
13
+ #
14
+ # Unless required by applicable law or agreed to in writing, software
15
+ # distributed under the License is distributed on an "AS IS" BASIS,
16
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17
+ # See the License for the specific language governing permissions and
18
+ # limitations under the License.
19
+
20
+ # Modifications:
21
+ # - Added ArrowTensorType.to_pandas_type()
22
+ # - Added ArrowTensorArray.__getitem__()
23
+ # - Added ArrowTensorArray.__iter__()
24
+ # - Added support for column casts to extension types.
25
+ # - Fleshed out docstrings and examples.
26
+ # - Fixed TensorArray.isna() so it returns an appropriate ExtensionArray.
27
+ # - Added different (more vectorized) TensorArray.take() operation.
28
+ # - Added support for more reducers (agg funcs) to TensorArray.
29
+ # - Added support for logical operators to TensorArray(Element).
30
+ # - Added support for heterogeneously-shaped tensors.
31
+ # - Miscellaneous small bug fixes and optimizations.
32
+
33
+ import numbers
34
+ import os
35
+ from typing import Any, Callable, List, Optional, Sequence, Tuple, Union
36
+
37
+ import numpy as np
38
+ import pandas as pd
39
+ import pyarrow as pa
40
+ from packaging.version import Version
41
+ from pandas._typing import Dtype
42
+ from pandas.compat import set_function_name
43
+ from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries
44
+ from pandas.core.indexers import check_array_indexer, validate_indices
45
+
46
+ from ray.air.util.tensor_extensions.utils import (
47
+ _create_possibly_ragged_ndarray,
48
+ _is_ndarray_variable_shaped_tensor,
49
+ )
50
+ from ray.util.annotations import PublicAPI
51
+
52
+ try:
53
+ from pandas.core.dtypes.generic import ABCIndex
54
+ except ImportError:
55
+ # ABCIndexClass changed to ABCIndex in Pandas 1.3
56
+ from pandas.core.dtypes.generic import ABCIndexClass as ABCIndex
57
+
58
+
59
+ #############################################
60
+ # Begin patching of ExtensionArrayFormatter #
61
+ #############################################
62
+
63
+
64
def _format_strings_patched(self) -> List[str]:
    """Monkeypatched replacement for the pandas extension-array formatter.

    Intercepts formatting of ``TensorArray`` columns so multi-dimensional
    tensor elements render as nested string arrays; all other extension
    arrays are delegated to the stock implementation
    (``self._format_strings_orig``).
    """
    from pandas.core.construction import extract_array
    from pandas.io.formats.format import format_array

    if not isinstance(self.values, TensorArray):
        return self._format_strings_orig()

    values = extract_array(self.values, extract_numpy=True)
    array = np.asarray(values)

    # 1-D tensor columns need no special handling.
    if array.ndim == 1:
        return self._format_strings_orig()

    def _run_format_array(arr, fmt):
        # Forward this formatter's display options to pandas' format_array.
        return format_array(
            arr,
            fmt,
            float_format=self.float_format,
            na_rep=self.na_rep,
            digits=self.digits,
            space=self.space,
            justify=self.justify,
            decimal=self.decimal,
            leading_space=self.leading_space,
            quoting=self.quoting,
        )

    flat_formatter = self.formatter
    if flat_formatter is None:
        flat_formatter = values._formatter(boxed=True)

    # Format the flattened data, then restore the original shape, preserving
    # memory order (use ravel_compat in v1.3.0).
    flat = array.ravel("K")
    formatted_flat = np.asarray(_run_format_array(flat, flat_formatter))
    order = "F" if array.flags.f_contiguous else "C"
    formatted = formatted_flat.reshape(array.shape, order=order)

    # Format the array of nested strings with the default formatter.
    return _run_format_array(formatted, None)
104
+
105
+
106
def _format_strings_patched_v1_0_0(self) -> List[str]:
    """Monkeypatched extension-array formatter for TensorArray columns.

    Like ``_format_strings_patched``, but finishes with a slimmed-down
    string formatter instead of a second ``format_array`` pass, working
    around https://github.com/pandas-dev/pandas/issues/33770.
    Non-TensorArray columns are delegated to the stock implementation.
    """
    from functools import partial

    from pandas.core.construction import extract_array
    from pandas.io.formats.format import format_array
    from pandas.io.formats.printing import pprint_thing

    if not isinstance(self.values, TensorArray):
        return self._format_strings_orig()

    values = extract_array(self.values, extract_numpy=True)
    array = np.asarray(values)

    # 1-D tensor columns need no special handling.
    if array.ndim == 1:
        return self._format_strings_orig()

    def format_array_wrap(array_, formatter_):
        # Forward this formatter's display options to pandas' format_array.
        # NOTE: unlike the non-v1_0_0 variant, `quoting` is not forwarded here.
        fmt_values = format_array(
            array_,
            formatter_,
            float_format=self.float_format,
            na_rep=self.na_rep,
            digits=self.digits,
            space=self.space,
            justify=self.justify,
            decimal=self.decimal,
            leading_space=self.leading_space,
        )
        return fmt_values

    flat_formatter = self.formatter
    if flat_formatter is None:
        flat_formatter = values._formatter(boxed=True)

    # Flatten array, call function, reshape (use ravel_compat in v1.3.0)
    flat_array = array.ravel("K")
    fmt_flat_array = np.asarray(format_array_wrap(flat_array, flat_formatter))
    order = "F" if array.flags.f_contiguous else "C"
    fmt_array = fmt_flat_array.reshape(array.shape, order=order)

    # Slimmed down version of GenericArrayFormatter due to:
    # https://github.com/pandas-dev/pandas/issues/33770
    def format_strings_slim(array_, leading_space):
        formatter = partial(
            pprint_thing,
            escape_chars=("\t", "\r", "\n"),
        )

        def _format(x):
            return str(formatter(x))

        fmt_values = []
        for v in array_:
            # Prepend a space unless leading_space is explicitly False.
            tpl = "{v}" if leading_space is False else " {v}"
            fmt_values.append(tpl.format(v=_format(v)))
        return fmt_values

    return format_strings_slim(fmt_array, self.leading_space)
164
+
165
+
166
# Env var toggle letting users opt out of the formatter monkeypatch below.
_FORMATTER_ENABLED_ENV_VAR = "TENSOR_COLUMN_EXTENSION_FORMATTER_ENABLED"

if os.getenv(_FORMATTER_ENABLED_ENV_VAR, "1") == "1":
    # The pandas-internal formatter class was made private (renamed with a
    # leading underscore) in pandas 2.2.0.
    if Version(pd.__version__) < Version("2.2.0"):
        from pandas.io.formats.format import ExtensionArrayFormatter

        formatter_cls = ExtensionArrayFormatter
    else:
        from pandas.io.formats.format import _ExtensionArrayFormatter

        formatter_cls = _ExtensionArrayFormatter
    # Keep the original implementation around; the patched methods delegate
    # back to it for non-TensorArray values and 1-D arrays.
    formatter_cls._format_strings_orig = formatter_cls._format_strings
    if Version("1.1.0") <= Version(pd.__version__) < Version("1.3.0"):
        formatter_cls._format_strings = _format_strings_patched
    else:
        # NOTE(review): every version outside [1.1.0, 1.3.0) — including
        # pandas >= 1.3 — falls through to the v1.0.0-style patch; presumably
        # intentional, but confirm against newer pandas internals.
        formatter_cls._format_strings = _format_strings_patched_v1_0_0
    # Marker so other code can detect that the patch has been applied.
    formatter_cls._patched_by_ray_datasets = True

###########################################
# End patching of ExtensionArrayFormatter #
###########################################
187
+
188
+
189
@PublicAPI(stability="beta")
@pd.api.extensions.register_extension_dtype
class TensorDtype(pd.api.extensions.ExtensionDtype):
    """
    Pandas extension type for a column of homogeneous-typed tensors.

    This extension supports tensors in which the elements have different shapes.
    However, each tensor element must be non-ragged, i.e. each tensor element must
    have a well-defined, non-ragged shape.

    See:
    https://github.com/pandas-dev/pandas/blob/master/pandas/core/dtypes/base.py
    for up-to-date interface documentation and the subclassing contract.

    Examples:
        >>> # Create a DataFrame with a list of ndarrays as a column.
        >>> import pandas as pd
        >>> import numpy as np
        >>> import ray
        >>> df = pd.DataFrame({
        ...     "one": [1, 2, 3],
        ...     "two": list(np.arange(24).reshape((3, 2, 2, 2)))})
        >>> # Note the opaque np.object dtype for this column.
        >>> df.dtypes # doctest: +SKIP
        one     int64
        two    object
        dtype: object
        >>> # Cast column to our TensorDtype extension type; note the
        >>> # (shape, dtype) argument order.
        >>> from ray.data.extensions import TensorDtype
        >>> df["two"] = df["two"].astype(TensorDtype((3, 2, 2, 2), np.int64))
        >>> # The column dtype is now TensorDtype instead of np.object.
        >>> df.dtypes # doctest: +SKIP
        one                                          int64
        two    TensorDtype(shape=(3, 2, 2, 2), dtype=int64)
        dtype: object
        >>> # Pandas is now aware of this tensor column, and we can do the
        >>> # typical DataFrame operations on this column.
        >>> col = 2 * (df["two"] + 10)
        >>> type(col) # doctest: +SKIP
        pandas.core.series.Series
        >>> # Aggregations that return a single row's value yield our
        >>> # TensorArrayElement type, a light ndarray wrapper.
        >>> tensor = col.mean()
        >>> type(tensor.to_numpy()) # doctest: +SKIP
        numpy.ndarray
        >>> # In addition to doing Pandas operations on the tensor column,
        >>> # you can now put the DataFrame into a Dataset.
        >>> ds = ray.data.from_pandas(df) # doctest: +SKIP
        >>> # Internally, this column is represented as the corresponding
        >>> # Arrow tensor extension type.
        >>> ds.schema() # doctest: +SKIP
        one: int64
        two: extension<arrow.py_extension_type<ArrowTensorType>>
        >>> # The tensor extension type is preserved along the
        >>> # Pandas --> Arrow --> Parquet --> Arrow --> Pandas
        >>> # conversion chain.
        >>> ds.write_parquet("/some/path") # doctest: +SKIP
        >>> read_ds = ray.data.read_parquet("/some/path") # doctest: +SKIP
        >>> read_df = ray.get(read_ds.to_pandas_refs())[0] # doctest: +SKIP
        >>> read_df.equals(df) # doctest: +SKIP
        True
    """

    # NOTE(Clark): This is apparently required to prevent integer indexing
    # errors, but is an undocumented ExtensionDtype attribute. See issue:
    # https://github.com/CODAIT/text-extensions-for-pandas/issues/166
    base = None

    def __init__(self, shape: Tuple[Optional[int], ...], dtype: np.dtype):
        """
        Args:
            shape: Shape of each tensor element (without the row axis); a tuple
                of Nones when elements are variable-shaped.
            dtype: NumPy dtype of the underlying tensor elements.
        """
        self._shape = shape
        self._dtype = dtype

    @property
    def type(self):
        """
        The scalar type for the array, e.g. ``int``.

        It's expected ``ExtensionArray[item]`` returns an instance
        of ``ExtensionDtype.type`` for scalar ``item``, assuming
        that value is valid (not NA). NA values do not need to be
        instances of `type`.
        """
        return TensorArrayElement

    @property
    def element_dtype(self):
        """
        The dtype of the underlying tensor elements.
        """
        return self._dtype

    @property
    def element_shape(self):
        """
        The shape of the underlying tensor elements. This will be a tuple of
        Nones if the corresponding TensorArray for this TensorDtype holds
        variable-shaped tensor elements.
        """
        return self._shape

    @property
    def is_variable_shaped(self):
        """
        Whether the corresponding TensorArray for this TensorDtype holds
        variable-shaped tensor elements.
        """
        # FIX: this previously read ``self.shape``, which is not an attribute
        # of TensorDtype (the element shape is stored as ``self._shape``) and
        # raised AttributeError when accessed.
        return all(dim_size is None for dim_size in self._shape)

    @property
    def name(self) -> str:
        """
        A string identifying the data type.

        Will be used for display in, e.g. ``Series.dtype``.
        """
        return f"numpy.ndarray(shape={self._shape}, dtype={self._dtype})"

    @classmethod
    def construct_from_string(cls, string: str):
        """
        Construct this type from a string.

        Accepts the forms produced by ``TensorDtype.name``/``repr``, e.g.
        ``"TensorDtype(shape=(1, 2, 3), dtype=int64)"`` or
        ``"numpy.ndarray(shape=(1, 2, 3), dtype=int64)"``.

        Parameters
        ----------
        string : str
            The name of the type, for example
            ``TensorDtype(shape=(1, 2, 3), dtype=int64)``.

        Returns
        -------
        TensorDtype
            Instance of the dtype.

        Raises
        ------
        TypeError
            If a class cannot be constructed from this 'string'.
        """
        import ast
        import re

        if not isinstance(string, str):
            raise TypeError(
                f"'construct_from_string' expects a string, got {type(string)}"
            )
        # Upstream code uses exceptions as part of its normal control flow and
        # will pass this method bogus class names.
        regex = (
            r"^(TensorDtype|numpy.ndarray)"
            r"\(shape=(\((?:(?:\d+|None),?\s?)*\)), dtype=(\w+)\)$"
        )
        m = re.search(regex, string)
        err_msg = (
            f"Cannot construct a '{cls.__name__}' from '{string}'; expected a string "
            "like 'TensorDtype(shape=(1, 2, 3), dtype=int64)'."
        )
        if m is None:
            raise TypeError(err_msg)
        groups = m.groups()
        if len(groups) != 3:
            raise TypeError(err_msg)
        _, shape, dtype = groups
        # The shape group is a literal tuple string; parse it safely.
        shape = ast.literal_eval(shape)
        dtype = np.dtype(dtype)
        return cls(shape, dtype)

    @classmethod
    def construct_array_type(cls):
        """
        Return the array type associated with this dtype.

        Returns
        -------
        type
        """
        return TensorArray

    def __from_arrow__(self, array: Union[pa.Array, pa.ChunkedArray]):
        """
        Convert a pyarrow (chunked) array to a TensorArray.

        This and TensorArray.__arrow_array__ make up the
        Pandas extension type + array <--> Arrow extension type + array
        interoperability protocol. See
        https://pandas.pydata.org/pandas-docs/stable/development/extending.html#compatibility-with-apache-arrow
        for more information.
        """
        if isinstance(array, pa.ChunkedArray):
            if array.num_chunks > 1:
                # TODO(Clark): Remove concat and construct from list with
                # shape.
                values = np.concatenate(
                    [chunk.to_numpy() for chunk in array.iterchunks()]
                )
            else:
                values = array.chunk(0).to_numpy()
        else:
            values = array.to_numpy()

        return TensorArray(values)

    def __str__(self) -> str:
        return self.name

    def __repr__(self) -> str:
        return str(self)

    @property
    def _is_boolean(self):
        """
        Whether this extension array should be considered boolean.

        By default, ExtensionArrays are assumed to be non-numeric.
        Setting this to True will affect the behavior of several places,
        e.g.

        * is_bool
        * boolean indexing

        Returns
        -------
        bool
        """
        # This is needed to support returning a TensorArray from .isnan().
        from pandas.core.dtypes.common import is_bool_dtype

        return is_bool_dtype(self._dtype)
475
+
476
+
477
class _TensorOpsMixin(pd.api.extensions.ExtensionScalarOpsMixin):
    """
    Mixin for TensorArray operator support, applying operations on the
    underlying ndarrays.
    """

    @classmethod
    def _create_method(cls, op, coerce_to_dtype=True, result_dtype=None):
        """
        Add support for binary operators by unwrapping, applying, and
        rewrapping.

        Overrides ExtensionScalarOpsMixin._create_method; ``coerce_to_dtype``
        and ``result_dtype`` are accepted for signature compatibility but are
        not used here.
        """

        # NOTE(Clark): This overrides, but coerce_to_dtype, result_dtype might
        # not be needed

        def _binop(self, other):
            lvalues = self._tensor

            if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndex)):
                # Rely on Pandas to unbox and dispatch to us.
                return NotImplemented

            # divmod returns a tuple
            # NOTE: `op_name` is assigned in the enclosing scope *after* this
            # closure is defined; late binding resolves it at call time.
            if op_name in ["__divmod__", "__rdivmod__"]:
                # TODO(Clark): Add support for divmod and rdivmod.
                # div, mod = result
                raise NotImplementedError

            # Unwrap tensor wrappers to their raw ndarrays before applying op.
            if isinstance(other, (TensorArray, TensorArrayElement)):
                rvalues = other._tensor
            else:
                rvalues = other

            result = op(lvalues, rvalues)

            # Force a TensorArray if rvalue is not a scalar.
            if isinstance(self, TensorArrayElement) and (
                not isinstance(other, TensorArrayElement) or not np.isscalar(other)
            ):
                result_wrapped = TensorArray(result)
            else:
                result_wrapped = cls(result)

            return result_wrapped

        op_name = f"__{op.__name__}__"
        # Give the closure the dunder name pandas expects for this operator.
        return set_function_name(_binop, op_name, cls)

    @classmethod
    def _create_logical_method(cls, op):
        # Logical ops (&, |, ^) reuse the same unwrap/apply/rewrap machinery.
        return cls._create_method(op)
529
+
530
+
531
+ class _TensorScalarCastMixin:
532
+ """
533
+ Mixin for casting scalar tensors to a particular numeric type.
534
+ """
535
+
536
+ def _scalarfunc(self, func: Callable[[Any], Any]):
537
+ return func(self._tensor)
538
+
539
+ def __complex__(self):
540
+ return self._scalarfunc(complex)
541
+
542
+ def __float__(self):
543
+ return self._scalarfunc(float)
544
+
545
+ def __int__(self):
546
+ return self._scalarfunc(int)
547
+
548
+ def __hex__(self):
549
+ return self._scalarfunc(hex)
550
+
551
+ def __oct__(self):
552
+ return self._scalarfunc(oct)
553
+
554
+
555
@PublicAPI(stability="beta")
class TensorArrayElement(_TensorOpsMixin, _TensorScalarCastMixin):
    """
    A single row of a TensorArray: a thin wrapper around one NumPy ndarray.
    """

    def __init__(self, values: np.ndarray):
        """
        Wrap a NumPy ndarray as a single tensor-column element.

        Args:
            values: ndarray that underlies this TensorArray element.
        """
        self._tensor = values

    def __repr__(self):
        return repr(self._tensor)

    def __str__(self):
        return str(self._tensor)

    @property
    def numpy_dtype(self):
        """
        The numpy dtype of the backing ndarray.
        """
        return self._tensor.dtype

    @property
    def numpy_ndim(self):
        """
        The number of dimensions of the backing ndarray.
        """
        return self._tensor.ndim

    @property
    def numpy_shape(self):
        """
        The shape (tuple of ints) of the backing ndarray.
        """
        return self._tensor.shape

    @property
    def numpy_size(self):
        """
        The total number of elements in the backing ndarray.
        """
        return self._tensor.size

    def to_numpy(self):
        """
        Return the values of this element as a NumPy ndarray.
        """
        return np.asarray(self._tensor)

    def __array__(self, dtype: np.dtype = None, **kwargs) -> np.ndarray:
        # NumPy protocol hook: lets np.asarray()/np.array() unwrap us.
        return np.asarray(self._tensor, dtype=dtype, **kwargs)
616
+
617
+
618
+ @PublicAPI(stability="beta")
619
+ class TensorArray(
620
+ pd.api.extensions.ExtensionArray,
621
+ _TensorOpsMixin,
622
+ _TensorScalarCastMixin,
623
+ ):
624
+ """
625
+ Pandas `ExtensionArray` representing a tensor column, i.e. a column
626
+ consisting of ndarrays as elements.
627
+
628
+ This extension supports tensors in which the elements have different shapes.
629
+ However, each tensor element must be non-ragged, i.e. each tensor element must have
630
+ a well-defined, non-ragged shape.
631
+
632
+ Examples:
633
+ >>> # Create a DataFrame with a list of ndarrays as a column.
634
+ >>> import pandas as pd
635
+ >>> import numpy as np
636
+ >>> import ray
637
+ >>> from ray.data.extensions import TensorArray
638
+ >>> df = pd.DataFrame({
639
+ ... "one": [1, 2, 3],
640
+ ... "two": TensorArray(np.arange(24).reshape((3, 2, 2, 2)))})
641
+ >>> # Note that the column dtype is TensorDtype.
642
+ >>> df.dtypes # doctest: +SKIP
643
+ one int64
644
+ two TensorDtype(shape=(3, 2, 2, 2), dtype=int64)
645
+ dtype: object
646
+ >>> # Pandas is aware of this tensor column, and we can do the
647
+ >>> # typical DataFrame operations on this column.
648
+ >>> col = 2 * (df["two"] + 10)
649
+ >>> # The ndarrays underlying the tensor column will be manipulated,
650
+ >>> # but the column itself will continue to be a Pandas type.
651
+ >>> type(col) # doctest: +SKIP
652
+ pandas.core.series.Series
653
+ >>> col # doctest: +SKIP
654
+ 0 [[[ 2 4]
655
+ [ 6 8]]
656
+ [[10 12]
657
+ [14 16]]]
658
+ 1 [[[18 20]
659
+ [22 24]]
660
+ [[26 28]
661
+ [30 32]]]
662
+ 2 [[[34 36]
663
+ [38 40]]
664
+ [[42 44]
665
+ [46 48]]]
666
+ Name: two, dtype: TensorDtype(shape=(3, 2, 2, 2), dtype=int64)
667
+ >>> # Once you do an aggregation on that column that returns a single
668
+ >>> # row's value, you get back our TensorArrayElement type.
669
+ >>> tensor = col.mean() # doctest: +SKIP
670
+ >>> type(tensor) # doctest: +SKIP
671
+ ray.data.extensions.tensor_extension.TensorArrayElement
672
+ >>> tensor # doctest: +SKIP
673
+ array([[[18., 20.],
674
+ [22., 24.]],
675
+ [[26., 28.],
676
+ [30., 32.]]])
677
+ >>> # This is a light wrapper around a NumPy ndarray, and can easily
678
+ >>> # be converted to an ndarray.
679
+ >>> type(tensor.to_numpy()) # doctest: +SKIP
680
+ numpy.ndarray
681
+ >>> # In addition to doing Pandas operations on the tensor column,
682
+ >>> # you can now put the DataFrame into a Dataset.
683
+ >>> ds = ray.data.from_pandas(df) # doctest: +SKIP
684
+ >>> # Internally, this column is represented the corresponding
685
+ >>> # Arrow tensor extension type.
686
+ >>> ds.schema() # doctest: +SKIP
687
+ one: int64
688
+ two: extension<arrow.py_extension_type<ArrowTensorType>>
689
+ >>> # You can write the dataset to Parquet.
690
+ >>> ds.write_parquet("/some/path") # doctest: +SKIP
691
+ >>> # And you can read it back.
692
+ >>> read_ds = ray.data.read_parquet("/some/path") # doctest: +SKIP
693
+ >>> read_ds.schema() # doctest: +SKIP
694
+ one: int64
695
+ two: extension<arrow.py_extension_type<ArrowTensorType>>
696
+
697
+ >>> read_df = ray.get(read_ds.to_pandas_refs())[0] # doctest: +SKIP
698
+ >>> read_df.dtypes # doctest: +SKIP
699
+ one int64
700
+ two TensorDtype(shape=(3, 2, 2, 2), dtype=int64)
701
+ dtype: object
702
+ >>> # The tensor extension type is preserved along the
703
+ >>> # Pandas --> Arrow --> Parquet --> Arrow --> Pandas
704
+ >>> # conversion chain.
705
+ >>> read_df.equals(df) # doctest: +SKIP
706
+ True
707
+ """
708
+
709
    # Reduction name -> NumPy reducer; maps the pandas reduction names that
    # `_reduce` supports onto the NumPy functions applied to the backing
    # ndarray.
    SUPPORTED_REDUCERS = {
        "sum": np.sum,
        "all": np.all,
        "any": np.any,
        "min": np.min,
        "max": np.max,
        "mean": np.mean,
        "median": np.median,
        "prod": np.prod,
        "std": np.std,
        "var": np.var,
    }
721
+
722
    # See https://github.com/pandas-dev/pandas/blob/master/pandas/core/arrays/base.py
    # for interface documentation and the subclassing contract.
    def __init__(
        self,
        values: Union[
            np.ndarray,
            ABCSeries,
            Sequence[Union[np.ndarray, TensorArrayElement]],
            TensorArrayElement,
            Any,
        ],
    ):
        """
        Args:
            values: A NumPy ndarray or sequence of NumPy ndarrays of equal
                shape.

        Raises:
            TypeError: If ``values`` is not an ndarray / sequence of ndarrays,
                is an existing TensorArray, or is an object-typed ndarray whose
                elements are not ndarray-like.
        """
        # Try to convert some well-known objects to ndarrays before handing off to
        # ndarray handling logic.
        if isinstance(values, ABCSeries):
            values = _create_possibly_ragged_ndarray(values)
        elif isinstance(values, Sequence):
            # Unwrap any TensorArrayElements to their raw ndarrays first.
            values = [
                np.asarray(v) if isinstance(v, TensorArrayElement) else v
                for v in values
            ]
            values = _create_possibly_ragged_ndarray(values)
        elif isinstance(values, TensorArrayElement):
            # A single element becomes a length-1 array.
            values = np.array([np.asarray(values)], copy=False)

        if isinstance(values, np.ndarray):
            if values.dtype.type is np.object_:
                if len(values) == 0:
                    # Tensor is empty, pass through to create empty TensorArray.
                    pass
                elif all(
                    isinstance(v, (np.ndarray, TensorArrayElement, Sequence))
                    and not isinstance(v, str)
                    for v in values
                ):
                    values = [np.asarray(v) for v in values]
                    # Try to convert ndarrays of ndarrays/TensorArrayElements with an
                    # opaque object type to a properly typed ndarray of ndarrays.
                    values = _create_possibly_ragged_ndarray(values)
                else:
                    raise TypeError(
                        "Expected a well-typed ndarray or an object-typed ndarray of "
                        "ndarray pointers, but got an object-typed ndarray whose "
                        f"subndarrays are of type {type(values[0])}."
                    )
        elif isinstance(values, TensorArray):
            raise TypeError("Use the copy() method to create a copy of a TensorArray.")
        else:
            raise TypeError(
                "Expected a numpy.ndarray or sequence of numpy.ndarray, "
                f"but received {values} of type {type(values).__name__} instead."
            )
        assert isinstance(values, np.ndarray)
        self._tensor = values
        # Lazily computed and cached by the `is_variable_shaped` property.
        self._is_variable_shaped = None
782
+
783
+ @classmethod
784
+ def _from_sequence(
785
+ cls, scalars, *, dtype: Optional[Dtype] = None, copy: bool = False
786
+ ):
787
+ """
788
+ Construct a new ExtensionArray from a sequence of scalars.
789
+
790
+ Parameters
791
+ ----------
792
+ scalars : Sequence
793
+ Each element will be an instance of the scalar type for this
794
+ array, ``cls.dtype.type`` or be converted into this type in this
795
+ method.
796
+ dtype : dtype, optional
797
+ Construct for this particular dtype. This should be a Dtype
798
+ compatible with the ExtensionArray.
799
+ copy : bool, default False
800
+ If True, copy the underlying data.
801
+
802
+ Returns
803
+ -------
804
+ ExtensionArray
805
+ """
806
+ if copy and isinstance(scalars, np.ndarray):
807
+ scalars = scalars.copy()
808
+ elif isinstance(scalars, TensorArray):
809
+ scalars = scalars._tensor.copy() if copy else scalars._tensor
810
+ return TensorArray(scalars)
811
+
812
+ @classmethod
813
+ def _from_factorized(
814
+ cls, values: np.ndarray, original: pd.api.extensions.ExtensionArray
815
+ ):
816
+ """
817
+ Reconstruct an ExtensionArray after factorization.
818
+
819
+ Parameters
820
+ ----------
821
+ values : ndarray
822
+ An integer ndarray with the factorized values.
823
+ original : ExtensionArray
824
+ The original ExtensionArray that factorize was called on.
825
+
826
+ See Also
827
+ --------
828
+ factorize : Top-level factorize method that dispatches here.
829
+ ExtensionArray.factorize : Encode the extension array as an enumerated
830
+ type.
831
+ """
832
+ raise NotImplementedError
833
+
834
    def __getitem__(
        self, item: Union[int, slice, np.ndarray]
    ) -> Union["TensorArray", "TensorArrayElement"]:
        """
        Select a subset of self.

        Parameters
        ----------
        item : int, slice, or ndarray
            * int: The position in 'self' to get.
            * slice: A slice object, where 'start', 'stop', and 'step' are
              integers or None
            * ndarray: A 1-d boolean NumPy ndarray the same length as 'self'

        Returns
        -------
        item : scalar or ExtensionArray

        Notes
        -----
        For scalar ``item``, return a scalar value suitable for the array's
        type. This should be an instance of ``self.dtype.type``.
        For slice ``key``, return an instance of ``ExtensionArray``, even
        if the slice is length 0 or 1.
        For a boolean mask, return an instance of ``ExtensionArray``, filtered
        to the values where ``item`` is True.
        """
        # Return scalar if single value is selected, a TensorArrayElement for
        # single array element, or TensorArray for slice.
        if isinstance(item, int):
            value = self._tensor[item]
            if np.isscalar(value):
                return value
            else:
                return TensorArrayElement(value)
        else:
            # BEGIN workaround for Pandas issue #42430
            # (strip a leading Ellipsis from a tuple indexer that pandas
            # erroneously passes through).
            if isinstance(item, tuple) and len(item) > 1 and item[0] == Ellipsis:
                if len(item) > 2:
                    # Hopefully this case is not possible, but can't be sure
                    raise ValueError(
                        "Workaround Pandas issue #42430 not "
                        "implemented for tuple length > 2"
                    )
                item = item[1]
            # END workaround for issue #42430
            if isinstance(item, TensorArray):
                item = np.asarray(item)
            # Normalize/validate the indexer per the pandas indexing contract.
            item = check_array_indexer(self, item)
            return TensorArray(self._tensor[item])
884
+
885
+ def __len__(self) -> int:
886
+ """
887
+ Length of this array.
888
+
889
+ Returns
890
+ -------
891
+ length : int
892
+ """
893
+ return len(self._tensor)
894
+
895
+ @property
896
+ def dtype(self) -> pd.api.extensions.ExtensionDtype:
897
+ """
898
+ An instance of 'ExtensionDtype'.
899
+ """
900
+ if self.is_variable_shaped:
901
+ # A tensor is only considered variable-shaped if it's non-empty, so no
902
+ # non-empty check is needed here.
903
+ dtype = self._tensor[0].dtype
904
+ shape = (None,) * self._tensor[0].ndim
905
+ else:
906
+ dtype = self.numpy_dtype
907
+ shape = self.numpy_shape[1:]
908
+ return TensorDtype(shape, dtype)
909
+
910
+ @property
911
+ def is_variable_shaped(self):
912
+ """
913
+ Whether this TensorArray holds variable-shaped tensor elements.
914
+ """
915
+ if self._is_variable_shaped is None:
916
+ self._is_variable_shaped = _is_ndarray_variable_shaped_tensor(self._tensor)
917
+ return self._is_variable_shaped
918
+
919
+ @property
920
+ def nbytes(self) -> int:
921
+ """
922
+ The number of bytes needed to store this object in memory.
923
+ """
924
+ return self._tensor.nbytes
925
+
926
+ def isna(self) -> "TensorArray":
927
+ """
928
+ A 1-D array indicating if each value is missing.
929
+
930
+ Returns
931
+ -------
932
+ na_values : Union[np.ndarray, ExtensionArray]
933
+ In most cases, this should return a NumPy ndarray. For
934
+ exceptional cases like ``SparseArray``, where returning
935
+ an ndarray would be expensive, an ExtensionArray may be
936
+ returned.
937
+
938
+ Notes
939
+ -----
940
+ If returning an ExtensionArray, then
941
+
942
+ * ``na_values._is_boolean`` should be True
943
+ * `na_values` should implement :func:`ExtensionArray._reduce`
944
+ * ``na_values.any`` and ``na_values.all`` should be implemented
945
+ """
946
+ if self._tensor.dtype.type is np.object_:
947
+ # Avoid comparing with __eq__ because the elements of the tensor
948
+ # may do something funny with that operation.
949
+ return np.array(
950
+ [self._tensor[i] is None for i in range(len(self))], dtype=bool
951
+ )
952
+ elif self._tensor.dtype.type is np.str_:
953
+ return np.all(self._tensor == "", axis=tuple(range(1, self._tensor.ndim)))
954
+ else:
955
+ return np.all(
956
+ np.isnan(self._tensor), axis=tuple(range(1, self._tensor.ndim))
957
+ )
958
+
959
+ def take(
960
+ self, indices: Sequence[int], allow_fill: bool = False, fill_value: Any = None
961
+ ) -> "TensorArray":
962
+ """
963
+ Take elements from an array.
964
+
965
+ Parameters
966
+ ----------
967
+ indices : sequence of int
968
+ Indices to be taken.
969
+ allow_fill : bool, default False
970
+ How to handle negative values in `indices`.
971
+
972
+ * False: negative values in `indices` indicate positional indices
973
+ from the right (the default). This is similar to
974
+ :func:`numpy.take`.
975
+
976
+ * True: negative values in `indices` indicate
977
+ missing values. These values are set to `fill_value`. Any other
978
+ other negative values raise a ``ValueError``.
979
+
980
+ fill_value : any, optional
981
+ Fill value to use for NA-indices when `allow_fill` is True.
982
+ This may be ``None``, in which case the default NA value for
983
+ the type, ``self.dtype.na_value``, is used.
984
+
985
+ For many ExtensionArrays, there will be two representations of
986
+ `fill_value`: a user-facing "boxed" scalar, and a low-level
987
+ physical NA value. `fill_value` should be the user-facing version,
988
+ and the implementation should handle translating that to the
989
+ physical version for processing the take if necessary.
990
+
991
+ Returns
992
+ -------
993
+ ExtensionArray
994
+
995
+ Raises
996
+ ------
997
+ IndexError
998
+ When the indices are out of bounds for the array.
999
+ ValueError
1000
+ When `indices` contains negative values other than ``-1``
1001
+ and `allow_fill` is True.
1002
+
1003
+ See Also
1004
+ --------
1005
+ numpy.take : Take elements from an array along an axis.
1006
+ api.extensions.take : Take elements from an array.
1007
+
1008
+ Notes
1009
+ -----
1010
+ ExtensionArray.take is called by ``Series.__getitem__``, ``.loc``,
1011
+ ``iloc``, when `indices` is a sequence of values. Additionally,
1012
+ it's called by :meth:`Series.reindex`, or any other method
1013
+ that causes realignment, with a `fill_value`.
1014
+
1015
+ Examples
1016
+ --------
1017
+ Here's an example implementation, which relies on casting the
1018
+ extension array to object dtype. This uses the helper method
1019
+ :func:`pandas.api.extensions.take`.
1020
+
1021
+ .. code-block:: python
1022
+
1023
+ def take(self, indices, allow_fill=False, fill_value=None):
1024
+ from pandas.core.algorithms import take
1025
+
1026
+ # If the ExtensionArray is backed by an ndarray, then
1027
+ # just pass that here instead of coercing to object.
1028
+ data = self.astype(object)
1029
+
1030
+ if allow_fill and fill_value is None:
1031
+ fill_value = self.dtype.na_value
1032
+
1033
+ # fill value should always be translated from the scalar
1034
+ # type for the array, to the physical storage type for
1035
+ # the data, before passing to take.
1036
+
1037
+ result = take(data, indices, fill_value=fill_value,
1038
+ allow_fill=allow_fill)
1039
+ return self._from_sequence(result, dtype=self.dtype)
1040
+ """
1041
+ if allow_fill:
1042
+ # With allow_fill being True, negative values in `indices` indicate
1043
+ # missing values and should be set to `fill_value`.
1044
+ indices = np.asarray(indices, dtype=np.intp)
1045
+ validate_indices(indices, len(self._tensor))
1046
+
1047
+ # Check if there are missing indices to fill, otherwise we can
1048
+ # delegate to NumPy ndarray .take().
1049
+ has_missing = np.any(indices < 0)
1050
+ if has_missing:
1051
+ if fill_value is None:
1052
+ fill_value = np.nan
1053
+
1054
+ # Create an array populated with fill value.
1055
+ values = np.full((len(indices),) + self._tensor.shape[1:], fill_value)
1056
+
1057
+ # Put tensors at the given positive indices into array.
1058
+ is_nonneg = indices >= 0
1059
+ np.put(values, np.where(is_nonneg)[0], self._tensor[indices[is_nonneg]])
1060
+
1061
+ return TensorArray(values)
1062
+
1063
+ # Delegate take to NumPy array.
1064
+ values = self._tensor.take(indices, axis=0)
1065
+
1066
+ return TensorArray(values)
1067
+
1068
+ def copy(self) -> "TensorArray":
1069
+ """
1070
+ Return a copy of the array.
1071
+
1072
+ Returns
1073
+ -------
1074
+ ExtensionArray
1075
+ """
1076
+ # TODO(Clark): Copy cached properties.
1077
+ return TensorArray(self._tensor.copy())
1078
+
1079
+ @classmethod
1080
+ def _concat_same_type(cls, to_concat: Sequence["TensorArray"]) -> "TensorArray":
1081
+ """
1082
+ Concatenate multiple array of this dtype.
1083
+
1084
+ Parameters
1085
+ ----------
1086
+ to_concat : sequence of this type
1087
+
1088
+ Returns
1089
+ -------
1090
+ ExtensionArray
1091
+ """
1092
+ should_flatten = False
1093
+ shape = None
1094
+ for a in to_concat:
1095
+ if shape is None:
1096
+ shape = a.dtype.element_shape
1097
+ if a.is_variable_shaped or a.dtype.element_shape != shape:
1098
+ should_flatten = True
1099
+ break
1100
+ if should_flatten:
1101
+ concated = TensorArray(
1102
+ np.array([e for a in to_concat for e in a._tensor], dtype=object)
1103
+ )
1104
+ else:
1105
+ concated = TensorArray(np.concatenate([a._tensor for a in to_concat]))
1106
+ return concated
1107
+
1108
    def __setitem__(self, key: Union[int, np.ndarray], value: Any) -> None:
        """
        Set one or more values inplace.

        This method is not required to satisfy the pandas extension array
        interface.

        Parameters
        ----------
        key : int, ndarray, or slice
            When called from, e.g. ``Series.__setitem__``, ``key`` will be
            one of

            * scalar int
            * ndarray of integers.
            * boolean ndarray
            * slice object

        value : ExtensionDtype.type, Sequence[ExtensionDtype.type], or object
            value or values to be set of ``key``.

        Returns
        -------
        None
        """
        # Normalize/validate the indexer per the pandas indexing contract.
        key = check_array_indexer(self, key)
        # Unwrap tensor wrappers (scalar, list, or Series-backed) to raw
        # ndarrays before assigning; the order of these checks matters.
        if isinstance(value, TensorArrayElement) or np.isscalar(value):
            value = np.asarray(value)
        if isinstance(value, list):
            value = [
                np.asarray(v) if isinstance(v, TensorArrayElement) else v for v in value
            ]
        if isinstance(value, ABCSeries) and isinstance(value.dtype, TensorDtype):
            value = value.values
        if value is None or isinstance(value, Sequence) and len(value) == 0:
            # Treat None/empty assignment as "set to missing" (NaN-filled).
            self._tensor[key] = np.full_like(self._tensor[key], np.nan)
        elif isinstance(key, (int, slice, np.ndarray)):
            self._tensor[key] = value
        else:
            raise NotImplementedError(
                f"__setitem__ with key type '{type(key)}' not implemented"
            )
1150
+
1151
+ def __contains__(self, item) -> bool:
1152
+ """
1153
+ Return for `item in self`.
1154
+ """
1155
+ if isinstance(item, TensorArrayElement):
1156
+ np_item = np.asarray(item)
1157
+ if np_item.size == 1 and np.isnan(np_item).all():
1158
+ return self.isna().any()
1159
+ return super().__contains__(item)
1160
+
1161
+ def __repr__(self):
1162
+ return self._tensor.__repr__()
1163
+
1164
+ def __str__(self):
1165
+ return self._tensor.__str__()
1166
+
1167
+ def _values_for_factorize(self) -> Tuple[np.ndarray, Any]:
1168
+ # TODO(Clark): return self._tensor, np.nan
1169
+ raise NotImplementedError
1170
+
1171
+ def _reduce(self, name: str, skipna: bool = True, **kwargs):
1172
+ """
1173
+ Return a scalar result of performing the reduction operation.
1174
+
1175
+ Parameters
1176
+ ----------
1177
+ name : str
1178
+ Name of the function, supported values are:
1179
+ { any, all, min, max, sum, mean, median, prod,
1180
+ std, var, sem, kurt, skew }.
1181
+ skipna : bool, default True
1182
+ If True, skip NaN values.
1183
+ **kwargs
1184
+ Additional keyword arguments passed to the reduction function.
1185
+ Currently, `ddof` is the only supported kwarg.
1186
+
1187
+ Returns
1188
+ -------
1189
+ scalar
1190
+
1191
+ Raises
1192
+ ------
1193
+ TypeError : subclass does not define reductions
1194
+ """
1195
+ supported_kwargs = ["ddof"]
1196
+ reducer_kwargs = {}
1197
+ for kw in supported_kwargs:
1198
+ try:
1199
+ reducer_kwargs[kw] = kwargs[kw]
1200
+ except KeyError:
1201
+ pass
1202
+ try:
1203
+ return TensorArrayElement(
1204
+ self.SUPPORTED_REDUCERS[name](self._tensor, axis=0, **reducer_kwargs)
1205
+ )
1206
+ except KeyError:
1207
+ raise NotImplementedError(f"'{name}' aggregate not implemented.") from None
1208
+
1209
+ def __array__(self, dtype: np.dtype = None, **kwargs) -> np.ndarray:
1210
+ return np.asarray(self._tensor, dtype=dtype, **kwargs)
1211
+
1212
+ def __array_ufunc__(self, ufunc: Callable, method: str, *inputs, **kwargs):
1213
+ """
1214
+ Supports NumPy ufuncs without requiring sloppy coercion to an
1215
+ ndarray.
1216
+ """
1217
+ out = kwargs.get("out", ())
1218
+ for x in inputs + out:
1219
+ if not isinstance(x, (TensorArray, np.ndarray, numbers.Number)):
1220
+ return NotImplemented
1221
+
1222
+ # Defer to the implementation of the ufunc on unwrapped values.
1223
+ inputs = tuple(x._tensor if isinstance(x, TensorArray) else x for x in inputs)
1224
+ if out:
1225
+ kwargs["out"] = tuple(
1226
+ x._tensor if isinstance(x, TensorArray) else x for x in out
1227
+ )
1228
+ result = getattr(ufunc, method)(*inputs, **kwargs)
1229
+
1230
+ if type(result) is tuple:
1231
+ # Multiple return values.
1232
+ return tuple(type(self)(x) for x in result)
1233
+ elif method == "at":
1234
+ # No return value.
1235
+ return None
1236
+ else:
1237
+ # One return value.
1238
+ return type(self)(result)
1239
+
1240
+ def to_numpy(
1241
+ self,
1242
+ dtype: np.dtype = None,
1243
+ copy: bool = False,
1244
+ na_value: Any = pd.api.extensions.no_default,
1245
+ ):
1246
+ """
1247
+ Convert to a NumPy ndarray.
1248
+
1249
+ .. versionadded:: 1.0.0
1250
+
1251
+ This is similar to :meth:`numpy.asarray`, but may provide additional
1252
+ control over how the conversion is done.
1253
+
1254
+ Parameters
1255
+ ----------
1256
+ dtype : str or numpy.dtype, optional
1257
+ The dtype to pass to :meth:`numpy.asarray`.
1258
+ copy : bool, default False
1259
+ Whether to ensure that the returned value is a not a view on
1260
+ another array. Note that ``copy=False`` does not *ensure* that
1261
+ ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensure that
1262
+ a copy is made, even if not strictly necessary.
1263
+ na_value : Any, optional
1264
+ The value to use for missing values. The default value depends
1265
+ on `dtype` and the type of the array.
1266
+
1267
+ Returns
1268
+ -------
1269
+ numpy.ndarray
1270
+ """
1271
+ if dtype is not None:
1272
+ dtype = pd.api.types.pandas_dtype(dtype)
1273
+ if copy:
1274
+ values = np.array(self._tensor, dtype=dtype, copy=True)
1275
+ else:
1276
+ values = self._tensor.astype(dtype)
1277
+ elif copy:
1278
+ values = self._tensor.copy()
1279
+ else:
1280
+ values = self._tensor
1281
+ return values
1282
+
1283
+ @property
1284
+ def numpy_dtype(self):
1285
+ """
1286
+ Get the dtype of the tensor.
1287
+ :return: The numpy dtype of the backing ndarray
1288
+ """
1289
+ return self._tensor.dtype
1290
+
1291
+ @property
1292
+ def numpy_ndim(self):
1293
+ """
1294
+ Get the number of tensor dimensions.
1295
+ :return: integer for the number of dimensions
1296
+ """
1297
+ return self._tensor.ndim
1298
+
1299
+ @property
1300
+ def numpy_shape(self):
1301
+ """
1302
+ Get the shape of the tensor.
1303
+ :return: A tuple of integers for the numpy shape of the backing ndarray
1304
+ """
1305
+ return self._tensor.shape
1306
+
1307
+ @property
1308
+ def numpy_size(self):
1309
+ """
1310
+ Get the size of the tensor.
1311
+ :return: integer for the number of elements in the tensor
1312
+ """
1313
+ return self._tensor.size
1314
+
1315
+ def astype(self, dtype, copy=True):
1316
+ """
1317
+ Cast to a NumPy array with 'dtype'.
1318
+
1319
+ Parameters
1320
+ ----------
1321
+ dtype : str or dtype
1322
+ Typecode or data-type to which the array is cast.
1323
+ copy : bool, default True
1324
+ Whether to copy the data, even if not necessary. If False,
1325
+ a copy is made only if the old dtype does not match the
1326
+ new dtype.
1327
+
1328
+ Returns
1329
+ -------
1330
+ array : ndarray
1331
+ NumPy ndarray with 'dtype' for its dtype.
1332
+ """
1333
+ dtype = pd.api.types.pandas_dtype(dtype)
1334
+
1335
+ if isinstance(dtype, TensorDtype):
1336
+ values = TensorArray(self._tensor.copy()) if copy else self
1337
+ elif not (
1338
+ pd.api.types.is_object_dtype(dtype) and pd.api.types.is_string_dtype(dtype)
1339
+ ):
1340
+ values = np.array([str(t) for t in self._tensor])
1341
+ if isinstance(dtype, pd.StringDtype):
1342
+ return dtype.construct_array_type()._from_sequence(values, copy=False)
1343
+ else:
1344
+ return values
1345
+ elif pd.api.types.is_object_dtype(dtype):
1346
+ # Interpret astype(object) as "cast to an array of numpy arrays"
1347
+ values = np.empty(len(self), dtype=object)
1348
+ for i in range(len(self)):
1349
+ values[i] = self._tensor[i]
1350
+ else:
1351
+ values = self._tensor.astype(dtype, copy=copy)
1352
+ return values
1353
+
1354
+ def any(self, axis=None, out=None, keepdims=False):
1355
+ """
1356
+ Test whether any array element along a given axis evaluates to True.
1357
+
1358
+ See numpy.any() documentation for more information
1359
+ https://numpy.org/doc/stable/reference/generated/numpy.any.html#numpy.any
1360
+
1361
+ :param axis: Axis or axes along which a logical OR reduction is
1362
+ performed.
1363
+ :param out: Alternate output array in which to place the result.
1364
+ :param keepdims: If this is set to True, the axes which are reduced are
1365
+ left in the result as dimensions with size one.
1366
+ :return: single boolean unless axis is not None else TensorArray
1367
+ """
1368
+ result = self._tensor.any(axis=axis, out=out, keepdims=keepdims)
1369
+ return result if axis is None else TensorArray(result)
1370
+
1371
+ def all(self, axis=None, out=None, keepdims=False):
1372
+ """
1373
+ Test whether all array elements along a given axis evaluate to True.
1374
+
1375
+ :param axis: Axis or axes along which a logical AND reduction is
1376
+ performed.
1377
+ :param out: Alternate output array in which to place the result.
1378
+ :param keepdims: If this is set to True, the axes which are reduced are
1379
+ left in the result as dimensions with size one.
1380
+ :return: single boolean unless axis is not None else TensorArray
1381
+ """
1382
+ result = self._tensor.all(axis=axis, out=out, keepdims=keepdims)
1383
+ return result if axis is None else TensorArray(result)
1384
+
1385
+ def __arrow_array__(self, type=None):
1386
+ """
1387
+ Convert this TensorArray to an ArrowTensorArray extension array.
1388
+
1389
+ This and TensorDtype.__from_arrow__ make up the
1390
+ Pandas extension type + array <--> Arrow extension type + array
1391
+ interoperability protocol. See
1392
+ https://pandas.pydata.org/pandas-docs/stable/development/extending.html#compatibility-with-apache-arrow
1393
+ for more information.
1394
+ """
1395
+ from ray.air.util.tensor_extensions.arrow import (
1396
+ ArrowTensorArray,
1397
+ ArrowVariableShapedTensorArray,
1398
+ )
1399
+
1400
+ if self.is_variable_shaped:
1401
+ return ArrowVariableShapedTensorArray.from_numpy(self._tensor)
1402
+ else:
1403
+ return ArrowTensorArray.from_numpy(self._tensor)
1404
+
1405
+ @property
1406
+ def _is_boolean(self):
1407
+ """
1408
+ Whether this extension array should be considered boolean.
1409
+
1410
+ By default, ExtensionArrays are assumed to be non-numeric.
1411
+ Setting this to True will affect the behavior of several places,
1412
+ e.g.
1413
+
1414
+ * is_bool
1415
+ * boolean indexing
1416
+
1417
+ Returns
1418
+ -------
1419
+ bool
1420
+ """
1421
+ # This is needed to support returning a TensorArray from .isnan().
1422
+ return self.dtype._is_boolean()
1423
+
1424
+
1425
# Install the arithmetic, comparison, and logical operator methods from the
# pandas ops mixin onto both the element and array classes.
TensorArrayElement._add_arithmetic_ops()
TensorArray._add_arithmetic_ops()
TensorArrayElement._add_comparison_ops()
TensorArray._add_comparison_ops()
TensorArrayElement._add_logical_ops()
TensorArray._add_logical_ops()
1433
+
1434
+
1435
@PublicAPI(stability="beta")
def column_needs_tensor_extension(s: pd.Series) -> bool:
    """Return whether the provided pandas Series column needs a tensor extension
    representation, which provides more efficient slicing and interop with ML
    frameworks.

    Args:
        s: The pandas Series column that may need to be represented using the
            tensor extension.

    Returns:
        Whether the provided Series needs a tensor extension representation.
    """
    # NOTE: O(1) check — only the dtype and the first element are inspected.
    if s.dtype.type is not np.object_ or s.empty:
        return False
    return isinstance(s.iloc[0], np.ndarray)
.venv/lib/python3.11/site-packages/ray/air/util/torch_dist.py ADDED
@@ -0,0 +1,191 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """This file is modeled after ray/python/ray/train/torch/config.py
2
+
3
+ The logics are duplicated right now to allow maximum flexibility for
4
+ setting up PyTorch DDP process groups outside the context of Ray Train.
5
+ Eventually, these use cases should be consolidated.
6
+ """
7
+
8
+ import os
9
+ from abc import ABC
10
+ from collections import defaultdict
11
+ from datetime import timedelta
12
+ from typing import Callable, List, T
13
+
14
+ import torch
15
+ import torch.distributed as dist
16
+
17
+ import ray
18
+ from ray.actor import ActorHandle
19
+ from ray.air._internal.torch_utils import get_devices
20
+ from ray.train._internal.utils import get_address_and_port
21
+
22
+
23
class TorchDistributedWorker(ABC):
    """Interface required by ``init_torch_dist_process_group()``.

    Modeled after RayTrainerWorker: allows arbitrary functions to be executed
    on a remote DDP worker.
    """

    def execute(self, func: Callable[..., T], *args, **kwargs) -> T:
        """Run ``func(*args, **kwargs)`` on this worker and return its result.

        Args:
            func: The function to execute.
            args, kwargs: The arguments to pass into func.
        """
        result = func(*args, **kwargs)
        return result
38
+
39
+
40
def _init_torch_distributed(
    init_method: str,
    backend: str,
    rank: int,
    world_size: int,
    local_rank: int,
    local_world_size: int,
    master_addr: str,
    master_port: str,
    gpu_ids: List[int],
    **init_process_group_kwargs,
):
    """Initialize the torch.distributed backend on this worker.

    Sets up the rendezvous URL (env:// or tcp://), NCCL-specific environment
    configuration when applicable, joins the process group, and exports the
    standard rank/world-size environment variables.
    """
    if init_method == "env":
        os.environ["MASTER_ADDR"] = str(master_addr)
        os.environ["MASTER_PORT"] = str(master_port)
        url = "env://"
    elif init_method == "tcp":
        url = f"tcp://{master_addr}:{master_port}"
    else:
        raise ValueError(
            f"The provided init_method ("
            f"{init_method}) is not supported. Must "
            f"be either 'env' or 'tcp'."
        )

    if backend == "nccl":
        # Same as in Ray Train.
        os.environ["NCCL_ASYNC_ERROR_HANDLING"] = "1"
        # All workers on the same node must share one set of visible GPUs;
        # otherwise they cannot communicate with each other.
        os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(str(gid) for gid in gpu_ids)

    group_kwargs = dict(init_process_group_kwargs)
    group_kwargs.update(
        backend=backend,
        init_method=url,
        rank=rank,
        world_size=world_size,
    )
    group_kwargs.setdefault("timeout", timedelta(seconds=1800))
    dist.init_process_group(**group_kwargs)

    # Export the standard torch distributed environment variables.
    env_vars = {
        "RANK": rank,
        "LOCAL_RANK": local_rank,
        "WORLD_SIZE": world_size,
        "LOCAL_WORLD_SIZE": local_world_size,
    }
    for var, value in env_vars.items():
        os.environ[var] = str(value)
89
+
90
+
91
def _get_node_and_gpu_ids():
    """Return ``(node_id, gpu_ids)`` for the worker this runs on."""
    runtime_ctx = ray.get_runtime_context()
    return runtime_ctx.get_node_id(), ray.get_gpu_ids()
96
+
97
+
98
def init_torch_dist_process_group(
    workers: List[ActorHandle],
    backend: str = "gloo",
    init_method: str = "env",
    **init_process_group_kwargs,
) -> List[int]:
    """Initialize a torch distributed process group.

    Note: this util assumes that the order of the workers passed in
    are their global ranks.

    Args:
        workers: A list of TorchDistributedWorker actors.
        backend: The torch distributed backend to use,
            possible choices are "gloo" or "nccl".
        init_method: The initialization method to use,
            possible choices are "env" or "tcp".
        init_process_group_kwargs: Additional kwargs to pass to the call to
            :meth:`torch.distributed.init_process_group`.

    Returns:
        Local ranks on their respective nodes for the list of workers.
    """
    if not dist.is_available():
        raise RuntimeError("Distributed torch is not available.")

    # Build a map from node_id to workers on that node.
    node_and_gpu_ids = ray.get(
        [w.execute.remote(_get_node_and_gpu_ids) for w in workers]
    )
    # All the workers on a specific node.
    node_to_workers = defaultdict(list)
    # All the gpu ids visible to all the workers on a specific node.
    node_to_gpu_ids = defaultdict(set)
    for i, (node_id, gpu_ids) in enumerate(node_and_gpu_ids):
        node_to_workers[node_id].append(i)
        # Force list.
        if not isinstance(gpu_ids, list):
            gpu_ids = [gpu_ids]
        # It is possible for a worker to have access to multiple GPUs.
        for gpu_id in gpu_ids:
            node_to_gpu_ids[node_id].add(gpu_id)

    # Assume the first worker is the master.
    master_addr, master_port = ray.get(workers[0].execute.remote(get_address_and_port))

    setup_futures = []
    world_size = len(workers)
    local_ranks = []
    for rank, worker in enumerate(workers):
        node_id = node_and_gpu_ids[rank][0]
        local_rank = node_to_workers[node_id].index(rank)
        local_world_size = len(node_to_workers[node_id])
        setup_futures.append(
            worker.execute.remote(
                _init_torch_distributed,
                init_method=init_method,
                backend=backend,
                rank=rank,
                world_size=world_size,
                local_rank=local_rank,
                local_world_size=local_world_size,
                master_addr=master_addr,
                master_port=master_port,
                # BUG FIX: `list(set)` does NOT sort; use sorted() so that
                # CUDA_VISIBLE_DEVICES is identical (and deterministically
                # ordered) for every worker on the same node.
                gpu_ids=sorted(node_to_gpu_ids[node_id]),
                **init_process_group_kwargs,
            )
        )
        local_ranks.append(local_rank)

    # Wait for all workers to join the process group.
    ray.get(setup_futures)

    return local_ranks
174
+
175
+
176
def _shutdown_torch_distributed():
    """Tear down the torch.distributed process group and free CUDA caches."""
    dist.destroy_process_group()

    if not torch.cuda.is_available():
        return

    # Release cached CUDA memory held by this process.
    for device in get_devices():
        with torch.cuda.device(device):
            torch.cuda.empty_cache()
188
+
189
+
190
def shutdown_torch_dist_process_group(workers: List[ActorHandle]):
    """Destroy the torch process group on every worker and wait for completion."""
    futures = [w.execute.remote(_shutdown_torch_distributed) for w in workers]
    ray.get(futures)
.venv/lib/python3.11/site-packages/ray/air/util/transform_pyarrow.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ try:
2
+ import pyarrow
3
+ except ImportError:
4
+ pyarrow = None
5
+
6
+
7
def _is_column_extension_type(ca: "pyarrow.ChunkedArray") -> bool:
    """Whether the provided Arrow Table column uses an Arrow extension type."""
    column_type = ca.type
    return isinstance(column_type, pyarrow.ExtensionType)
12
+
13
+
14
def _concatenate_extension_column(ca: "pyarrow.ChunkedArray") -> "pyarrow.Array":
    """Concatenate chunks of an extension column into a contiguous array.

    This concatenation is required for creating copies and for .take() to work
    on extension arrays.
    See https://issues.apache.org/jira/browse/ARROW-16503.

    Raises:
        ValueError: If ``ca`` is not an extension-typed column.
    """
    from ray.air.util.tensor_extensions.arrow import (
        ArrowTensorArray,
        get_arrow_extension_tensor_types,
    )

    if not _is_column_extension_type(ca):
        # BUG FIX: the message previously lacked the `f` prefix, so the literal
        # text "{ca}" was raised instead of the offending array.
        raise ValueError(f"Chunked array isn't an extension array: {ca}")

    tensor_extension_types = get_arrow_extension_tensor_types()

    if ca.num_chunks == 0:
        # Create empty storage array.
        storage = pyarrow.array([], type=ca.type.storage_type)
    elif isinstance(ca.type, tensor_extension_types):
        return ArrowTensorArray._concat_same_type(ca.chunks)
    else:
        storage = pyarrow.concat_arrays([c.storage for c in ca.chunks])

    return ca.type.__arrow_ext_class__().from_storage(ca.type, storage)
.venv/lib/python3.11/site-packages/ray/serve/_private/__pycache__/deployment_state.cpython-311.pyc ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fed1e2f11cb7a8c80b117de5ff1af60d479c6ccb4594d860948a52f971a6ec45
3
+ size 125314
.venv/lib/python3.11/site-packages/ray/serve/_private/benchmarks/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/ray/serve/_private/benchmarks/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (202 Bytes). View file
 
.venv/lib/python3.11/site-packages/ray/serve/_private/benchmarks/__pycache__/common.cpython-311.pyc ADDED
Binary file (17.5 kB). View file
 
.venv/lib/python3.11/site-packages/ray/serve/_private/benchmarks/__pycache__/handle_noop_latency.cpython-311.pyc ADDED
Binary file (2.04 kB). View file
 
.venv/lib/python3.11/site-packages/ray/serve/_private/benchmarks/__pycache__/handle_throughput.cpython-311.pyc ADDED
Binary file (2.47 kB). View file
 
.venv/lib/python3.11/site-packages/ray/serve/_private/benchmarks/__pycache__/http_noop_latency.cpython-311.pyc ADDED
Binary file (2.08 kB). View file
 
.venv/lib/python3.11/site-packages/ray/serve/_private/benchmarks/__pycache__/microbenchmark.cpython-311.pyc ADDED
Binary file (11.4 kB). View file
 
.venv/lib/python3.11/site-packages/ray/serve/_private/benchmarks/__pycache__/proxy_benchmark.cpython-311.pyc ADDED
Binary file (18.1 kB). View file
 
.venv/lib/python3.11/site-packages/ray/serve/_private/benchmarks/common.py ADDED
@@ -0,0 +1,276 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import inspect
3
+ import logging
4
+ import random
5
+ import string
6
+ import time
7
+ from functools import partial
8
+ from typing import Any, Callable, Coroutine, List, Optional, Tuple
9
+
10
+ import aiohttp
11
+ import aiohttp.client_exceptions
12
+ import grpc
13
+ import numpy as np
14
+ import pandas as pd
15
+ from starlette.responses import StreamingResponse
16
+ from tqdm import tqdm
17
+
18
+ from ray import serve
19
+ from ray.serve.generated import serve_pb2, serve_pb2_grpc
20
+ from ray.serve.handle import DeploymentHandle
21
+
22
+
23
async def run_latency_benchmark(
    f: Callable, num_requests: int, *, num_warmup_requests: int = 100
) -> pd.Series:
    """Measure per-call latency of ``f`` in milliseconds.

    Runs ``num_warmup_requests`` un-recorded warmup calls followed by
    ``num_requests`` recorded calls; returns the recorded latencies.
    """
    if inspect.iscoroutinefunction(f):
        invoke = f
    else:

        async def invoke():
            f()

    latencies = []
    total_calls = num_requests + num_warmup_requests
    for i in tqdm(range(total_calls)):
        start = time.perf_counter()
        await invoke()
        elapsed_ms = 1000 * (time.perf_counter() - start)
        # Don't include warm-up requests.
        if i >= num_warmup_requests:
            latencies.append(elapsed_ms)

    return pd.Series(latencies)
44
+
45
+
46
async def run_throughput_benchmark(
    fn: Callable[[], List[float]],
    multiplier: int = 1,
    num_trials: int = 10,
    trial_runtime: float = 1,
) -> Tuple[float, float, pd.Series]:
    """Benchmarks throughput of a function.

    Args:
        fn: The function to benchmark. If it returns anything, it must be a
            list of latencies.
        multiplier: The number of requests/tokens (or other appropriate unit)
            completed in one call to `fn`.
        num_trials: The number of trials to run.
        trial_runtime: How long each trial runs; `fn` is called repeatedly
            for this duration.

    Returns (mean, stddev, latencies).
    """
    # Warmup so the first trial isn't penalized by startup costs.
    warmup_deadline = time.time() + 0.1
    while time.time() < warmup_deadline:
        await fn()

    # Benchmark
    throughputs = []
    latencies = []
    for _ in tqdm(range(num_trials)):
        trial_start = time.perf_counter()
        calls = 0
        while time.perf_counter() - trial_start < trial_runtime:
            maybe_latencies = await fn()
            if maybe_latencies:
                latencies.extend(maybe_latencies)
            calls += 1
        elapsed = time.perf_counter() - trial_start
        throughputs.append(multiplier * calls / elapsed)

    mean = round(np.mean(throughputs), 2)
    stddev = round(np.std(throughputs), 2)
    return mean, stddev, pd.Series(latencies)
87
+
88
+
89
async def do_single_http_batch(
    *,
    batch_size: int = 100,
    url: str = "http://localhost:8000",
    stream: bool = False,
) -> List[float]:
    """Sends a batch of concurrent HTTP GETs; returns e2e latencies (ms)."""

    # aiohttp caps client connections at 100 by default, so configure the
    # TCPConnector limit explicitly for larger batches.
    connector = aiohttp.TCPConnector(limit=batch_size)
    async with aiohttp.ClientSession(
        connector=connector, raise_for_status=True
    ) as session:

        async def timed_get():
            begin = time.perf_counter()
            try:
                if stream:
                    async with session.get(url) as resp:
                        async for _chunk, _ in resp.content.iter_chunks():
                            pass
                else:
                    await session.get(url)
            except aiohttp.client_exceptions.ClientConnectionError:
                # Connection failures still count toward the measured latency.
                pass

            return 1000 * (time.perf_counter() - begin)

        return await asyncio.gather(*[timed_get() for _ in range(batch_size)])
121
+
122
+
123
async def do_single_grpc_batch(
    *, batch_size: int = 100, target: str = "localhost:9000"
):
    """Sends a batch of concurrent unary gRPC calls; returns e2e latencies (ms).

    FIX: the channel is now opened as an async context manager so it is closed
    when the batch completes (it previously leaked).
    """
    async with grpc.aio.insecure_channel(target) as channel:
        stub = serve_pb2_grpc.RayServeBenchmarkServiceStub(channel)
        payload = serve_pb2.StringData(data="")

        async def do_query():
            start = time.perf_counter()

            await stub.grpc_call(payload)

            end = time.perf_counter()
            return 1000 * (end - start)

        return await asyncio.gather(*[do_query() for _ in range(batch_size)])
139
+
140
+
141
async def collect_profile_events(coro: Coroutine):
    """Awaits ``coro`` under VizTracer and saves the profiling events."""

    from viztracer import VizTracer

    tracer = VizTracer()
    tracer.start()
    await coro
    tracer.stop()
    tracer.save()
153
+
154
+
155
def generate_payload(size: int = 100, chars=string.ascii_uppercase + string.digits):
    """Return a random string of ``size`` characters drawn from ``chars``."""
    selected = [random.choice(chars) for _ in range(size)]
    return "".join(selected)
157
+
158
+
159
class Blackhole:
    """Sink that silently discards whatever is passed to it."""

    def sink(self, o):
        """Discard ``o``; returns nothing."""
        return None
162
+
163
+
164
@serve.deployment
class Noop:
    """Deployment that immediately returns an empty byte string."""

    def __init__(self):
        # Quiet per-request serve logs while benchmarking.
        logging.getLogger("ray.serve").setLevel(logging.WARNING)

    def __call__(self, *args, **kwargs):
        return b""
171
+
172
+
173
@serve.deployment
class Streamer:
    """Deployment that streams a fixed number of tokens with a fixed delay."""

    def __init__(self, tokens_per_request: int, inter_token_delay_ms: int = 10):
        # Quiet per-request serve logs while benchmarking.
        logging.getLogger("ray.serve").setLevel(logging.WARNING)
        self._tokens_per_request = tokens_per_request
        self._inter_token_delay_s = inter_token_delay_ms / 1000

    async def stream(self):
        """Yield one token per configured delay interval."""
        for _token_idx in range(self._tokens_per_request):
            await asyncio.sleep(self._inter_token_delay_s)
            yield b"hi"

    async def __call__(self):
        return StreamingResponse(self.stream())
187
+
188
+
189
@serve.deployment
class IntermediateRouter:
    """Deployment that proxies a downstream streaming handle token-by-token."""

    def __init__(self, handle: DeploymentHandle):
        # Quiet per-request serve logs while benchmarking.
        logging.getLogger("ray.serve").setLevel(logging.WARNING)
        self._handle = handle.options(stream=True)

    async def stream(self):
        """Forward every token produced by the downstream stream."""
        async for token in self._handle.stream.remote():
            yield token

    def __call__(self):
        return StreamingResponse(self.stream())
201
+
202
+
203
@serve.deployment
class Benchmarker:
    """Driver deployment that benchmarks a downstream handle.

    Supports unary and streaming requests; latencies are in milliseconds.
    """

    def __init__(
        self,
        handle: DeploymentHandle,
        stream: bool = False,
    ):
        # Quiet per-request serve logs while benchmarking.
        logging.getLogger("ray.serve").setLevel(logging.WARNING)
        self._handle = handle.options(stream=stream)
        self._stream = stream

    async def do_single_request(self, payload: Any = None) -> float:
        """Completes a single unary request. Returns e2e latency in ms."""
        start = time.perf_counter()

        if payload is None:
            await self._handle.remote()
        else:
            await self._handle.remote(payload)

        return 1000 * (time.perf_counter() - start)

    async def _do_single_stream(self) -> float:
        """Consumes a single streaming request. Returns e2e latency in ms."""
        start = time.perf_counter()

        async for _ in self._handle.stream.remote():
            pass

        return 1000 * (time.perf_counter() - start)

    async def _do_single_batch(self, batch_size: int) -> List[float]:
        """Runs ``batch_size`` concurrent requests (streaming or unary)."""
        issue = self._do_single_stream if self._stream else self.do_single_request
        return await asyncio.gather(*[issue() for _ in range(batch_size)])

    async def run_latency_benchmark(
        self, *, num_requests: int, payload: Any = None
    ) -> pd.Series:
        """Runs the module-level latency benchmark against the handle."""

        async def issue_one():
            await self.do_single_request(payload)

        return await run_latency_benchmark(issue_one, num_requests=num_requests)

    async def run_throughput_benchmark(
        self,
        *,
        batch_size: int,
        num_trials: int,
        trial_runtime: float,
        tokens_per_request: Optional[float] = None,
    ) -> Tuple[float, float]:
        """Runs the module-level throughput benchmark against the handle.

        For streaming, throughput is measured in tokens (``tokens_per_request``
        must be provided); for unary, in requests.
        """
        if self._stream:
            assert tokens_per_request
            multiplier = tokens_per_request * batch_size
        else:
            multiplier = batch_size

        return await run_throughput_benchmark(
            fn=partial(self._do_single_batch, batch_size=batch_size),
            multiplier=multiplier,
            num_trials=num_trials,
            trial_runtime=trial_runtime,
        )
.venv/lib/python3.11/site-packages/ray/serve/_private/benchmarks/handle_noop_latency.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import time

import click
import pandas as pd

from ray import serve
from ray.serve._private.benchmarks.common import Benchmarker, Noop
from ray.serve.handle import DeploymentHandle


@click.command(help="Benchmark no-op DeploymentHandle latency.")
@click.option("--num-replicas", type=int, default=1)
@click.option("--num-requests", type=int, default=100)
def main(num_replicas: int, num_requests: int):
    """Deploy a no-op app behind a Benchmarker and print latency stats."""
    h: DeploymentHandle = serve.run(
        Benchmarker.bind(Noop.options(num_replicas=num_replicas).bind())
    )

    # BUG FIX: Benchmarker.run_latency_benchmark declares keyword-only
    # parameters, so `num_requests` must be passed by keyword; the previous
    # positional call raised TypeError.
    latencies: pd.Series = h.run_latency_benchmark.remote(
        num_requests=num_requests,
    ).result()

    # Let the logs flush to avoid interwoven output.
    time.sleep(1)

    print(
        "Latency (ms) for noop DeploymentHandle requests "
        f"(num_replicas={num_replicas},num_requests={num_requests}):"
    )
    print(latencies.describe(percentiles=[0.5, 0.9, 0.95, 0.99]))


if __name__ == "__main__":
    main()
.venv/lib/python3.11/site-packages/ray/serve/_private/benchmarks/handle_throughput.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import click

from ray import serve

# BUG FIX: `common` defines `Noop`, not `Hello`; importing `Hello` raised
# ImportError at startup.
from ray.serve._private.benchmarks.common import Benchmarker, Noop
from ray.serve.handle import DeploymentHandle


@click.command(help="Benchmark deployment handle throughput.")
@click.option(
    "--batch-size",
    type=int,
    default=100,
    help="Number of requests to send to downstream deployment in each trial.",
)
@click.option(
    "--num-replicas",
    type=int,
    default=1,
    help="Number of replicas in the downstream deployment.",
)
@click.option(
    "--num-trials",
    type=int,
    default=5,
    help="Number of trials of the benchmark to run.",
)
@click.option(
    "--trial-runtime",
    type=int,
    default=1,
    help="Duration to run each trial of the benchmark for (seconds).",
)
def main(
    batch_size: int,
    num_replicas: int,
    num_trials: int,
    trial_runtime: float,
):
    """Deploy a no-op app behind a Benchmarker and print throughput stats."""
    app = Benchmarker.bind(
        Noop.options(
            num_replicas=num_replicas, ray_actor_options={"num_cpus": 0}
        ).bind(),
    )
    h: DeploymentHandle = serve.run(app)

    # BUG FIX: run_throughput_benchmark returns (mean, stddev, latencies);
    # unpacking the result into only two names raised ValueError.
    mean, stddev, _latencies = h.run_throughput_benchmark.remote(
        batch_size=batch_size,
        num_trials=num_trials,
        trial_runtime=trial_runtime,
    ).result()

    print(
        "DeploymentHandle throughput {}: {} +- {} requests/s".format(
            f"(num_replicas={num_replicas}, batch_size={batch_size})",
            mean,
            stddev,
        )
    )


if __name__ == "__main__":
    main()
.venv/lib/python3.11/site-packages/ray/serve/_private/benchmarks/http_noop_latency.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+
3
+ import click
4
+ import pandas as pd
5
+ import requests
6
+
7
+ from ray import serve
8
+ from ray.serve._private.benchmarks.common import Noop, run_latency_benchmark
9
+
10
+
11
+ @click.command(help="Benchmark no-op HTTP latency.")
12
+ @click.option("--num-replicas", type=int, default=1)
13
+ @click.option("--num-requests", type=int, default=100)
14
+ def main(num_replicas: int, num_requests: int):
15
+ serve.run(Noop.options(num_replicas=num_replicas).bind())
16
+
17
+ latencies: pd.Series = asyncio.new_event_loop().run_until_complete(
18
+ run_latency_benchmark(
19
+ lambda: requests.get("http://localhost:8000"),
20
+ num_requests=num_requests,
21
+ )
22
+ )
23
+
24
+ print(
25
+ "Latency (ms) for noop HTTP requests "
26
+ f"(num_replicas={num_replicas},num_requests={num_requests}):"
27
+ )
28
+ print(latencies.describe(percentiles=[0.5, 0.9, 0.95, 0.99]))
29
+
30
+
31
+ if __name__ == "__main__":
32
+ main()
.venv/lib/python3.11/site-packages/ray/serve/_private/benchmarks/microbenchmark.py ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Runs several scenarios with varying max batch size, max concurrent queries,
2
+ # number of replicas, and with intermediate serve handles (to simulate ensemble
3
+ # models) either on or off.
4
+
5
+ import asyncio
6
+ import logging
7
+ from pprint import pprint
8
+ from typing import Dict, Union
9
+
10
+ import aiohttp
11
+ from starlette.requests import Request
12
+
13
+ import ray
14
+ from ray import serve
15
+ from ray.serve._private.benchmarks.common import run_throughput_benchmark
16
+ from ray.serve.handle import DeploymentHandle
17
+
18
+ NUM_CLIENTS = 8
19
+ CALLS_PER_BATCH = 100
20
+
21
+
22
+ async def fetch(session, data):
23
+ async with session.get("http://localhost:8000/", data=data) as response:
24
+ response = await response.text()
25
+ assert response == "ok", response
26
+
27
+
28
+ @ray.remote
29
+ class Client:
30
+ def ready(self):
31
+ return "ok"
32
+
33
+ async def do_queries(self, num, data):
34
+ async with aiohttp.ClientSession() as session:
35
+ for _ in range(num):
36
+ await fetch(session, data)
37
+
38
+
39
+ def build_app(
40
+ intermediate_handles: bool,
41
+ num_replicas: int,
42
+ max_batch_size: int,
43
+ max_ongoing_requests: int,
44
+ ):
45
+ @serve.deployment(max_ongoing_requests=1000)
46
+ class Upstream:
47
+ def __init__(self, handle: DeploymentHandle):
48
+ self._handle = handle
49
+
50
+ # Turn off access log.
51
+ logging.getLogger("ray.serve").setLevel(logging.WARNING)
52
+
53
+ async def __call__(self, req: Request):
54
+ return await self._handle.remote(await req.body())
55
+
56
+ @serve.deployment(
57
+ num_replicas=num_replicas,
58
+ max_ongoing_requests=max_ongoing_requests,
59
+ )
60
+ class Downstream:
61
+ def __init__(self):
62
+ # Turn off access log.
63
+ logging.getLogger("ray.serve").setLevel(logging.WARNING)
64
+
65
+ @serve.batch(max_batch_size=max_batch_size)
66
+ async def batch(self, reqs):
67
+ return [b"ok"] * len(reqs)
68
+
69
+ async def __call__(self, req: Union[bytes, Request]):
70
+ if max_batch_size > 1:
71
+ return await self.batch(req)
72
+ else:
73
+ return b"ok"
74
+
75
+ if intermediate_handles:
76
+ return Upstream.bind(Downstream.bind())
77
+ else:
78
+ return Downstream.bind()
79
+
80
+
81
+ async def trial(
82
+ intermediate_handles: bool,
83
+ num_replicas: int,
84
+ max_batch_size: int,
85
+ max_ongoing_requests: int,
86
+ data_size: str,
87
+ ) -> Dict[str, float]:
88
+ results = {}
89
+
90
+ trial_key_base = (
91
+ f"replica:{num_replicas}/batch_size:{max_batch_size}/"
92
+ f"concurrent_queries:{max_ongoing_requests}/"
93
+ f"data_size:{data_size}/intermediate_handle:{intermediate_handles}"
94
+ )
95
+
96
+ print(
97
+ f"intermediate_handles={intermediate_handles},"
98
+ f"num_replicas={num_replicas},"
99
+ f"max_batch_size={max_batch_size},"
100
+ f"max_ongoing_requests={max_ongoing_requests},"
101
+ f"data_size={data_size}"
102
+ )
103
+
104
+ app = build_app(
105
+ intermediate_handles, num_replicas, max_batch_size, max_ongoing_requests
106
+ )
107
+ serve.run(app)
108
+
109
+ if data_size == "small":
110
+ data = None
111
+ elif data_size == "large":
112
+ data = b"a" * 1024 * 1024
113
+ else:
114
+ raise ValueError("data_size should be 'small' or 'large'.")
115
+
116
+ async with aiohttp.ClientSession() as session:
117
+
118
+ async def single_client():
119
+ for _ in range(CALLS_PER_BATCH):
120
+ await fetch(session, data)
121
+
122
+ single_client_avg_tps, single_client_std_tps = await run_throughput_benchmark(
123
+ single_client,
124
+ multiplier=CALLS_PER_BATCH,
125
+ )
126
+ print(
127
+ "\t{} {} +- {} requests/s".format(
128
+ "single client {} data".format(data_size),
129
+ single_client_avg_tps,
130
+ single_client_std_tps,
131
+ )
132
+ )
133
+ key = f"num_client:1/{trial_key_base}"
134
+ results[key] = single_client_avg_tps
135
+
136
+ clients = [Client.remote() for _ in range(NUM_CLIENTS)]
137
+ ray.get([client.ready.remote() for client in clients])
138
+
139
+ async def many_clients():
140
+ ray.get([a.do_queries.remote(CALLS_PER_BATCH, data) for a in clients])
141
+
142
+ multi_client_avg_tps, _ = await run_throughput_benchmark(
143
+ many_clients,
144
+ multiplier=CALLS_PER_BATCH * len(clients),
145
+ )
146
+
147
+ results[f"num_client:{len(clients)}/{trial_key_base}"] = multi_client_avg_tps
148
+ return results
149
+
150
+
151
+ async def main():
152
+ results = {}
153
+ for intermediate_handles in [False, True]:
154
+ for num_replicas in [1, 8]:
155
+ for max_batch_size, max_ongoing_requests in [
156
+ (1, 1),
157
+ (1, 10000),
158
+ (10000, 10000),
159
+ ]:
160
+ # TODO(edoakes): large data causes broken pipe errors.
161
+ for data_size in ["small"]:
162
+ results.update(
163
+ await trial(
164
+ intermediate_handles,
165
+ num_replicas,
166
+ max_batch_size,
167
+ max_ongoing_requests,
168
+ data_size,
169
+ )
170
+ )
171
+
172
+ print("Results from all conditions:")
173
+ pprint(results)
174
+ return results
175
+
176
+
177
+ if __name__ == "__main__":
178
+ ray.init()
179
+ serve.start()
180
+ loop = asyncio.new_event_loop()
181
+ asyncio.set_event_loop(loop)
182
+ loop.run_until_complete(main())
.venv/lib/python3.11/site-packages/ray/serve/_private/benchmarks/proxy_benchmark.py ADDED
@@ -0,0 +1,294 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Runs some request ping to compare HTTP and gRPC performances in TPS and latency.
2
+ # Note: this takes around 1 hour to run.
3
+
4
+ import asyncio
5
+ import json
6
+ import logging
7
+ import time
8
+ from random import random
9
+ from typing import Callable, Dict
10
+
11
+ import aiohttp
12
+ import numpy as np
13
+ import pandas as pd
14
+ from grpc import aio
15
+ from starlette.requests import Request
16
+
17
+ import ray
18
+ from ray import serve
19
+ from ray.serve._private.common import RequestProtocol
20
+ from ray.serve.config import gRPCOptions
21
+ from ray.serve.generated import serve_pb2, serve_pb2_grpc
22
+ from ray.serve.handle import DeploymentHandle
23
+
24
+ CALLS_PER_BATCH = 100
25
+ DELTA = 10**-7
26
+
27
+
28
+ async def get_query_tps(name: str, fn: Callable, multiplier: int = CALLS_PER_BATCH):
29
+ """Get query TPS.
30
+
31
+ Run the function for 0.5 seconds 10 times to calculate how many requests can
32
+ be completed. And use those stats to calculate the mean and std of TPS.
33
+ """
34
+ # warmup
35
+ start = time.time()
36
+ while time.time() - start < 0.1:
37
+ await fn()
38
+ # real run
39
+ stats = []
40
+ for _ in range(10):
41
+ count = 0
42
+ start = time.time()
43
+ while time.time() - start < 0.5:
44
+ await fn()
45
+ count += 1
46
+ end = time.time()
47
+ stats.append(multiplier * count / (end - start))
48
+ tps_mean = round(np.mean(stats), 2)
49
+ tps_std = round(np.std(stats), 2)
50
+ print(f"\t{name} {tps_mean} +- {tps_std} requests/s")
51
+
52
+ return tps_mean, tps_std
53
+
54
+
55
+ async def get_query_latencies(name: str, fn: Callable):
56
+ """Get query latencies.
57
+
58
+ Take all the latencies from the function and calculate the mean and std.
59
+ """
60
+ many_client_results = np.asarray(await fn())
61
+ many_client_results.flatten()
62
+ latency_ms_mean = round(np.mean(many_client_results) * 1000, 2)
63
+ latency_ms_std = round(np.std(many_client_results) * 1000, 2)
64
+ print(f"\t{name} {latency_ms_mean} +- {latency_ms_std} ms")
65
+
66
+ return latency_ms_mean, latency_ms_std
67
+
68
+
69
+ async def fetch_http(session, data):
70
+ data_json = {"nums": data}
71
+ response = await session.get("http://localhost:8000/", json=data_json)
72
+ response_text = await response.read()
73
+ float(response_text.decode())
74
+
75
+
76
+ async def fetch_grpc(stub, data):
77
+ result = await stub.grpc_call(serve_pb2.RawData(nums=data))
78
+ result.output
79
+
80
+
81
+ @ray.remote
82
+ class HTTPClient:
83
+ def ready(self):
84
+ return "ok"
85
+
86
+ async def do_queries(self, num, data):
87
+ async with aiohttp.ClientSession() as session:
88
+ for _ in range(num):
89
+ await fetch_http(session, data)
90
+
91
+ async def time_queries(self, num, data):
92
+ stats = []
93
+ async with aiohttp.ClientSession() as session:
94
+ for _ in range(num):
95
+ start = time.time()
96
+ await fetch_http(session, data)
97
+ end = time.time()
98
+ stats.append(end - start)
99
+
100
+ return stats
101
+
102
+
103
+ @ray.remote
104
+ class gRPCClient:
105
+ def __init__(self):
106
+ channel = aio.insecure_channel("localhost:9000")
107
+ self.stub = serve_pb2_grpc.RayServeBenchmarkServiceStub(channel)
108
+
109
+ def ready(self):
110
+ return "ok"
111
+
112
+ async def do_queries(self, num, data):
113
+ for _ in range(num):
114
+ await fetch_grpc(self.stub, data)
115
+
116
+ async def time_queries(self, num, data):
117
+ stats = []
118
+ for _ in range(num):
119
+ start = time.time()
120
+ await fetch_grpc(self.stub, data)
121
+ end = time.time()
122
+ stats.append(end - start)
123
+ return stats
124
+
125
+
126
+ def build_app(
127
+ num_replicas: int,
128
+ max_ongoing_requests: int,
129
+ data_size: int,
130
+ ):
131
+ @serve.deployment(max_ongoing_requests=1000)
132
+ class DataPreprocessing:
133
+ def __init__(self, handle: DeploymentHandle):
134
+ self._handle = handle
135
+
136
+ # Turn off access log.
137
+ logging.getLogger("ray.serve").setLevel(logging.WARNING)
138
+
139
+ def normalize(self, raw: np.ndarray) -> np.ndarray:
140
+ return (raw - np.min(raw)) / (np.max(raw) - np.min(raw) + DELTA)
141
+
142
+ async def __call__(self, req: Request):
143
+ """HTTP entrypoint.
144
+
145
+ It parses the request, normalize the data, and send to model for inference.
146
+ """
147
+ body = json.loads(await req.body())
148
+ raw = np.asarray(body["nums"])
149
+ processed = self.normalize(raw)
150
+ return await self._handle.remote(processed)
151
+
152
+ async def grpc_call(self, raq_data):
153
+ """gRPC entrypoint.
154
+
155
+ It parses the request, normalize the data, and send to model for inference.
156
+ """
157
+ raw = np.asarray(raq_data.nums)
158
+ processed = self.normalize(raw)
159
+ output = await self._handle.remote(processed)
160
+ return serve_pb2.ModelOutput(output=output)
161
+
162
+ async def call_with_string(self, raq_data):
163
+ """gRPC entrypoint."""
164
+ return serve_pb2.ModelOutput(output=0)
165
+
166
+ @serve.deployment(
167
+ num_replicas=num_replicas,
168
+ max_ongoing_requests=max_ongoing_requests,
169
+ )
170
+ class ModelInference:
171
+ def __init__(self):
172
+ # Turn off access log.
173
+ logging.getLogger("ray.serve").setLevel(logging.WARNING)
174
+ self._model = np.random.randn(data_size, data_size)
175
+
176
+ async def __call__(self, processed: np.ndarray) -> float:
177
+ # Run a dot product with a random matrix to simulate a model inference.
178
+ model_output = np.dot(processed, self._model)
179
+ return sum(model_output)
180
+
181
+ return DataPreprocessing.bind(ModelInference.bind())
182
+
183
+
184
+ async def trial(
185
+ num_replicas: int,
186
+ max_ongoing_requests: int,
187
+ data_size: int,
188
+ num_clients: int,
189
+ proxy: RequestProtocol,
190
+ ) -> Dict[str, float]:
191
+ # Generate input data as array of random floats.
192
+ data = [random() for _ in range(data_size)]
193
+
194
+ # Build and deploy the app.
195
+ app = build_app(
196
+ num_replicas=num_replicas,
197
+ max_ongoing_requests=max_ongoing_requests,
198
+ data_size=data_size,
199
+ )
200
+ serve.run(app)
201
+
202
+ # Start clients.
203
+ if proxy == RequestProtocol.GRPC:
204
+ clients = [gRPCClient.remote() for _ in range(num_clients)]
205
+ elif proxy == RequestProtocol.HTTP:
206
+ clients = [HTTPClient.remote() for _ in range(num_clients)]
207
+ ray.get([client.ready.remote() for client in clients])
208
+
209
+ async def client_time_queries():
210
+ return ray.get([a.time_queries.remote(CALLS_PER_BATCH, data) for a in clients])
211
+
212
+ async def client_do_queries():
213
+ ray.get([a.do_queries.remote(CALLS_PER_BATCH, data) for a in clients])
214
+
215
+ trial_key_base = (
216
+ f"proxy:{proxy}/"
217
+ f"num_client:{num_clients}/"
218
+ f"replica:{num_replicas}/"
219
+ f"concurrent_queries:{max_ongoing_requests}/"
220
+ f"data_size:{data_size}"
221
+ )
222
+ tps_mean, tps_sdt = await get_query_tps(
223
+ trial_key_base,
224
+ client_do_queries,
225
+ )
226
+ latency_ms_mean, latency_ms_std = await get_query_latencies(
227
+ trial_key_base,
228
+ client_time_queries,
229
+ )
230
+
231
+ results = {
232
+ "proxy": proxy.value,
233
+ "num_client": num_clients,
234
+ "replica": num_replicas,
235
+ "concurrent_queries": max_ongoing_requests,
236
+ "data_size": data_size,
237
+ "tps_mean": tps_mean,
238
+ "tps_sdt": tps_sdt,
239
+ "latency_ms_mean": latency_ms_mean,
240
+ "latency_ms_std": latency_ms_std,
241
+ }
242
+
243
+ return results
244
+
245
+
246
+ async def main():
247
+ start_time = time.time()
248
+ results = []
249
+ for num_replicas in [1, 8]:
250
+ for max_ongoing_requests in [1, 10_000]:
251
+ for data_size in [1, 100, 10_000]:
252
+ for num_clients in [1, 8]:
253
+ for proxy in [RequestProtocol.GRPC, RequestProtocol.HTTP]:
254
+ results.append(
255
+ await trial(
256
+ num_replicas=num_replicas,
257
+ max_ongoing_requests=max_ongoing_requests,
258
+ data_size=data_size,
259
+ num_clients=num_clients,
260
+ proxy=proxy,
261
+ )
262
+ )
263
+
264
+ print(f"Total time: {time.time() - start_time}s")
265
+ print("results", results)
266
+
267
+ df = pd.DataFrame.from_dict(results)
268
+ df = df.sort_values(
269
+ by=["proxy", "num_client", "replica", "concurrent_queries", "data_size"]
270
+ )
271
+ print("Results from all conditions:")
272
+ # Print the results in with tab separated so we can copy into google sheets.
273
+ for i in range(len(df.index)):
274
+ row = list(df.iloc[i])
275
+ print("\t".join(map(str, row)))
276
+
277
+
278
+ if __name__ == "__main__":
279
+ ray.init()
280
+
281
+ grpc_port = 9000
282
+ grpc_servicer_functions = [
283
+ "ray.serve.generated.serve_pb2_grpc."
284
+ "add_RayServeBenchmarkServiceServicer_to_server",
285
+ ]
286
+ serve.start(
287
+ grpc_options=gRPCOptions(
288
+ port=grpc_port,
289
+ grpc_servicer_functions=grpc_servicer_functions,
290
+ )
291
+ )
292
+ loop = asyncio.new_event_loop()
293
+ asyncio.set_event_loop(loop)
294
+ loop.run_until_complete(main())
.venv/lib/python3.11/site-packages/ray/serve/_private/benchmarks/serialization/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/ray/serve/_private/benchmarks/serialization/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (216 Bytes). View file
 
.venv/lib/python3.11/site-packages/ray/serve/_private/benchmarks/serialization/__pycache__/common.cpython-311.pyc ADDED
Binary file (1.53 kB). View file
 
.venv/lib/python3.11/site-packages/ray/serve/_private/benchmarks/serialization/__pycache__/serialization_benchmark.cpython-311.pyc ADDED
Binary file (7.37 kB). View file
 
.venv/lib/python3.11/site-packages/ray/serve/_private/benchmarks/serialization/common.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ from typing import List, Optional
3
+
4
+ from pydantic import BaseModel
5
+
6
+ #
7
+ # NOTE: PLEASE READ CAREFULLY BEFORE CHANGING
8
+ #
9
+ # Payloads in this module are purposefully extracted from benchmark file to force
10
+ # Ray's cloudpickle behavior when it does NOT serialize the class definition itself
11
+ # along with its payload (instead relying on it being imported)
12
+ #
13
+
14
+
15
+ class PayloadPydantic(BaseModel):
16
+ text: Optional[str] = None
17
+ floats: Optional[List[float]] = None
18
+ ints: Optional[List[int]] = None
19
+ ts: Optional[float] = None
20
+ reason: Optional[str] = None
21
+
22
+
23
+ @dataclass
24
+ class PayloadDataclass:
25
+ text: Optional[str] = None
26
+ floats: Optional[List[float]] = None
27
+ ints: Optional[List[int]] = None
28
+ ts: Optional[float] = None
29
+ reason: Optional[str] = None
.venv/lib/python3.11/site-packages/ray/serve/_private/benchmarks/serialization/serialization_benchmark.py ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import enum
3
+ import pickle
4
+ import time
5
+ from typing import Any, Callable
6
+
7
+ import click
8
+ import msgpack
9
+
10
+ from ray._private.serialization import SerializationContext
11
+ from ray.cloudpickle import cloudpickle_fast
12
+ from ray.serve._private.benchmarks.common import (
13
+ collect_profile_events,
14
+ run_latency_benchmark,
15
+ )
16
+ from ray.serve._private.benchmarks.serialization.common import (
17
+ PayloadDataclass,
18
+ PayloadPydantic,
19
+ )
20
+
21
+
22
+ class PayloadType(enum.Enum):
23
+ PYDANTIC = "pydantic"
24
+ DATACLASS = "dataclass"
25
+
26
+
27
+ class SerializerType(enum.Enum):
28
+ RAY = "ray"
29
+ PICKLE = "pickle"
30
+ CLOUDPICKLE = "cloudpickle"
31
+ MSGPACK = "msgpack"
32
+
33
+
34
+ _PERCENTILES = [0.5, 0.99]
35
+
36
+
37
+ sc = SerializationContext(None)
38
+
39
+
40
+ def _create_model(cls):
41
+ return cls(
42
+ text="Test output",
43
+ floats=[float(f) for f in range(1, 100)],
44
+ ints=list(range(1, 100)),
45
+ ts=time.time(),
46
+ reason="Success!",
47
+ )
48
+
49
+
50
+ def _blackhole(o):
51
+ """Placeholder to be used in the benchmark to make sure runtime
52
+ doesn't optimize out unused results"""
53
+ pass
54
+
55
+
56
+ async def run_serializer_benchmark(
57
+ model, serializer: Callable[[Any], bytes], iterations: int
58
+ ):
59
+ def _serde_loop():
60
+ bs = serializer(model)
61
+ _blackhole(bs)
62
+
63
+ pd = await run_latency_benchmark(_serde_loop, iterations)
64
+
65
+ print("Latencies (ms):\n", pd.describe(percentiles=_PERCENTILES))
66
+
67
+
68
+ @click.command(help="Benchmark serialization latency")
69
+ @click.option(
70
+ "--trials",
71
+ type=int,
72
+ default=1000,
73
+ help="Total number of trials to run in a single benchmark run",
74
+ )
75
+ @click.option(
76
+ "--batch-size",
77
+ type=int,
78
+ default=10,
79
+ help="Controls how many objects are contained in a serialized batch",
80
+ )
81
+ @click.option(
82
+ "--payload-type",
83
+ type=PayloadType,
84
+ help="Target type of the payload to be benchmarked (supported: pydantic, "
85
+ "dataclass)",
86
+ )
87
+ @click.option(
88
+ "--serializer",
89
+ type=SerializerType,
90
+ help="Target type of the serializer to be benchmarked (supported: ray, pickle, "
91
+ "cloudpickle, msgpack)",
92
+ )
93
+ @click.option(
94
+ "--profile-events",
95
+ type=bool,
96
+ default=False,
97
+ )
98
+ def main(
99
+ trials: int,
100
+ batch_size: int,
101
+ payload_type: PayloadType,
102
+ serializer: SerializerType,
103
+ profile_events: bool,
104
+ ):
105
+ if serializer == SerializerType.RAY:
106
+
107
+ def _serialize(obj):
108
+ so = sc.serialize(obj)
109
+ bs = so.to_bytes()
110
+ return bs
111
+
112
+ elif serializer == SerializerType.CLOUDPICKLE:
113
+
114
+ def _serialize(obj):
115
+ bs = cloudpickle_fast.dumps(obj)
116
+ return bs
117
+
118
+ elif serializer == SerializerType.PICKLE:
119
+
120
+ def _serialize(obj):
121
+ bs = pickle.dumps(obj)
122
+ return bs
123
+
124
+ elif serializer == SerializerType.MSGPACK:
125
+
126
+ def _dumps(obj):
127
+ bs = msgpack.dumps(obj.__dict__)
128
+ # print(f"Bytes ({len(bs)}): ", bs)
129
+ return bs
130
+
131
+ def _loads(bs):
132
+ dict = msgpack.loads(bs)
133
+ return PayloadPydantic(**dict)
134
+
135
+ sc._register_cloudpickle_serializer(PayloadPydantic, _dumps, _loads)
136
+
137
+ def _serialize(obj):
138
+ so = sc.serialize(obj)
139
+ bs = so.to_bytes()
140
+ return bs
141
+
142
+ else:
143
+ raise NotImplementedError(serializer)
144
+
145
+ if payload_type == PayloadType.PYDANTIC:
146
+ model = _create_model(PayloadPydantic)
147
+ elif payload_type == PayloadType.DATACLASS:
148
+ model = _create_model(PayloadDataclass)
149
+ else:
150
+ raise NotImplementedError(f"Not supported ({payload_type})")
151
+
152
+ payload = [model.copy(deep=True) for _ in range(batch_size)]
153
+
154
+ routine = run_serializer_benchmark(payload, _serialize, trials)
155
+
156
+ if profile_events:
157
+ routine = collect_profile_events(routine)
158
+
159
+ asyncio.run(routine)
160
+
161
+
162
+ if __name__ == "__main__":
163
+ main()
.venv/lib/python3.11/site-packages/ray/serve/_private/benchmarks/streaming/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/ray/serve/_private/benchmarks/streaming/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (212 Bytes). View file
 
.venv/lib/python3.11/site-packages/ray/serve/_private/benchmarks/streaming/__pycache__/common.cpython-311.pyc ADDED
Binary file (7.2 kB). View file
 
.venv/lib/python3.11/site-packages/ray/serve/_private/benchmarks/streaming/__pycache__/streaming_core_throughput.cpython-311.pyc ADDED
Binary file (4.08 kB). View file
 
.venv/lib/python3.11/site-packages/ray/serve/_private/benchmarks/streaming/__pycache__/streaming_grpc_throughput.cpython-311.pyc ADDED
Binary file (9.07 kB). View file
 
.venv/lib/python3.11/site-packages/ray/serve/_private/benchmarks/streaming/__pycache__/streaming_handle_throughput.cpython-311.pyc ADDED
Binary file (4.19 kB). View file
 
.venv/lib/python3.11/site-packages/ray/serve/_private/benchmarks/streaming/__pycache__/streaming_http_throughput.cpython-311.pyc ADDED
Binary file (7.7 kB). View file