koichi12 commited on
Commit
747c195
·
verified ·
1 Parent(s): 697a7f6

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .venv/lib/python3.11/site-packages/ray/rllib/evaluation/__pycache__/observation_function.cpython-311.pyc +0 -0
  2. .venv/lib/python3.11/site-packages/ray/rllib/evaluation/__pycache__/sample_batch_builder.cpython-311.pyc +0 -0
  3. .venv/lib/python3.11/site-packages/ray/rllib/evaluation/__pycache__/worker_set.cpython-311.pyc +0 -0
  4. .venv/lib/python3.11/site-packages/ray/rllib/evaluation/collectors/__init__.py +0 -0
  5. .venv/lib/python3.11/site-packages/ray/rllib/evaluation/collectors/__pycache__/__init__.cpython-311.pyc +0 -0
  6. .venv/lib/python3.11/site-packages/ray/rllib/evaluation/collectors/__pycache__/agent_collector.cpython-311.pyc +0 -0
  7. .venv/lib/python3.11/site-packages/ray/rllib/evaluation/collectors/__pycache__/sample_collector.cpython-311.pyc +0 -0
  8. .venv/lib/python3.11/site-packages/ray/rllib/evaluation/collectors/__pycache__/simple_list_collector.cpython-311.pyc +0 -0
  9. .venv/lib/python3.11/site-packages/ray/rllib/evaluation/collectors/agent_collector.py +688 -0
  10. .venv/lib/python3.11/site-packages/ray/rllib/evaluation/collectors/sample_collector.py +298 -0
  11. .venv/lib/python3.11/site-packages/ray/rllib/models/torch/__pycache__/complex_input_net.cpython-311.pyc +0 -0
  12. .venv/lib/python3.11/site-packages/ray/rllib/models/torch/__pycache__/torch_modelv2.cpython-311.pyc +0 -0
  13. .venv/lib/python3.11/site-packages/ray/rllib/models/torch/modules/__init__.py +13 -0
  14. .venv/lib/python3.11/site-packages/ray/rllib/models/torch/modules/__pycache__/__init__.cpython-311.pyc +0 -0
  15. .venv/lib/python3.11/site-packages/ray/rllib/models/torch/modules/__pycache__/gru_gate.cpython-311.pyc +0 -0
  16. .venv/lib/python3.11/site-packages/ray/rllib/models/torch/modules/__pycache__/multi_head_attention.cpython-311.pyc +0 -0
  17. .venv/lib/python3.11/site-packages/ray/rllib/models/torch/modules/__pycache__/noisy_layer.cpython-311.pyc +0 -0
  18. .venv/lib/python3.11/site-packages/ray/rllib/models/torch/modules/__pycache__/relative_multi_head_attention.cpython-311.pyc +0 -0
  19. .venv/lib/python3.11/site-packages/ray/rllib/models/torch/modules/__pycache__/skip_connection.cpython-311.pyc +0 -0
  20. .venv/lib/python3.11/site-packages/ray/rllib/models/torch/modules/skip_connection.py +43 -0
  21. .venv/lib/python3.11/site-packages/ray/rllib/offline/__pycache__/__init__.cpython-311.pyc +0 -0
  22. .venv/lib/python3.11/site-packages/ray/rllib/offline/__pycache__/d4rl_reader.cpython-311.pyc +0 -0
  23. .venv/lib/python3.11/site-packages/ray/rllib/offline/__pycache__/input_reader.cpython-311.pyc +0 -0
  24. .venv/lib/python3.11/site-packages/ray/rllib/offline/__pycache__/resource.cpython-311.pyc +0 -0
  25. .venv/lib/python3.11/site-packages/ray/rllib/offline/__pycache__/shuffled_input.cpython-311.pyc +0 -0
  26. .venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/__init__.py +15 -0
  27. .venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/__pycache__/doubly_robust.cpython-311.pyc +0 -0
  28. .venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/__pycache__/feature_importance.cpython-311.pyc +0 -0
  29. .venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/__pycache__/importance_sampling.cpython-311.pyc +0 -0
  30. .venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/__pycache__/weighted_importance_sampling.cpython-311.pyc +0 -0
  31. .venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/direct_method.py +180 -0
  32. .venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/doubly_robust.py +253 -0
  33. .venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/fqe_torch_model.py +297 -0
  34. .venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/importance_sampling.py +126 -0
  35. .venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/off_policy_estimator.py +248 -0
  36. .venv/lib/python3.11/site-packages/ray/rllib/utils/__init__.py +141 -0
  37. .venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/checkpoints.cpython-311.pyc +0 -0
  38. .venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/compression.cpython-311.pyc +0 -0
  39. .venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/deprecation.cpython-311.pyc +0 -0
  40. .venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/from_config.cpython-311.pyc +0 -0
  41. .venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/lambda_defaultdict.cpython-311.pyc +0 -0
  42. .venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/memory.cpython-311.pyc +0 -0
  43. .venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/serialization.cpython-311.pyc +0 -0
  44. .venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/torch_utils.cpython-311.pyc +0 -0
  45. .venv/lib/python3.11/site-packages/ray/rllib/utils/actors.py +258 -0
  46. .venv/lib/python3.11/site-packages/ray/rllib/utils/annotations.py +213 -0
  47. .venv/lib/python3.11/site-packages/ray/rllib/utils/checkpoints.py +1045 -0
  48. .venv/lib/python3.11/site-packages/ray/rllib/utils/deprecation.py +134 -0
  49. .venv/lib/python3.11/site-packages/ray/rllib/utils/error.py +128 -0
  50. .venv/lib/python3.11/site-packages/ray/rllib/utils/filter_manager.py +82 -0
.venv/lib/python3.11/site-packages/ray/rllib/evaluation/__pycache__/observation_function.cpython-311.pyc ADDED
Binary file (3.98 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/evaluation/__pycache__/sample_batch_builder.cpython-311.pyc ADDED
Binary file (14.1 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/evaluation/__pycache__/worker_set.cpython-311.pyc ADDED
Binary file (713 Bytes). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/evaluation/collectors/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/ray/rllib/evaluation/collectors/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (204 Bytes). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/evaluation/collectors/__pycache__/agent_collector.cpython-311.pyc ADDED
Binary file (27.4 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/evaluation/collectors/__pycache__/sample_collector.cpython-311.pyc ADDED
Binary file (14 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/evaluation/collectors/__pycache__/simple_list_collector.cpython-311.pyc ADDED
Binary file (28.7 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/evaluation/collectors/agent_collector.py ADDED
@@ -0,0 +1,688 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ import logging
3
+ import math
4
+ from typing import Any, Dict, List, Optional
5
+
6
+ import numpy as np
7
+ import tree # pip install dm_tree
8
+ from gymnasium.spaces import Space
9
+
10
+ from ray.rllib.policy.sample_batch import SampleBatch
11
+ from ray.rllib.policy.view_requirement import ViewRequirement
12
+ from ray.rllib.utils.annotations import OldAPIStack
13
+ from ray.rllib.utils.framework import try_import_torch
14
+ from ray.rllib.utils.spaces.space_utils import (
15
+ flatten_to_single_ndarray,
16
+ get_dummy_batch_for_space,
17
+ )
18
+ from ray.rllib.utils.typing import (
19
+ EpisodeID,
20
+ EnvID,
21
+ TensorType,
22
+ ViewRequirementsDict,
23
+ )
24
+
25
+ logger = logging.getLogger(__name__)
26
+
27
+ torch, _ = try_import_torch()
28
+
29
+
30
+ def _to_float_np_array(v: List[Any]) -> np.ndarray:
31
+ if torch and torch.is_tensor(v[0]):
32
+ raise ValueError
33
+ arr = np.array(v)
34
+ if arr.dtype == np.float64:
35
+ return arr.astype(np.float32) # save some memory
36
+ return arr
37
+
38
+
39
+ def _get_buffered_slice_with_paddings(d, inds):
40
+ element_at_t = []
41
+ for index in inds:
42
+ if index < len(d):
43
+ element_at_t.append(d[index])
44
+ else:
45
+ # zero pad similar to the last element.
46
+ element_at_t.append(tree.map_structure(np.zeros_like, d[-1]))
47
+ return element_at_t
48
+
49
+
50
@OldAPIStack
class AgentCollector:
    """Collects samples for one agent in one trajectory (episode).

    The agent may be part of a multi-agent environment. Samples are stored in
    lists including some possible automatic "shift" buffer at the beginning to
    be able to save memory when storing things like NEXT_OBS, PREV_REWARDS,
    etc.., which are specified using the trajectory view API.
    """

    _next_unroll_id = 0  # disambiguates unrolls within a single episode

    # TODO: @kourosh add different types of padding. e.g. zeros vs. same
    def __init__(
        self,
        view_reqs: ViewRequirementsDict,
        *,
        max_seq_len: int = 1,
        disable_action_flattening: bool = True,
        # NOTE: kwarg name is misspelled ("intial") but kept for backward
        # compatibility with existing callers using the keyword.
        intial_states: Optional[List[TensorType]] = None,
        is_policy_recurrent: bool = False,
        is_training: bool = True,
        _enable_new_api_stack: bool = False,
    ):
        """Initialize an AgentCollector.

        Args:
            view_reqs: A dict of view requirements for the agent.
            max_seq_len: The maximum sequence length to store.
            disable_action_flattening: If True, don't flatten the action.
            intial_states: The initial states from the policy.get_initial_states()
            is_policy_recurrent: If True, the policy is recurrent.
            is_training: Sets the is_training flag for the buffers. if True, all the
                timesteps are stored in the buffers until explicitly
                build_for_training() is called. if False, only the content required
                for the last time step is stored in the buffers. This will save
                memory during inference.
                You can change the behavior at runtime by calling is_training(mode).
        """
        self.max_seq_len = max_seq_len
        self.disable_action_flattening = disable_action_flattening
        self.view_requirements = view_reqs
        # The initial_states can be an np array
        self.initial_states = intial_states if intial_states is not None else []
        self.is_policy_recurrent = is_policy_recurrent
        self._is_training = is_training
        self._enable_new_api_stack = _enable_new_api_stack

        # Determine the size of the buffer we need for data before the actual
        # episode starts. This is used for 0-buffering of e.g. prev-actions,
        # or internal state inputs.
        view_req_shifts = [
            min(vr.shift_arr)
            - int((vr.data_col or k) in [SampleBatch.OBS, SampleBatch.INFOS])
            for k, vr in view_reqs.items()
        ]
        self.shift_before = -min(view_req_shifts)

        # The actual data buffers. Keys are column names, values are lists
        # that contain the sub-components (e.g. for complex obs spaces) with
        # each sub-component holding a list of per-timestep tensors.
        # E.g.: obs-space = Dict(a=Discrete(2), b=Box((2,)))
        # buffers["obs"] = [
        #    [0, 1],  # <- 1st sub-component of observation
        #    [np.array([.2, .3]), np.array([.0, -.2])]  # <- 2nd sub-component
        # ]
        # NOTE: infos and state_out... are not flattened due to them often
        # using custom dict values whose structure may vary from timestep to
        # timestep.
        self.buffers: Dict[str, List[List[TensorType]]] = {}
        # Maps column names to an example data item, which may be deeply
        # nested. These are used such that we'll know how to unflatten
        # the flattened data inside self.buffers when building the
        # SampleBatch.
        self.buffer_structs: Dict[str, Any] = {}
        # The episode ID for the agent for which we collect data.
        self.episode_id = None
        # The unroll ID, unique across all rollouts (within a RolloutWorker).
        self.unroll_id = None
        # The simple timestep count for this agent. Gets increased by one
        # each time a (non-initial!) observation is added.
        self.agent_steps = 0
        # Keep track of view requirements that have a view on columns that we gain
        # from inference and also need for inference. These have dummy values
        # appended in buffers to account for the missing value when building for
        # inference.
        # Example: We have one 'state_in' view requirement that has a view on our
        # state_outs at t=[-10, ..., -1]. At any given build_for_inference()-call,
        # the buffer must contain eleven values from t=[-10, ..., 0] for us to index
        # properly. Since state_out at t=0 is missing, we substitute it with a
        # buffer value that should never make it into batches built for training.
        self.data_cols_with_dummy_values = set()

    @property
    def training(self) -> bool:
        """Whether this collector keeps the full trajectory (training mode)."""
        return self._is_training

    def is_training(self, is_training: bool) -> None:
        """Sets the training mode flag at runtime."""
        self._is_training = is_training

    def is_empty(self) -> bool:
        """Returns True if this collector has no data."""
        return not self.buffers or all(len(item) == 0 for item in self.buffers.values())

    def add_init_obs(
        self,
        episode_id: EpisodeID,
        agent_index: int,
        env_id: EnvID,
        init_obs: TensorType,
        init_infos: Optional[Dict[str, TensorType]] = None,
        t: int = -1,
    ) -> None:
        """Adds an initial observation (after reset) to the Agent's trajectory.

        Args:
            episode_id: Unique ID for the episode we are adding the
                initial observation for.
            agent_index: Unique int index (starting from 0) for the agent
                within its episode. Not to be confused with AGENT_ID (Any).
            env_id: The environment index (in a vectorized setup).
            init_obs: The initial observation tensor (after `env.reset()`).
            init_infos: The initial infos dict (after `env.reset()`).
            t: The time step (episode length - 1). The initial obs has
                ts=-1(!), then an action/reward/next-obs at t=0, etc..
        """
        # Store episode ID + unroll ID, which will be constant throughout this
        # AgentCollector's lifecycle.
        self.episode_id = episode_id
        if self.unroll_id is None:
            self.unroll_id = AgentCollector._next_unroll_id
            AgentCollector._next_unroll_id += 1

        # convert init_obs to np.array (in case it is a list)
        if isinstance(init_obs, list):
            init_obs = np.array(init_obs)

        if SampleBatch.OBS not in self.buffers:
            single_row = {
                SampleBatch.OBS: init_obs,
                SampleBatch.INFOS: init_infos or {},
                SampleBatch.AGENT_INDEX: agent_index,
                SampleBatch.ENV_ID: env_id,
                SampleBatch.T: t,
                SampleBatch.EPS_ID: self.episode_id,
                SampleBatch.UNROLL_ID: self.unroll_id,
            }

            # TODO (Artur): Remove when PREV_ACTIONS and PREV_REWARDS get deprecated.
            # Note (Artur): As long as we have these in our default view requirements,
            # we should build buffers with neutral elements instead of building them
            # on the first AgentCollector.build_for_inference call if present.
            # This prevents us from accidentally building buffers with duplicates of
            # the first incoming value.
            if SampleBatch.PREV_REWARDS in self.view_requirements:
                single_row[SampleBatch.REWARDS] = get_dummy_batch_for_space(
                    space=self.view_requirements[SampleBatch.REWARDS].space,
                    batch_size=0,
                    fill_value=0.0,
                )
            if SampleBatch.PREV_ACTIONS in self.view_requirements:
                potentially_flattened_batch = get_dummy_batch_for_space(
                    space=self.view_requirements[SampleBatch.ACTIONS].space,
                    batch_size=0,
                    fill_value=0.0,
                )
                if not self.disable_action_flattening:
                    potentially_flattened_batch = flatten_to_single_ndarray(
                        potentially_flattened_batch
                    )
                single_row[SampleBatch.ACTIONS] = potentially_flattened_batch
            self._build_buffers(single_row)

        # Append data to existing buffers.
        flattened = tree.flatten(init_obs)
        for i, sub_obs in enumerate(flattened):
            self.buffers[SampleBatch.OBS][i].append(sub_obs)
        self.buffers[SampleBatch.INFOS][0].append(init_infos or {})
        self.buffers[SampleBatch.AGENT_INDEX][0].append(agent_index)
        self.buffers[SampleBatch.ENV_ID][0].append(env_id)
        self.buffers[SampleBatch.T][0].append(t)
        self.buffers[SampleBatch.EPS_ID][0].append(self.episode_id)
        self.buffers[SampleBatch.UNROLL_ID][0].append(self.unroll_id)

    def add_action_reward_next_obs(self, input_values: Dict[str, TensorType]) -> None:
        """Adds the given dictionary (row) of values to the Agent's trajectory.

        Args:
            input_values: Data dict (interpreted as a single row) to be added to
                buffer. Must contain keys:
                SampleBatch.ACTIONS, REWARDS, TERMINATEDS, TRUNCATEDS, and NEXT_OBS.
        """
        if self.unroll_id is None:
            self.unroll_id = AgentCollector._next_unroll_id
            AgentCollector._next_unroll_id += 1

        # Next obs -> obs.
        values = copy.copy(input_values)
        assert SampleBatch.OBS not in values
        values[SampleBatch.OBS] = values[SampleBatch.NEXT_OBS]
        del values[SampleBatch.NEXT_OBS]

        # convert obs to np.array (in case it is a list)
        if isinstance(values[SampleBatch.OBS], list):
            values[SampleBatch.OBS] = np.array(values[SampleBatch.OBS])

        # Default to next timestep if not provided in input values
        if SampleBatch.T not in input_values:
            values[SampleBatch.T] = self.buffers[SampleBatch.T][0][-1] + 1

        # Make sure EPS_ID/UNROLL_ID stay the same for this agent.
        if SampleBatch.EPS_ID in values:
            assert values[SampleBatch.EPS_ID] == self.episode_id
            del values[SampleBatch.EPS_ID]
        self.buffers[SampleBatch.EPS_ID][0].append(self.episode_id)
        if SampleBatch.UNROLL_ID in values:
            assert values[SampleBatch.UNROLL_ID] == self.unroll_id
            del values[SampleBatch.UNROLL_ID]
        self.buffers[SampleBatch.UNROLL_ID][0].append(self.unroll_id)

        for k, v in values.items():
            if k not in self.buffers:
                if self.training and k.startswith("state_out"):
                    vr = self.view_requirements[k]
                    data_col = vr.data_col or k
                    self._fill_buffer_with_initial_values(
                        data_col, vr, build_for_inference=False
                    )
                else:
                    self._build_buffers({k: v})
            # Do not flatten infos, state_out and (if configured) actions.
            # Infos/state-outs may be structs that change from timestep to
            # timestep.
            should_flatten_action_key = (
                k == SampleBatch.ACTIONS and not self.disable_action_flattening
            )
            # Note (Artur) RL Modules's states need no flattening
            should_flatten_state_key = (
                k.startswith("state_out") and not self._enable_new_api_stack
            )
            if (
                k == SampleBatch.INFOS
                or should_flatten_state_key
                or should_flatten_action_key
            ):
                if should_flatten_action_key:
                    v = flatten_to_single_ndarray(v)
                # Briefly remove dummy value to add to buffer
                if k in self.data_cols_with_dummy_values:
                    dummy = self.buffers[k][0].pop(-1)
                self.buffers[k][0].append(v)
                # Add back dummy value
                if k in self.data_cols_with_dummy_values:
                    self.buffers[k][0].append(dummy)
            # Flatten all other columns.
            else:
                flattened = tree.flatten(v)
                for i, sub_list in enumerate(self.buffers[k]):
                    # Briefly remove dummy value to add to buffer
                    if k in self.data_cols_with_dummy_values:
                        dummy = sub_list.pop(-1)
                    sub_list.append(flattened[i])
                    # Add back dummy value
                    if k in self.data_cols_with_dummy_values:
                        sub_list.append(dummy)

        # In inference mode, we don't need to keep all of trajectory in memory
        # we only need to keep the steps required. We can pop from the beginning to
        # create room for new data.
        if not self.training:
            for k in self.buffers:
                for sub_list in self.buffers[k]:
                    if sub_list:
                        sub_list.pop(0)

        self.agent_steps += 1

    def build_for_inference(self) -> SampleBatch:
        """During inference, we will build a SampleBatch with a batch size of 1 that
        can then be used to run the forward pass of a policy. This data will only
        include the environment context for running the policy at the last timestep.

        Returns:
            A SampleBatch with a batch size of 1.
        """

        batch_data = {}
        np_data = {}
        for view_col, view_req in self.view_requirements.items():
            # Create the batch of data from the different buffers.
            data_col = view_req.data_col or view_col

            # if this view is not for inference, skip it.
            if not view_req.used_for_compute_actions:
                continue

            if np.any(view_req.shift_arr > 0):
                raise ValueError(
                    f"During inference the agent can only use past observations to "
                    f"respect causality. However, view_col = {view_col} seems to "
                    f"depend on future indices {view_req.shift_arr}, while the "
                    f"used_for_compute_actions flag is set to True. Please fix the "
                    f"discrepancy. Hint: If you are using a custom model, make sure "
                    f"its view_requirements are initialized properly and point "
                    f"only to past timesteps during inference."
                )

            # Some columns don't exist yet
            # (get created during postprocessing or depend on state_out).
            if data_col not in self.buffers:
                self._fill_buffer_with_initial_values(
                    data_col, view_req, build_for_inference=True
                )
                self._prepare_for_data_cols_with_dummy_values(data_col)

            # Keep an np-array cache, so we don't have to regenerate the
            # np-array for different view_cols using to the same data_col.
            self._cache_in_np(np_data, data_col)

            data = []
            for d in np_data[data_col]:
                # if shift_arr = [0] the data will be just the last time step
                # (len(d) - 1), if shift_arr = [-1] the data will be just the
                # timestep before the last one (len(d) - 2) and so on.
                element_at_t = d[view_req.shift_arr + len(d) - 1]
                if element_at_t.shape[0] == 1:
                    # We'd normally squeeze here to remove the time dim, but we'll
                    # simply use the time dim as the batch dim.
                    data.append(element_at_t)
                    continue
                # add the batch dimension with [None]
                data.append(element_at_t[None])

            # We unflatten even if data is empty here, because the structure might be
            # nested with empty leafs and so we still need to reconstruct it.
            # This is useful because we spec-check states in RLModules and these
            # states can sometimes be nested dicts with empty leafs.
            batch_data[view_col] = self._unflatten_as_buffer_struct(data, data_col)

        batch = self._get_sample_batch(batch_data)
        return batch

    # TODO: @kouorsh we don't really need view_requirements anymore since it's
    # already an attribute of the class
    def build_for_training(
        self, view_requirements: ViewRequirementsDict
    ) -> SampleBatch:
        """Builds a SampleBatch from the thus-far collected agent data.

        If the episode/trajectory has no TERMINATED|TRUNCATED=True at the end, will
        copy the necessary n timesteps at the end of the trajectory back to the
        beginning of the buffers and wait for new samples coming in.
        SampleBatches created by this method will be ready for postprocessing
        by a Policy.

        Args:
            view_requirements: The viewrequirements dict needed to build the
                SampleBatch from the raw buffers (which may have data shifts as well
                as mappings from view-col to data-col in them).

        Returns:
            SampleBatch: The built SampleBatch for this agent, ready to go into
            postprocessing.
        """
        batch_data = {}
        np_data = {}
        for view_col, view_req in view_requirements.items():
            # Create the batch of data from the different buffers.
            data_col = view_req.data_col or view_col

            if data_col not in self.buffers:
                is_state = self._fill_buffer_with_initial_values(
                    data_col, view_req, build_for_inference=False
                )

                # We need to skip this view_col if it does not exist in the buffers
                # and is not an RNN state because it could be the special keys that
                # gets added by policy's postprocessing function for training.
                if not is_state:
                    continue

            # OBS and INFOS are already shifted by -1 (the initial obs/info starts
            # one ts before all other data columns).
            obs_shift = -1 if data_col in [SampleBatch.OBS, SampleBatch.INFOS] else 0

            # Keep an np-array cache so we don't have to regenerate the
            # np-array for different view_cols using to the same data_col.
            self._cache_in_np(np_data, data_col)

            # Go through each time-step in the buffer and construct the view
            # accordingly.
            data = []
            for d in np_data[data_col]:
                shifted_data = []

                # batch_repeat_value determines how many time steps should we skip
                # before we repeat indexing the data.
                # Example: batch_repeat_value=10, shift_arr = [-3, -2, -1],
                # shift_before = 3
                # buffer = [-3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
                # resulting_data = [[-3, -2, -1], [7, 8, 9]]
                # explanation: For t=0, we output [-3, -2, -1]. We then skip 10 time
                # steps ahead and get to t=10. For t=10, we output [7, 8, 9]. We skip
                # 10 more time steps and get to t=20. but since t=20 is out of bound
                # we stop.

                # count computes the number of time steps that we need to consider.
                # if batch_repeat_value = 1, this number should be the length of
                # episode so far, which is len(buffer) - shift_before (-1 if this
                # value was gained during inference. This is because we keep a dummy
                # value at the last position of the buffer that makes it one longer).
                count = int(
                    math.ceil(
                        (
                            len(d)
                            - int(data_col in self.data_cols_with_dummy_values)
                            - self.shift_before
                        )
                        / view_req.batch_repeat_value
                    )
                )
                for i in range(count):
                    # the indices for time step t
                    inds = (
                        self.shift_before
                        + obs_shift
                        + view_req.shift_arr
                        + (i * view_req.batch_repeat_value)
                    )

                    # handle the case where the inds are out of bounds from the end.
                    # if during the indexing any of the indices are out of bounds, we
                    # need to use padding on the end to fill in the missing indices.
                    # Create padding first time we encounter data
                    if max(inds) < len(d):
                        # Simple case where we can simply pick slices from buffer
                        element_at_t = d[inds]
                    else:
                        # Case in which we have to pad because buffer has
                        # insufficient length. This branch takes more time than
                        # simply picking slices we try to avoid it.
                        element_at_t = _get_buffered_slice_with_paddings(d, inds)
                        element_at_t = np.stack(element_at_t)

                    if element_at_t.shape[0] == 1:
                        # Remove the T dimension if it is 1.
                        element_at_t = element_at_t[0]
                    shifted_data.append(element_at_t)

                # in some multi-agent cases shifted_data may be an empty list.
                # In this case we should just create an empty array and return it.
                if shifted_data:
                    shifted_data_np = np.stack(shifted_data, 0)
                else:
                    shifted_data_np = np.array(shifted_data)
                data.append(shifted_data_np)

            # We unflatten even if data is empty here, because the structure might be
            # nested with empty leafs and so we still need to reconstruct it.
            # This is useful because we spec-check states in RLModules and these
            # states can sometimes be nested dicts with empty leafs.
            batch_data[view_col] = self._unflatten_as_buffer_struct(data, data_col)

        batch = self._get_sample_batch(batch_data)

        # This trajectory is continuing -> Copy data at the end (in the size of
        # self.shift_before) to the beginning of buffers and erase everything
        # else.
        if (
            SampleBatch.TERMINATEDS in self.buffers
            and not self.buffers[SampleBatch.TERMINATEDS][0][-1]
            and SampleBatch.TRUNCATEDS in self.buffers
            and not self.buffers[SampleBatch.TRUNCATEDS][0][-1]
        ):
            # Copy data to beginning of buffer and cut lists.
            if self.shift_before > 0:
                for k, data in self.buffers.items():
                    # Loop through
                    for i in range(len(data)):
                        self.buffers[k][i] = data[i][-self.shift_before :]
            self.agent_steps = 0

        # Reset our unroll_id.
        self.unroll_id = None

        return batch

    def _build_buffers(self, single_row: Dict[str, TensorType]) -> None:
        """Builds the buffers for sample collection, given an example data row.

        Args:
            single_row (Dict[str, TensorType]): A single row (keys=column
                names) of data to base the buffers on.
        """
        for col, data in single_row.items():
            if col in self.buffers:
                continue

            # Columns that get their first value at t=-1 (init obs et al.) need
            # one less pre-buffer slot than the other columns.
            shift = self.shift_before - (
                1
                if col
                in [
                    SampleBatch.OBS,
                    SampleBatch.INFOS,
                    SampleBatch.EPS_ID,
                    SampleBatch.AGENT_INDEX,
                    SampleBatch.ENV_ID,
                    SampleBatch.T,
                    SampleBatch.UNROLL_ID,
                ]
                else 0
            )

            # Store all data as flattened lists, except INFOS and state-out
            # lists. These are monolithic items (infos is a dict that
            # should not be further split, same for state-out items, which
            # could be custom dicts as well).
            should_flatten_action_key = (
                col == SampleBatch.ACTIONS and not self.disable_action_flattening
            )
            # Note (Artur) RL Modules's states need no flattening
            should_flatten_state_key = (
                col.startswith("state_out") and not self._enable_new_api_stack
            )
            if (
                col == SampleBatch.INFOS
                or should_flatten_state_key
                or should_flatten_action_key
            ):
                if should_flatten_action_key:
                    data = flatten_to_single_ndarray(data)
                self.buffers[col] = [[data for _ in range(shift)]]
            else:
                self.buffers[col] = [
                    [v for _ in range(shift)] for v in tree.flatten(data)
                ]
                # Store an example data struct so we know, how to unflatten
                # each data col.
                self.buffer_structs[col] = data

    def _get_sample_batch(self, batch_data: Dict[str, TensorType]) -> SampleBatch:
        """Returns a SampleBatch from the given data dictionary. Also updates the
        sequence information based on the max_seq_len."""

        # Due to possible batch-repeats > 1, columns in the resulting batch
        # may not all have the same batch size.
        batch = SampleBatch(batch_data, is_training=self.training)

        # Adjust the seq-lens array depending on the incoming agent sequences.
        if self.is_policy_recurrent:
            seq_lens = []
            max_seq_len = self.max_seq_len
            count = batch.count
            while count > 0:
                seq_lens.append(min(count, max_seq_len))
                count -= max_seq_len
            batch["seq_lens"] = np.array(seq_lens)
            batch.max_seq_len = max_seq_len

        return batch

    def _cache_in_np(self, cache_dict: Dict[str, List[np.ndarray]], key: str) -> None:
        """Caches the numpy version of the key in the buffer dict."""
        if key not in cache_dict:
            cache_dict[key] = [_to_float_np_array(d) for d in self.buffers[key]]

    def _unflatten_as_buffer_struct(
        self, data: List[np.ndarray], key: str
    ) -> np.ndarray:
        """Unflattens the given data to match the buffer struct format for that
        key."""
        if key not in self.buffer_structs:
            return data[0]

        return tree.unflatten_as(self.buffer_structs[key], data)

    def _fill_buffer_with_initial_values(
        self,
        data_col: str,
        view_requirement: ViewRequirement,
        build_for_inference: bool = False,
    ) -> bool:
        """Fills the buffer with the initial values for the given data column.

        For data_col starting with `state_out`, use the initial states of the
        policy, but for other data columns, create a dummy value based on the view
        requirement space.

        Args:
            data_col: The data column to fill the buffer with.
            view_requirement: The view requirement for the view_col. Normally the
                view requirement for the data column is used and if it does not
                exist for some reason the view requirement for view column is used
                instead.
            build_for_inference: Whether this is getting called for inference or
                not.

        Returns:
            is_state: True if the data_col is an RNN state, False otherwise.
        """
        try:
            space = self.view_requirements[data_col].space
        except KeyError:
            space = view_requirement.space

        # special treatment for state_out
        # add them to the buffer in case they don't exist yet
        is_state = True
        if data_col.startswith("state_out"):
            if self._enable_new_api_stack:
                self._build_buffers({data_col: self.initial_states})
            else:
                if not self.is_policy_recurrent:
                    raise ValueError(
                        f"{data_col} is not available, because the given policy "
                        "is not recurrent according to the input "
                        "model_initial_states. Have you forgotten to return "
                        "non-empty lists in policy.get_initial_states()?"
                    )
                state_ind = int(data_col.split("_")[-1])
                self._build_buffers({data_col: self.initial_states[state_ind]})
        else:
            is_state = False
            # only create dummy data during inference
            if build_for_inference:
                if isinstance(space, Space):
                    # state_out assumes the values do not have a batch dimension
                    # (i.e. instead of being (1, d) it is of shape (d,).
                    fill_value = get_dummy_batch_for_space(
                        space,
                        batch_size=0,
                    )
                else:
                    fill_value = space

                self._build_buffers({data_col: fill_value})

        return is_state

    def _prepare_for_data_cols_with_dummy_values(self, data_col):
        """Marks `data_col` as dummy-padded and appends the dummy to its buffers.

        For items gained during inference, we append a dummy value here so
        that view requirements viewing these is not shifted by 1.
        """
        self.data_cols_with_dummy_values.add(data_col)
        for b in self.buffers[data_col]:
            b.append(b[-1])
.venv/lib/python3.11/site-packages/ray/rllib/evaluation/collectors/sample_collector.py ADDED
@@ -0,0 +1,298 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from abc import ABCMeta, abstractmethod
3
+ from typing import TYPE_CHECKING, Dict, List, Optional, Union
4
+
5
+ from ray.rllib.policy.policy_map import PolicyMap
6
+ from ray.rllib.policy.sample_batch import MultiAgentBatch, SampleBatch
7
+ from ray.rllib.utils.annotations import OldAPIStack
8
+ from ray.rllib.utils.typing import AgentID, EnvID, EpisodeID, PolicyID, TensorType
9
+
10
+ if TYPE_CHECKING:
11
+ from ray.rllib.callbacks.callbacks import RLlibCallback
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
+ # fmt: off
17
+ # __sphinx_doc_begin__
18
+ @OldAPIStack
19
+ class SampleCollector(metaclass=ABCMeta):
20
+ """Collects samples for all policies and agents from a multi-agent env.
21
+
22
+ This API is controlled by RolloutWorker objects to store all data
23
+ generated by Environments and Policies/Models during rollout and
24
+ postprocessing. It's purposes are to a) make data collection and
25
+ SampleBatch/input_dict generation from this data faster, b) to unify
26
+ the way we collect samples from environments and model (outputs), thereby
27
+ allowing for possible user customizations, c) to allow for more complex
28
+ inputs fed into different policies (e.g. multi-agent case with inter-agent
29
+ communication channel).
30
+ """
31
+
32
+ def __init__(self,
33
+ policy_map: PolicyMap,
34
+ clip_rewards: Union[bool, float],
35
+ callbacks: "RLlibCallback",
36
+ multiple_episodes_in_batch: bool = True,
37
+ rollout_fragment_length: int = 200,
38
+ count_steps_by: str = "env_steps"):
39
+ """Initializes a SampleCollector instance.
40
+
41
+ Args:
42
+ policy_map: Maps policy ids to policy instances.
43
+ clip_rewards (Union[bool, float]): Whether to clip rewards before
44
+ postprocessing (at +/-1.0) or the actual value to +/- clip.
45
+ callbacks: RLlib callbacks.
46
+ multiple_episodes_in_batch: Whether it's allowed to pack
47
+ multiple episodes into the same built batch.
48
+ rollout_fragment_length: The
49
+
50
+ """
51
+
52
+ self.policy_map = policy_map
53
+ self.clip_rewards = clip_rewards
54
+ self.callbacks = callbacks
55
+ self.multiple_episodes_in_batch = multiple_episodes_in_batch
56
+ self.rollout_fragment_length = rollout_fragment_length
57
+ self.count_steps_by = count_steps_by
58
+
59
+ @abstractmethod
60
+ def add_init_obs(
61
+ self,
62
+ *,
63
+ episode,
64
+ agent_id: AgentID,
65
+ policy_id: PolicyID,
66
+ init_obs: TensorType,
67
+ init_infos: Optional[Dict[str, TensorType]] = None,
68
+ t: int = -1,
69
+ ) -> None:
70
+ """Adds an initial obs (after reset) to this collector.
71
+
72
+ Since the very first observation in an environment is collected w/o
73
+ additional data (w/o actions, w/o reward) after env.reset() is called,
74
+ this method initializes a new trajectory for a given agent.
75
+ `add_init_obs()` has to be called first for each agent/episode-ID
76
+ combination. After this, only `add_action_reward_next_obs()` must be
77
+ called for that same agent/episode-pair.
78
+
79
+ Args:
80
+ episode: The Episode, for which we
81
+ are adding an Agent's initial observation.
82
+ agent_id: Unique id for the agent we are adding
83
+ values for.
84
+ env_id: The environment index (in a vectorized setup).
85
+ policy_id: Unique id for policy controlling the agent.
86
+ init_obs: Initial observation (after env.reset()).
87
+ init_obs: Initial observation (after env.reset()).
88
+ init_infos: Initial infos dict (after env.reset()).
89
+ t: The time step (episode length - 1). The initial obs has
90
+ ts=-1(!), then an action/reward/next-obs at t=0, etc..
91
+
92
+ .. testcode::
93
+ :skipif: True
94
+
95
+ obs, infos = env.reset()
96
+ collector.add_init_obs(
97
+ episode=my_episode,
98
+ agent_id=0,
99
+ policy_id="pol0",
100
+ t=-1,
101
+ init_obs=obs,
102
+ init_infos=infos,
103
+ )
104
+ obs, r, terminated, truncated, info = env.step(action)
105
+ collector.add_action_reward_next_obs(12345, 0, "pol0", False, {
106
+ "action": action, "obs": obs, "reward": r, "terminated": terminated,
107
+ "truncated": truncated, "info": info
108
+ })
109
+ """
110
+ raise NotImplementedError
111
+
112
+ @abstractmethod
113
+ def add_action_reward_next_obs(
114
+ self,
115
+ episode_id: EpisodeID,
116
+ agent_id: AgentID,
117
+ env_id: EnvID,
118
+ policy_id: PolicyID,
119
+ agent_done: bool,
120
+ values: Dict[str, TensorType],
121
+ ) -> None:
122
+ """Add the given dictionary (row) of values to this collector.
123
+
124
+ The incoming data (`values`) must include action, reward, terminated, truncated,
125
+ and next_obs information and may include any other information.
126
+ For the initial observation (after Env.reset()) of the given agent/
127
+ episode-ID combination, `add_initial_obs()` must be called instead.
128
+
129
+ Args:
130
+ episode_id: Unique id for the episode we are adding
131
+ values for.
132
+ agent_id: Unique id for the agent we are adding
133
+ values for.
134
+ env_id: The environment index (in a vectorized setup).
135
+ policy_id: Unique id for policy controlling the agent.
136
+ agent_done: Whether the given agent is done (terminated or truncated) with
137
+ its trajectory (the multi-agent episode may still be ongoing).
138
+ values (Dict[str, TensorType]): Row of values to add for this
139
+ agent. This row must contain the keys SampleBatch.ACTION,
140
+ REWARD, NEW_OBS, TERMINATED, and TRUNCATED.
141
+
142
+ .. testcode::
143
+ :skipif: True
144
+
145
+ obs, info = env.reset()
146
+ collector.add_init_obs(12345, 0, "pol0", obs)
147
+ obs, r, terminated, truncated, info = env.step(action)
148
+ collector.add_action_reward_next_obs(
149
+ 12345,
150
+ 0,
151
+ "pol0",
152
+ agent_done=False,
153
+ values={
154
+ "action": action, "obs": obs, "reward": r,
155
+ "terminated": terminated, "truncated": truncated
156
+ },
157
+ )
158
+ """
159
+ raise NotImplementedError
160
+
161
+ @abstractmethod
162
+ def episode_step(self, episode) -> None:
163
+ """Increases the episode step counter (across all agents) by one.
164
+
165
+ Args:
166
+ episode: Episode we are stepping through.
167
+ Useful for handling counting b/c it is called once across
168
+ all agents that are inside this episode.
169
+ """
170
+ raise NotImplementedError
171
+
172
+ @abstractmethod
173
+ def total_env_steps(self) -> int:
174
+ """Returns total number of env-steps taken so far.
175
+
176
+ Thereby, a step in an N-agent multi-agent environment counts as only 1
177
+ for this metric. The returned count contains everything that has not
178
+ been built yet (and returned as MultiAgentBatches by the
179
+ `try_build_truncated_episode_multi_agent_batch` or
180
+ `postprocess_episode(build=True)` methods). After such build, this
181
+ counter is reset to 0.
182
+
183
+ Returns:
184
+ int: The number of env-steps taken in total in the environment(s)
185
+ so far.
186
+ """
187
+ raise NotImplementedError
188
+
189
+ @abstractmethod
190
+ def total_agent_steps(self) -> int:
191
+ """Returns total number of (individual) agent-steps taken so far.
192
+
193
+ Thereby, a step in an N-agent multi-agent environment counts as N.
194
+ If less than N agents have stepped (because some agents were not
195
+ required to send actions), the count will be increased by less than N.
196
+ The returned count contains everything that has not been built yet
197
+ (and returned as MultiAgentBatches by the
198
+ `try_build_truncated_episode_multi_agent_batch` or
199
+ `postprocess_episode(build=True)` methods). After such build, this
200
+ counter is reset to 0.
201
+
202
+ Returns:
203
+ int: The number of (individual) agent-steps taken in total in the
204
+ environment(s) so far.
205
+ """
206
+ raise NotImplementedError
207
+
208
+ # TODO(jungong) : Remove this API call once we completely move to
209
+ # connector based sample collection.
210
+ @abstractmethod
211
+ def get_inference_input_dict(self, policy_id: PolicyID) -> \
212
+ Dict[str, TensorType]:
213
+ """Returns an input_dict for an (inference) forward pass from our data.
214
+
215
+ The input_dict can then be used for action computations inside a
216
+ Policy via `Policy.compute_actions_from_input_dict()`.
217
+
218
+ Args:
219
+ policy_id: The Policy ID to get the input dict for.
220
+
221
+ Returns:
222
+ Dict[str, TensorType]: The input_dict to be passed into the ModelV2
223
+ for inference/training.
224
+
225
+ .. testcode::
226
+ :skipif: True
227
+
228
+ obs, r, terminated, truncated, info = env.step(action)
229
+ collector.add_action_reward_next_obs(12345, 0, "pol0", False, {
230
+ "action": action, "obs": obs, "reward": r,
231
+ "terminated": terminated, "truncated", truncated
232
+ })
233
+ input_dict = collector.get_inference_input_dict(policy.model)
234
+ action = policy.compute_actions_from_input_dict(input_dict)
235
+ # repeat
236
+ """
237
+ raise NotImplementedError
238
+
239
+ @abstractmethod
240
+ def postprocess_episode(
241
+ self,
242
+ episode,
243
+ is_done: bool = False,
244
+ check_dones: bool = False,
245
+ build: bool = False,
246
+ ) -> Optional[MultiAgentBatch]:
247
+ """Postprocesses all agents' trajectories in a given episode.
248
+
249
+ Generates (single-trajectory) SampleBatches for all Policies/Agents and
250
+ calls Policy.postprocess_trajectory on each of these. Postprocessing
251
+ may happens in-place, meaning any changes to the viewed data columns
252
+ are directly reflected inside this collector's buffers.
253
+ Also makes sure that additional (newly created) data columns are
254
+ correctly added to the buffers.
255
+
256
+ Args:
257
+ episode: The Episode object for which
258
+ to post-process data.
259
+ is_done: Whether the given episode is actually terminated
260
+ (all agents are terminated OR truncated). If True, the
261
+ episode will no longer be used/continued and we may need to
262
+ recycle/erase it internally. If a soft-horizon is hit, the
263
+ episode will continue to be used and `is_done` should be set
264
+ to False here.
265
+ check_dones: Whether we need to check that all agents'
266
+ trajectories have dones=True at the end.
267
+ build: Whether to build a MultiAgentBatch from the given
268
+ episode (and only that episode!) and return that
269
+ MultiAgentBatch. Used for batch_mode=`complete_episodes`.
270
+
271
+ Returns:
272
+ Optional[MultiAgentBatch]: If `build` is True, the
273
+ SampleBatch or MultiAgentBatch built from `episode` (either
274
+ just from that episde or from the `_PolicyCollectorGroup`
275
+ in the `episode.batch_builder` property).
276
+ """
277
+ raise NotImplementedError
278
+
279
+ @abstractmethod
280
+ def try_build_truncated_episode_multi_agent_batch(self) -> \
281
+ List[Union[MultiAgentBatch, SampleBatch]]:
282
+ """Tries to build an MA-batch, if `rollout_fragment_length` is reached.
283
+
284
+ Any unprocessed data will be first postprocessed with a policy
285
+ postprocessor.
286
+ This is usually called to collect samples for policy training.
287
+ If not enough data has been collected yet (`rollout_fragment_length`),
288
+ returns an empty list.
289
+
290
+ Returns:
291
+ List[Union[MultiAgentBatch, SampleBatch]]: Returns a (possibly
292
+ empty) list of MultiAgentBatches (containing the accumulated
293
+ SampleBatches for each policy or a simple SampleBatch if only
294
+ one policy). The list will be empty if
295
+ `self.rollout_fragment_length` has not been reached yet.
296
+ """
297
+ raise NotImplementedError
298
+ # __sphinx_doc_end__
.venv/lib/python3.11/site-packages/ray/rllib/models/torch/__pycache__/complex_input_net.cpython-311.pyc ADDED
Binary file (9.81 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/models/torch/__pycache__/torch_modelv2.cpython-311.pyc ADDED
Binary file (4.45 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/models/torch/modules/__init__.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from ray.rllib.models.torch.modules.gru_gate import GRUGate
2
+ from ray.rllib.models.torch.modules.multi_head_attention import MultiHeadAttention
3
+ from ray.rllib.models.torch.modules.relative_multi_head_attention import (
4
+ RelativeMultiHeadAttention,
5
+ )
6
+ from ray.rllib.models.torch.modules.skip_connection import SkipConnection
7
+
8
+ __all__ = [
9
+ "GRUGate",
10
+ "RelativeMultiHeadAttention",
11
+ "SkipConnection",
12
+ "MultiHeadAttention",
13
+ ]
.venv/lib/python3.11/site-packages/ray/rllib/models/torch/modules/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (695 Bytes). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/models/torch/modules/__pycache__/gru_gate.cpython-311.pyc ADDED
Binary file (4.93 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/models/torch/modules/__pycache__/multi_head_attention.cpython-311.pyc ADDED
Binary file (4.13 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/models/torch/modules/__pycache__/noisy_layer.cpython-311.pyc ADDED
Binary file (6.18 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/models/torch/modules/__pycache__/relative_multi_head_attention.cpython-311.pyc ADDED
Binary file (9.77 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/models/torch/modules/__pycache__/skip_connection.cpython-311.pyc ADDED
Binary file (2.23 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/models/torch/modules/skip_connection.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from ray.rllib.utils.annotations import OldAPIStack
2
+ from ray.rllib.utils.framework import try_import_torch
3
+ from ray.rllib.utils.typing import TensorType
4
+ from typing import Optional
5
+
6
+ torch, nn = try_import_torch()
7
+
8
+
9
+ @OldAPIStack
10
+ class SkipConnection(nn.Module):
11
+ """Skip connection layer.
12
+
13
+ Adds the original input to the output (regular residual layer) OR uses
14
+ input as hidden state input to a given fan_in_layer.
15
+ """
16
+
17
+ def __init__(
18
+ self, layer: nn.Module, fan_in_layer: Optional[nn.Module] = None, **kwargs
19
+ ):
20
+ """Initializes a SkipConnection nn Module object.
21
+
22
+ Args:
23
+ layer (nn.Module): Any layer processing inputs.
24
+ fan_in_layer (Optional[nn.Module]): An optional
25
+ layer taking two inputs: The original input and the output
26
+ of `layer`.
27
+ """
28
+ super().__init__(**kwargs)
29
+ self._layer = layer
30
+ self._fan_in_layer = fan_in_layer
31
+
32
+ def forward(self, inputs: TensorType, **kwargs) -> TensorType:
33
+ # del kwargs
34
+ outputs = self._layer(inputs, **kwargs)
35
+ # Residual case, just add inputs to outputs.
36
+ if self._fan_in_layer is None:
37
+ outputs = outputs + inputs
38
+ # Fan-in e.g. RNN: Call fan-in with `inputs` and `outputs`.
39
+ else:
40
+ # NOTE: In the GRU case, `inputs` is the state input.
41
+ outputs = self._fan_in_layer((inputs, outputs))
42
+
43
+ return outputs
.venv/lib/python3.11/site-packages/ray/rllib/offline/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (1.37 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/offline/__pycache__/d4rl_reader.cpython-311.pyc ADDED
Binary file (3.08 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/offline/__pycache__/input_reader.cpython-311.pyc ADDED
Binary file (8.95 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/offline/__pycache__/resource.cpython-311.pyc ADDED
Binary file (1.49 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/offline/__pycache__/shuffled_input.cpython-311.pyc ADDED
Binary file (2.89 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/__init__.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from ray.rllib.offline.estimators.importance_sampling import ImportanceSampling
2
+ from ray.rllib.offline.estimators.weighted_importance_sampling import (
3
+ WeightedImportanceSampling,
4
+ )
5
+ from ray.rllib.offline.estimators.direct_method import DirectMethod
6
+ from ray.rllib.offline.estimators.doubly_robust import DoublyRobust
7
+ from ray.rllib.offline.estimators.off_policy_estimator import OffPolicyEstimator
8
+
9
+ __all__ = [
10
+ "OffPolicyEstimator",
11
+ "ImportanceSampling",
12
+ "WeightedImportanceSampling",
13
+ "DirectMethod",
14
+ "DoublyRobust",
15
+ ]
.venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/__pycache__/doubly_robust.cpython-311.pyc ADDED
Binary file (11.9 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/__pycache__/feature_importance.cpython-311.pyc ADDED
Binary file (618 Bytes). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/__pycache__/importance_sampling.cpython-311.pyc ADDED
Binary file (6 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/__pycache__/weighted_importance_sampling.cpython-311.pyc ADDED
Binary file (9.14 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/direct_method.py ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from typing import Dict, Any, Optional, List
3
+ import math
4
+ import numpy as np
5
+
6
+ from ray.data import Dataset
7
+
8
+ from ray.rllib.offline.estimators.off_policy_estimator import OffPolicyEstimator
9
+ from ray.rllib.offline.offline_evaluation_utils import compute_q_and_v_values
10
+ from ray.rllib.offline.offline_evaluator import OfflineEvaluator
11
+ from ray.rllib.offline.estimators.fqe_torch_model import FQETorchModel
12
+ from ray.rllib.policy import Policy
13
+ from ray.rllib.policy.sample_batch import convert_ma_batch_to_sample_batch
14
+ from ray.rllib.policy.sample_batch import SampleBatch
15
+ from ray.rllib.utils.annotations import DeveloperAPI, override
16
+ from ray.rllib.utils.typing import SampleBatchType
17
+ from ray.rllib.utils.numpy import convert_to_numpy
18
+
19
+ logger = logging.getLogger()
20
+
21
+
22
+ @DeveloperAPI
23
+ class DirectMethod(OffPolicyEstimator):
24
+ r"""The Direct Method estimator.
25
+
26
+ Let s_t, a_t, and r_t be the state, action, and reward at timestep t.
27
+
28
+ This method trains a Q-model for the evaluation policy \pi_e on behavior
29
+ data generated by \pi_b. Currently, RLlib implements this using
30
+ Fitted-Q Evaluation (FQE). You can also implement your own model
31
+ and pass it in as `q_model_config = {"type": your_model_class, **your_kwargs}`.
32
+
33
+ This estimator computes the expected return for \pi_e for an episode as:
34
+ V^{\pi_e}(s_0) = \sum_{a \in A} \pi_e(a | s_0) Q(s_0, a)
35
+ and returns the mean and standard deviation over episodes.
36
+
37
+ For more information refer to https://arxiv.org/pdf/1911.06854.pdf"""
38
+
39
+ @override(OffPolicyEstimator)
40
+ def __init__(
41
+ self,
42
+ policy: Policy,
43
+ gamma: float,
44
+ epsilon_greedy: float = 0.0,
45
+ q_model_config: Optional[Dict] = None,
46
+ ):
47
+ """Initializes a Direct Method OPE Estimator.
48
+
49
+ Args:
50
+ policy: Policy to evaluate.
51
+ gamma: Discount factor of the environment.
52
+ epsilon_greedy: The probability by which we act acording to a fully random
53
+ policy during deployment. With 1-epsilon_greedy we act according the
54
+ target policy.
55
+ q_model_config: Arguments to specify the Q-model. Must specify
56
+ a `type` key pointing to the Q-model class.
57
+ This Q-model is trained in the train() method and is used
58
+ to compute the state-value estimates for the DirectMethod estimator.
59
+ It must implement `train` and `estimate_v`.
60
+ TODO (Rohan138): Unify this with RLModule API.
61
+ """
62
+
63
+ super().__init__(policy, gamma, epsilon_greedy)
64
+
65
+ # Some dummy policies and ones that are not based on a tensor framework
66
+ # backend can come without a config or without a framework key.
67
+ if hasattr(policy, "config"):
68
+ assert (
69
+ policy.config.get("framework", "torch") == "torch"
70
+ ), "Framework must be torch to use DirectMethod."
71
+
72
+ q_model_config = q_model_config or {}
73
+ model_cls = q_model_config.pop("type", FQETorchModel)
74
+ self.model = model_cls(
75
+ policy=policy,
76
+ gamma=gamma,
77
+ **q_model_config,
78
+ )
79
+ assert hasattr(
80
+ self.model, "estimate_v"
81
+ ), "self.model must implement `estimate_v`!"
82
+
83
+ @override(OffPolicyEstimator)
84
+ def estimate_on_single_episode(self, episode: SampleBatch) -> Dict[str, Any]:
85
+ estimates_per_epsiode = {}
86
+ rewards = episode["rewards"]
87
+
88
+ v_behavior = 0.0
89
+ for t in range(episode.count):
90
+ v_behavior += rewards[t] * self.gamma**t
91
+
92
+ v_target = self._compute_v_target(episode[:1])
93
+
94
+ estimates_per_epsiode["v_behavior"] = v_behavior
95
+ estimates_per_epsiode["v_target"] = v_target
96
+
97
+ return estimates_per_epsiode
98
+
99
+ @override(OffPolicyEstimator)
100
+ def estimate_on_single_step_samples(
101
+ self, batch: SampleBatch
102
+ ) -> Dict[str, List[float]]:
103
+ estimates_per_epsiode = {}
104
+ rewards = batch["rewards"]
105
+
106
+ v_behavior = rewards
107
+ v_target = self._compute_v_target(batch)
108
+
109
+ estimates_per_epsiode["v_behavior"] = v_behavior
110
+ estimates_per_epsiode["v_target"] = v_target
111
+
112
+ return estimates_per_epsiode
113
+
114
+ def _compute_v_target(self, init_step):
115
+ v_target = self.model.estimate_v(init_step)
116
+ v_target = convert_to_numpy(v_target)
117
+ return v_target
118
+
119
+ @override(OffPolicyEstimator)
120
+ def train(self, batch: SampleBatchType) -> Dict[str, Any]:
121
+ """Trains self.model on the given batch.
122
+
123
+ Args:
124
+ batch: A SampleBatchType to train on
125
+
126
+ Returns:
127
+ A dict with key "loss" and value as the mean training loss.
128
+ """
129
+ batch = convert_ma_batch_to_sample_batch(batch)
130
+ losses = self.model.train(batch)
131
+ return {"loss": np.mean(losses)}
132
+
133
+ @override(OfflineEvaluator)
134
+ def estimate_on_dataset(
135
+ self, dataset: Dataset, *, n_parallelism: int = ...
136
+ ) -> Dict[str, Any]:
137
+ """Calculates the Direct Method estimate on the given dataset.
138
+
139
+ Note: This estimate works for only discrete action spaces for now.
140
+
141
+ Args:
142
+ dataset: Dataset to compute the estimate on. Each record in dataset should
143
+ include the following columns: `obs`, `actions`, `action_prob` and
144
+ `rewards`. The `obs` on each row shoud be a vector of D dimensions.
145
+ n_parallelism: The number of parallel workers to use.
146
+
147
+ Returns:
148
+ Dictionary with the following keys:
149
+ v_target: The estimated value of the target policy.
150
+ v_behavior: The estimated value of the behavior policy.
151
+ v_gain: The estimated gain of the target policy over the behavior
152
+ policy.
153
+ v_std: The standard deviation of the estimated value of the target.
154
+ """
155
+ # compute v_values
156
+ batch_size = max(dataset.count() // n_parallelism, 1)
157
+ updated_ds = dataset.map_batches(
158
+ compute_q_and_v_values,
159
+ batch_size=batch_size,
160
+ batch_format="pandas",
161
+ fn_kwargs={
162
+ "model_class": self.model.__class__,
163
+ "model_state": self.model.get_state(),
164
+ "compute_q_values": False,
165
+ },
166
+ )
167
+
168
+ v_behavior = updated_ds.mean("rewards")
169
+ v_target = updated_ds.mean("v_values")
170
+ v_gain_mean = v_target / v_behavior
171
+ v_gain_ste = (
172
+ updated_ds.std("v_values") / v_behavior / math.sqrt(dataset.count())
173
+ )
174
+
175
+ return {
176
+ "v_behavior": v_behavior,
177
+ "v_target": v_target,
178
+ "v_gain_mean": v_gain_mean,
179
+ "v_gain_ste": v_gain_ste,
180
+ }
.venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/doubly_robust.py ADDED
@@ -0,0 +1,253 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import numpy as np
3
+ import math
4
+ import pandas as pd
5
+
6
+ from typing import Dict, Any, Optional, List
7
+
8
+ from ray.data import Dataset
9
+
10
+ from ray.rllib.policy import Policy
11
+ from ray.rllib.policy.sample_batch import SampleBatch, convert_ma_batch_to_sample_batch
12
+ from ray.rllib.utils.annotations import DeveloperAPI, override
13
+ from ray.rllib.utils.typing import SampleBatchType
14
+ from ray.rllib.utils.numpy import convert_to_numpy
15
+
16
+ from ray.rllib.offline.estimators.off_policy_estimator import OffPolicyEstimator
17
+ from ray.rllib.offline.estimators.fqe_torch_model import FQETorchModel
18
+ from ray.rllib.offline.offline_evaluator import OfflineEvaluator
19
+ from ray.rllib.offline.offline_evaluation_utils import (
20
+ compute_is_weights,
21
+ compute_q_and_v_values,
22
+ )
23
+
24
+ logger = logging.getLogger()
25
+
26
+
27
+ @DeveloperAPI
28
+ class DoublyRobust(OffPolicyEstimator):
29
+ """The Doubly Robust estimator.
30
+
31
+ Let s_t, a_t, and r_t be the state, action, and reward at timestep t.
32
+
33
+ This method trains a Q-model for the evaluation policy \pi_e on behavior
34
+ data generated by \pi_b. Currently, RLlib implements this using
35
+ Fitted-Q Evaluation (FQE). You can also implement your own model
36
+ and pass it in as `q_model_config = {"type": your_model_class, **your_kwargs}`.
37
+
38
+ For behavior policy \pi_b and evaluation policy \pi_e, define the
39
+ cumulative importance ratio at timestep t as:
40
+ p_t = \sum_{t'=0}^t (\pi_e(a_{t'} | s_{t'}) / \pi_b(a_{t'} | s_{t'})).
41
+
42
+ Consider an episode with length T. Let V_T = 0.
43
+ For all t in {0, T - 1}, use the following recursive update:
44
+ V_t^DR = (\sum_{a \in A} \pi_e(a | s_t) Q(s_t, a))
45
+ + p_t * (r_t + \gamma * V_{t+1}^DR - Q(s_t, a_t))
46
+
47
+ This estimator computes the expected return for \pi_e for an episode as:
48
+ V^{\pi_e}(s_0) = V_0^DR
49
+ and returns the mean and standard deviation over episodes.
50
+
51
+ For more information refer to https://arxiv.org/pdf/1911.06854.pdf"""
52
+
53
+ @override(OffPolicyEstimator)
54
+ def __init__(
55
+ self,
56
+ policy: Policy,
57
+ gamma: float,
58
+ epsilon_greedy: float = 0.0,
59
+ normalize_weights: bool = True,
60
+ q_model_config: Optional[Dict] = None,
61
+ ):
62
+ """Initializes a Doubly Robust OPE Estimator.
63
+
64
+ Args:
65
+ policy: Policy to evaluate.
66
+ gamma: Discount factor of the environment.
67
+ epsilon_greedy: The probability by which we act acording to a fully random
68
+ policy during deployment. With 1-epsilon_greedy we act
69
+ according the target policy.
70
+ normalize_weights: If True, the inverse propensity scores are normalized to
71
+ their sum across the entire dataset. The effect of this is similar to
72
+ weighted importance sampling compared to standard importance sampling.
73
+ q_model_config: Arguments to specify the Q-model. Must specify
74
+ a `type` key pointing to the Q-model class.
75
+ This Q-model is trained in the train() method and is used
76
+ to compute the state-value and Q-value estimates
77
+ for the DoublyRobust estimator.
78
+ It must implement `train`, `estimate_q`, and `estimate_v`.
79
+ TODO (Rohan138): Unify this with RLModule API.
80
+ """
81
+
82
+ super().__init__(policy, gamma, epsilon_greedy)
83
+ q_model_config = q_model_config or {}
84
+ q_model_config["gamma"] = gamma
85
+
86
+ self._model_cls = q_model_config.pop("type", FQETorchModel)
87
+ self._model_configs = q_model_config
88
+ self._normalize_weights = normalize_weights
89
+
90
+ self.model = self._model_cls(
91
+ policy=policy,
92
+ **q_model_config,
93
+ )
94
+ assert hasattr(
95
+ self.model, "estimate_v"
96
+ ), "self.model must implement `estimate_v`!"
97
+ assert hasattr(
98
+ self.model, "estimate_q"
99
+ ), "self.model must implement `estimate_q`!"
100
+
101
+ @override(OffPolicyEstimator)
102
+ def estimate_on_single_episode(self, episode: SampleBatch) -> Dict[str, Any]:
103
+ estimates_per_epsiode = {}
104
+
105
+ rewards, old_prob = episode["rewards"], episode["action_prob"]
106
+ new_prob = self.compute_action_probs(episode)
107
+
108
+ weight = new_prob / old_prob
109
+
110
+ v_behavior = 0.0
111
+ v_target = 0.0
112
+ q_values = self.model.estimate_q(episode)
113
+ q_values = convert_to_numpy(q_values)
114
+ v_values = self.model.estimate_v(episode)
115
+ v_values = convert_to_numpy(v_values)
116
+ assert q_values.shape == v_values.shape == (episode.count,)
117
+
118
+ for t in reversed(range(episode.count)):
119
+ v_behavior = rewards[t] + self.gamma * v_behavior
120
+ v_target = v_values[t] + weight[t] * (
121
+ rewards[t] + self.gamma * v_target - q_values[t]
122
+ )
123
+ v_target = v_target.item()
124
+
125
+ estimates_per_epsiode["v_behavior"] = v_behavior
126
+ estimates_per_epsiode["v_target"] = v_target
127
+
128
+ return estimates_per_epsiode
129
+
130
+ @override(OffPolicyEstimator)
131
+ def estimate_on_single_step_samples(
132
+ self, batch: SampleBatch
133
+ ) -> Dict[str, List[float]]:
134
+ estimates_per_epsiode = {}
135
+
136
+ rewards, old_prob = batch["rewards"], batch["action_prob"]
137
+ new_prob = self.compute_action_probs(batch)
138
+
139
+ q_values = self.model.estimate_q(batch)
140
+ q_values = convert_to_numpy(q_values)
141
+ v_values = self.model.estimate_v(batch)
142
+ v_values = convert_to_numpy(v_values)
143
+
144
+ v_behavior = rewards
145
+
146
+ weight = new_prob / old_prob
147
+ v_target = v_values + weight * (rewards - q_values)
148
+
149
+ estimates_per_epsiode["v_behavior"] = v_behavior
150
+ estimates_per_epsiode["v_target"] = v_target
151
+
152
+ return estimates_per_epsiode
153
+
154
+ @override(OffPolicyEstimator)
155
+ def train(self, batch: SampleBatchType) -> Dict[str, Any]:
156
+ """Trains self.model on the given batch.
157
+
158
+ Args:
159
+ batch: A SampleBatch or MultiAgentbatch to train on
160
+
161
+ Returns:
162
+ A dict with key "loss" and value as the mean training loss.
163
+ """
164
+ batch = convert_ma_batch_to_sample_batch(batch)
165
+ losses = self.model.train(batch)
166
+ return {"loss": np.mean(losses)}
167
+
168
+ @override(OfflineEvaluator)
169
+ def estimate_on_dataset(
170
+ self, dataset: Dataset, *, n_parallelism: int = ...
171
+ ) -> Dict[str, Any]:
172
+ """Estimates the policy value using the Doubly Robust estimator.
173
+
174
+ The doubly robust estimator uses normalization of importance sampling weights
175
+ (aka. propensity ratios) to the average of the importance weights across the
176
+ entire dataset. This is done to reduce the variance of the estimate (similar to
177
+ weighted importance sampling). You can disable this by setting
178
+ `normalize_weights=False` in the constructor.
179
+
180
+ Note: This estimate works for only discrete action spaces for now.
181
+
182
+ Args:
183
+ dataset: Dataset to compute the estimate on. Each record in dataset should
184
+ include the following columns: `obs`, `actions`, `action_prob` and
185
+ `rewards`. The `obs` on each row shoud be a vector of D dimensions.
186
+ n_parallelism: Number of parallelism to use for the computation.
187
+
188
+ Returns:
189
+ A dict with the following keys:
190
+ v_target: The estimated value of the target policy.
191
+ v_behavior: The estimated value of the behavior policy.
192
+ v_gain: The estimated gain of the target policy over the behavior
193
+ policy.
194
+ v_std: The standard deviation of the estimated value of the target.
195
+ """
196
+
197
+ # step 1: compute the weights and weighted rewards
198
+ batch_size = max(dataset.count() // n_parallelism, 1)
199
+ updated_ds = dataset.map_batches(
200
+ compute_is_weights,
201
+ batch_size=batch_size,
202
+ batch_format="pandas",
203
+ fn_kwargs={
204
+ "policy_state": self.policy.get_state(),
205
+ "estimator_class": self.__class__,
206
+ },
207
+ )
208
+
209
+ # step 2: compute q_values and v_values
210
+ batch_size = max(updated_ds.count() // n_parallelism, 1)
211
+ updated_ds = updated_ds.map_batches(
212
+ compute_q_and_v_values,
213
+ batch_size=batch_size,
214
+ batch_format="pandas",
215
+ fn_kwargs={
216
+ "model_class": self.model.__class__,
217
+ "model_state": self.model.get_state(),
218
+ },
219
+ )
220
+
221
+ # step 3: compute the v_target
222
+ def compute_v_target(batch: pd.DataFrame, normalizer: float = 1.0):
223
+ weights = batch["weights"] / normalizer
224
+ batch["v_target"] = batch["v_values"] + weights * (
225
+ batch["rewards"] - batch["q_values"]
226
+ )
227
+ batch["v_behavior"] = batch["rewards"]
228
+ return batch
229
+
230
+ normalizer = updated_ds.mean("weights") if self._normalize_weights else 1.0
231
+ updated_ds = updated_ds.map_batches(
232
+ compute_v_target,
233
+ batch_size=batch_size,
234
+ batch_format="pandas",
235
+ fn_kwargs={"normalizer": normalizer},
236
+ )
237
+
238
+ v_behavior = updated_ds.mean("v_behavior")
239
+ v_target = updated_ds.mean("v_target")
240
+ v_gain_mean = v_target / v_behavior
241
+ v_gain_ste = (
242
+ updated_ds.std("v_target")
243
+ / normalizer
244
+ / v_behavior
245
+ / math.sqrt(dataset.count())
246
+ )
247
+
248
+ return {
249
+ "v_behavior": v_behavior,
250
+ "v_target": v_target,
251
+ "v_gain_mean": v_gain_mean,
252
+ "v_gain_ste": v_gain_ste,
253
+ }
.venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/fqe_torch_model.py ADDED
@@ -0,0 +1,297 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, Any
2
+ from ray.rllib.models.utils import get_initializer
3
+ from ray.rllib.policy import Policy
4
+
5
+ from ray.rllib.models.catalog import ModelCatalog
6
+ from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
7
+ from ray.rllib.policy.sample_batch import SampleBatch
8
+ from ray.rllib.utils.annotations import DeveloperAPI
9
+ from ray.rllib.utils.framework import try_import_torch
10
+ from ray.rllib.utils.annotations import is_overridden
11
+ from ray.rllib.utils.typing import ModelConfigDict, TensorType
12
+ from gymnasium.spaces import Discrete
13
+
14
+ torch, nn = try_import_torch()
15
+
16
+ # TODO: Create a config object for FQE and unify it with the RLModule API
17
+
18
+
19
@DeveloperAPI
class FQETorchModel:
    """PyTorch implementation of Fitted Q-Evaluation (FQE).

    FQE learns a Q-function for the policy under evaluation from off-policy
    data by repeatedly regressing Q(s, a) onto Bellman targets computed with
    a (soft-updated) target network.
    Reference: https://arxiv.org/abs/1911.06854
    """

    def __init__(
        self,
        policy: Policy,
        gamma: float,
        model_config: ModelConfigDict = None,
        n_iters: int = 1,
        lr: float = 1e-3,
        min_loss_threshold: float = 1e-4,
        clip_grad_norm: float = 100.0,
        minibatch_size: int = None,
        polyak_coef: float = 1.0,
    ) -> None:
        """Initializes an FQETorchModel.

        Args:
            policy: Policy to evaluate; must have a discrete action space.
            gamma: Discount factor of the environment.
            model_config: The ModelConfigDict for self.q_model; defaults to a
                3x32 fully-connected ReLU net (see body).
            n_iters: Number of gradient epochs to run per `train()` call.
            lr: Learning rate for the Adam optimizer.
            min_loss_threshold: Stop training early once the mean epoch loss
                drops below this value.
            clip_grad_norm: Clip gradients to this maximum global norm.
            minibatch_size: Minibatch size for training the Q-function;
                if None, train on the whole batch at once.
            polyak_coef: Polyak averaging factor for target Q-network updates
                (1.0 = hard copy).
        """
        self.policy = policy
        assert isinstance(
            policy.action_space, Discrete
        ), f"{self.__class__.__name__} only supports discrete action spaces!"
        self.gamma = gamma
        self.observation_space = policy.observation_space
        self.action_space = policy.action_space

        if model_config is None:
            model_config = {
                "fcnet_hiddens": [32, 32, 32],
                "fcnet_activation": "relu",
                "vf_share_layers": True,
            }
        self.model_config = model_config

        # Build online and target Q-networks on the policy's device; each
        # outputs one Q-value per discrete action.
        self.device = self.policy.device
        self.q_model: TorchModelV2 = ModelCatalog.get_model_v2(
            self.observation_space,
            self.action_space,
            self.action_space.n,
            model_config,
            framework="torch",
            name="TorchQModel",
        ).to(self.device)

        self.target_q_model: TorchModelV2 = ModelCatalog.get_model_v2(
            self.observation_space,
            self.action_space,
            self.action_space.n,
            model_config,
            framework="torch",
            name="TargetTorchQModel",
        ).to(self.device)

        self.n_iters = n_iters
        self.lr = lr
        self.min_loss_threshold = min_loss_threshold
        self.clip_grad_norm = clip_grad_norm
        self.minibatch_size = minibatch_size
        self.polyak_coef = polyak_coef
        self.optimizer = torch.optim.Adam(self.q_model.variables(), self.lr)
        initializer = get_initializer("xavier_uniform", framework="torch")
        # Hard-copy the online weights into the target network once.
        self.update_target(polyak_coef=1.0)

        def f(m):
            if isinstance(m, nn.Linear):
                initializer(m.weight)

        # NOTE(review): `self.initializer` is stored here but never applied to
        # the models within this class — confirm callers are expected to use it.
        self.initializer = f

    def train(self, batch: SampleBatch) -> TensorType:
        """Trains self.q_model using the FQE loss on the given batch.

        Args:
            batch: A SampleBatch of episodes to train on.

        Returns:
            A list of mean losses, one entry per training iteration.
        """
        losses = []
        minibatch_size = self.minibatch_size or batch.count
        # Shallow-copy so shuffling does not reorder the caller's batch.
        batch = batch.copy(shallow=True)
        for _ in range(self.n_iters):
            minibatch_losses = []
            batch.shuffle()
            for idx in range(0, batch.count, minibatch_size):
                minibatch = batch[idx : idx + minibatch_size]
                obs = torch.tensor(minibatch[SampleBatch.OBS], device=self.device)
                actions = torch.tensor(
                    minibatch[SampleBatch.ACTIONS],
                    device=self.device,
                    dtype=int,
                )
                rewards = torch.tensor(
                    minibatch[SampleBatch.REWARDS], device=self.device
                )
                next_obs = torch.tensor(
                    minibatch[SampleBatch.NEXT_OBS], device=self.device
                )
                dones = torch.tensor(
                    minibatch[SampleBatch.TERMINATEDS], device=self.device, dtype=float
                )

                # Q-values of the actions actually taken in the batch.
                q_values, _ = self.q_model({"obs": obs}, [], None)
                q_acts = torch.gather(q_values, -1, actions.unsqueeze(-1)).squeeze(-1)

                next_action_probs = self._compute_action_probs(next_obs)

                # Bootstrap targets come from the frozen target network.
                with torch.no_grad():
                    next_q_values, _ = self.target_q_model({"obs": next_obs}, [], None)

                # Estimated state value next_v = E_{a ~ pi(s)} [Q(next_obs, a)]
                next_v = torch.sum(next_q_values * next_action_probs, axis=-1)
                targets = rewards + (1 - dones) * self.gamma * next_v
                loss = (targets - q_acts) ** 2
                loss = torch.mean(loss)
                self.optimizer.zero_grad()
                loss.backward()
                nn.utils.clip_grad.clip_grad_norm_(
                    self.q_model.variables(), self.clip_grad_norm
                )
                self.optimizer.step()
                minibatch_losses.append(loss.item())
            iter_loss = sum(minibatch_losses) / len(minibatch_losses)
            losses.append(iter_loss)
            if iter_loss < self.min_loss_threshold:
                break
            self.update_target()
        return losses

    def estimate_q(self, batch: SampleBatch) -> TensorType:
        """Returns Q(s, a) for the observations/actions in `batch` (no grads)."""
        obs = torch.tensor(batch[SampleBatch.OBS], device=self.device)
        with torch.no_grad():
            q_values, _ = self.q_model({"obs": obs}, [], None)
            actions = torch.tensor(
                batch[SampleBatch.ACTIONS], device=self.device, dtype=int
            )
            q_values = torch.gather(q_values, -1, actions.unsqueeze(-1)).squeeze(-1)
        return q_values

    def estimate_v(self, batch: SampleBatch) -> TensorType:
        """Returns V(s) = E_{a ~ pi(s)}[Q(s, a)] for observations in `batch`."""
        obs = torch.tensor(batch[SampleBatch.OBS], device=self.device)
        with torch.no_grad():
            q_values, _ = self.q_model({"obs": obs}, [], None)
            # Compute pi(a | s) for each action a in policy.action_space.
            action_probs = self._compute_action_probs(obs)
            v_values = torch.sum(q_values * action_probs, axis=-1)
        return v_values

    def update_target(self, polyak_coef=None):
        """Polyak-averages the Q-network weights into the target network.

        Args:
            polyak_coef: Fraction of the online Q-model to blend into the
                target (1.0 = full hard sync). Defaults to self.polyak_coef.
        """
        # Bug fix: use an explicit None-check instead of
        # `polyak_coef or self.polyak_coef`, which silently replaced an
        # explicitly passed 0.0 with the instance default.
        if polyak_coef is None:
            polyak_coef = self.polyak_coef
        q_state = self.q_model.state_dict()
        target_state = self.target_q_model.state_dict()
        # Soft sync; if polyak_coef == 1.0 this is a full copy of q_state.
        blended_state = {
            k: polyak_coef * q_state[k] + (1 - polyak_coef) * v
            for k, v in target_state.items()
        }
        self.target_q_model.load_state_dict(blended_state)

    def _compute_action_probs(self, obs: TensorType) -> TensorType:
        """Compute action distribution over the action space.

        Args:
            obs: A tensor of observations of shape (batch_size * obs_dim)

        Returns:
            action_probs: A tensor of action probabilities
            of shape (batch_size * action_dim)
        """
        input_dict = {SampleBatch.OBS: obs}
        seq_lens = torch.ones(len(obs), device=self.device, dtype=int)
        state_batches = []
        if is_overridden(self.policy.action_distribution_fn):
            try:
                # TorchPolicyV2 function signature
                dist_inputs, dist_class, _ = self.policy.action_distribution_fn(
                    self.policy.model,
                    obs_batch=input_dict,
                    state_batches=state_batches,
                    seq_lens=seq_lens,
                    explore=False,
                    is_training=False,
                )
            except TypeError:
                # TorchPolicyV1 function signature for compatibility with DQN
                # TODO: Remove this once DQNTorchPolicy is migrated to PolicyV2
                dist_inputs, dist_class, _ = self.policy.action_distribution_fn(
                    self.policy,
                    self.policy.model,
                    input_dict=input_dict,
                    state_batches=state_batches,
                    seq_lens=seq_lens,
                    explore=False,
                    is_training=False,
                )
        else:
            dist_class = self.policy.dist_class
            dist_inputs, _ = self.policy.model(input_dict, state_batches, seq_lens)
        action_dist = dist_class(dist_inputs, self.policy.model)
        assert isinstance(
            action_dist.dist, torch.distributions.categorical.Categorical
        ), "FQE only supports Categorical or MultiCategorical distributions!"
        action_probs = action_dist.dist.probs
        return action_probs

    def get_state(self) -> Dict[str, Any]:
        """Returns the current state of the FQE Model."""
        return {
            "policy_state": self.policy.get_state(),
            "model_config": self.model_config,
            "n_iters": self.n_iters,
            "lr": self.lr,
            "min_loss_threshold": self.min_loss_threshold,
            "clip_grad_norm": self.clip_grad_norm,
            "minibatch_size": self.minibatch_size,
            "polyak_coef": self.polyak_coef,
            "gamma": self.gamma,
            "q_model_state": self.q_model.state_dict(),
            "target_q_model_state": self.target_q_model.state_dict(),
        }

    def set_state(self, state: Dict[str, Any]) -> None:
        """Sets the current state of the FQE Model.

        Args:
            state: A state dict returned by `get_state()`.
        """
        self.n_iters = state["n_iters"]
        self.lr = state["lr"]
        self.min_loss_threshold = state["min_loss_threshold"]
        self.clip_grad_norm = state["clip_grad_norm"]
        self.minibatch_size = state["minibatch_size"]
        self.polyak_coef = state["polyak_coef"]
        self.gamma = state["gamma"]
        self.policy.set_state(state["policy_state"])
        self.q_model.load_state_dict(state["q_model_state"])
        self.target_q_model.load_state_dict(state["target_q_model_state"])

    @classmethod
    def from_state(cls, state: Dict[str, Any]) -> "FQETorchModel":
        """Creates an FQE Model from a state dict.

        Args:
            state: A state dict returned by `get_state`.

        Returns:
            An instance of the FQETorchModel.
        """
        policy = Policy.from_state(state["policy_state"])
        model = cls(
            policy=policy, gamma=state["gamma"], model_config=state["model_config"]
        )
        model.set_state(state)
        return model
.venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/importance_sampling.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, List, Any
2
+ import math
3
+
4
+ from ray.data import Dataset
5
+
6
+ from ray.rllib.utils.annotations import override, DeveloperAPI
7
+ from ray.rllib.offline.offline_evaluator import OfflineEvaluator
8
+ from ray.rllib.offline.offline_evaluation_utils import (
9
+ remove_time_dim,
10
+ compute_is_weights,
11
+ )
12
+ from ray.rllib.offline.estimators.off_policy_estimator import OffPolicyEstimator
13
+ from ray.rllib.policy.sample_batch import SampleBatch
14
+
15
+
16
@DeveloperAPI
class ImportanceSampling(OffPolicyEstimator):
    r"""The step-wise IS estimator.

    Let s_t, a_t, and r_t be the state, action, and reward at timestep t.

    For behavior policy \pi_b and evaluation policy \pi_e, define the
    cumulative importance ratio at timestep t as:
    p_t = \sum_{t'=0}^t (\pi_e(a_{t'} | s_{t'}) / \pi_b(a_{t'} | s_{t'})).

    This estimator computes the expected return for \pi_e for an episode as:
    V^{\pi_e}(s_0) = \sum_t \gamma ^ {t} * p_t * r_t
    and returns the mean and standard deviation over episodes.

    For more information refer to https://arxiv.org/pdf/1911.06854.pdf"""

    @override(OffPolicyEstimator)
    def estimate_on_single_episode(self, episode: SampleBatch) -> Dict[str, float]:
        """Step-wise IS estimate of behavior and target returns for one episode."""
        results = {}

        rewards, old_prob = episode["rewards"], episode["action_prob"]
        new_prob = self.compute_action_probs(episode)

        # Running product of per-step likelihood ratios pi_e / pi_b.
        cum_ratios = []
        running = 1.0
        for t in range(episode.count):
            running = running * new_prob[t] / old_prob[t]
            cum_ratios.append(running)

        # Discounted returns: behavior uses raw rewards, target reweights
        # each step by its cumulative importance ratio.
        v_behavior = 0.0
        v_target = 0.0
        for t, reward in enumerate(rewards):
            discount = self.gamma**t
            v_behavior += reward * discount
            v_target += cum_ratios[t] * reward * discount

        results["v_behavior"] = v_behavior
        results["v_target"] = v_target

        return results

    @override(OffPolicyEstimator)
    def estimate_on_single_step_samples(
        self, batch: SampleBatch
    ) -> Dict[str, List[float]]:
        """Per-record IS estimate for a batch of single-timestep episodes."""
        rewards, old_prob = batch["rewards"], batch["action_prob"]
        new_prob = self.compute_action_probs(batch)

        # With one step per episode the cumulative ratio is just the
        # single-step ratio.
        ratios = new_prob / old_prob

        return {
            "v_behavior": rewards,
            "v_target": ratios * rewards,
        }

    @override(OfflineEvaluator)
    def estimate_on_dataset(
        self, dataset: Dataset, *, n_parallelism: int = ...
    ) -> Dict[str, Any]:
        """Computes the Importance sampling estimate on the given dataset.

        Note: This estimate works for both continuous and discrete action spaces.

        Args:
            dataset: Dataset to compute the estimate on. Each record in dataset should
                include the following columns: `obs`, `actions`, `action_prob` and
                `rewards`. The `obs` on each row should be a vector of D dimensions.
            n_parallelism: The number of parallel workers to use.

        Returns:
            A dictionary containing the following keys:
                v_target: The estimated value of the target policy.
                v_behavior: The estimated value of the behavior policy.
                v_gain_mean: The mean of the gain of the target policy over the
                    behavior policy.
                v_gain_ste: The standard error of the gain of the target policy over
                    the behavior policy.
        """
        batch_size = max(dataset.count() // n_parallelism, 1)
        # Drop the time dimension, then attach per-record IS weights and
        # weighted rewards computed from the target policy.
        dataset = dataset.map_batches(
            remove_time_dim, batch_size=batch_size, batch_format="pandas"
        )
        weighted_ds = dataset.map_batches(
            compute_is_weights,
            batch_size=batch_size,
            batch_format="pandas",
            fn_kwargs={
                "policy_state": self.policy.get_state(),
                "estimator_class": self.__class__,
            },
        )

        v_target = weighted_ds.mean("weighted_rewards")
        v_behavior = weighted_ds.mean("rewards")
        v_gain_mean = v_target / v_behavior
        # Standard error of the gain across all records.
        v_gain_ste = (
            weighted_ds.std("weighted_rewards") / v_behavior / math.sqrt(dataset.count())
        )

        return {
            "v_target": v_target,
            "v_behavior": v_behavior,
            "v_gain_mean": v_gain_mean,
            "v_gain_ste": v_gain_ste,
        }
.venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/off_policy_estimator.py ADDED
@@ -0,0 +1,248 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gymnasium as gym
2
+ import numpy as np
3
+ import tree
4
+ from typing import Dict, Any, List
5
+
6
+ import logging
7
+ from ray.rllib.policy.sample_batch import SampleBatch
8
+ from ray.rllib.policy import Policy
9
+ from ray.rllib.policy.sample_batch import convert_ma_batch_to_sample_batch
10
+ from ray.rllib.utils.policy import compute_log_likelihoods_from_input_dict
11
+ from ray.rllib.utils.annotations import (
12
+ DeveloperAPI,
13
+ ExperimentalAPI,
14
+ OverrideToImplementCustomLogic,
15
+ )
16
+ from ray.rllib.utils.deprecation import Deprecated
17
+ from ray.rllib.utils.numpy import convert_to_numpy
18
+ from ray.rllib.utils.typing import TensorType, SampleBatchType
19
+ from ray.rllib.offline.offline_evaluator import OfflineEvaluator
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+
24
@DeveloperAPI
class OffPolicyEstimator(OfflineEvaluator):
    """Interface for an off policy estimator for counterfactual evaluation."""

    @DeveloperAPI
    def __init__(
        self,
        policy: Policy,
        gamma: float = 0.0,
        epsilon_greedy: float = 0.0,
    ):
        """Initializes an OffPolicyEstimator instance.

        Args:
            policy: Policy to evaluate.
            gamma: Discount factor of the environment.
            epsilon_greedy: The probability by which we act according to a fully
                random policy during deployment. With 1-epsilon_greedy we act
                according to the target policy.
        # TODO (kourosh): convert the input parameters to a config dict.
        """
        super().__init__(policy)
        self.gamma = gamma
        self.epsilon_greedy = epsilon_greedy

    @DeveloperAPI
    def estimate_on_single_episode(self, episode: SampleBatch) -> Dict[str, Any]:
        """Returns off-policy estimates for the given single episode.

        Args:
            episode: The episode to calculate the off-policy estimates (OPE) on.
                Must be a sample batch type containing the fields "obs",
                "actions", and "action_prob", and must represent a complete
                trajectory.

        Returns:
            The off-policy estimates (OPE) calculated on the given episode. The
            returned dict can be any arbitrary mapping of strings to metrics.
        """
        raise NotImplementedError

    @DeveloperAPI
    def estimate_on_single_step_samples(
        self,
        batch: SampleBatch,
    ) -> Dict[str, List[float]]:
        """Returns off-policy estimates for a batch of single timesteps.

        Highly optimized for bandits, where each episode is a single timestep.

        Args:
            batch: The batch to calculate the off-policy estimates (OPE) on.
                Must be a sample batch type containing the fields "obs",
                "actions", and "action_prob".

        Returns:
            The off-policy estimates (OPE) calculated on the given batch of
            single time step samples. The returned dict can be any arbitrary
            mapping of strings to a list of floats (one value per record).
        """
        raise NotImplementedError

    def on_before_split_batch_by_episode(
        self, sample_batch: SampleBatch
    ) -> SampleBatch:
        """Hook called before the batch is split by episode.

        Perform any preprocessing on the batch here, e.g. adding done flags,
        or resetting per-episode stats tracked later during estimation.

        Args:
            sample_batch: The batch to split by episode. This contains multiple
                episodes.

        Returns:
            The modified batch before calling split_by_episode().
        """
        return sample_batch

    @OverrideToImplementCustomLogic
    def on_after_split_batch_by_episode(
        self, all_episodes: List[SampleBatch]
    ) -> List[SampleBatch]:
        """Hook called after the batch is split by episode.

        Perform any per-episode postprocessing here, e.g. computing advantages.

        Args:
            all_episodes: The list of episodes in the original batch. Each
                element is a sample batch type that is a single episode.
        """

        return all_episodes

    @OverrideToImplementCustomLogic
    def peek_on_single_episode(self, episode: SampleBatch) -> None:
        """Called on each episode before it reaches estimate_on_single_episode().

        This gives a peek at the entire validation dataset before running the
        estimation. For example, if any normalization is needed, the
        normalization parameters can be computed here.

        Args:
            episode: The episode that is split from the original batch. This is
                a sample batch type that is a single episode.
        """
        pass

    @DeveloperAPI
    def estimate(
        self, batch: SampleBatchType, split_batch_by_episode: bool = True
    ) -> Dict[str, Any]:
        """Compute off-policy estimates.

        Args:
            batch: The batch to calculate the off-policy estimates (OPE) on. The
                batch must contain the fields "obs", "actions", and "action_prob".
            split_batch_by_episode: Whether to split the batch by episode.

        Returns:
            The off-policy estimates (OPE) calculated on the given batch. The
            dict consists of the following metrics:
            - v_behavior: The discounted return averaged over episodes in the batch
            - v_behavior_std: The standard deviation corresponding to v_behavior
            - v_target: The estimated discounted return for `self.policy`,
              averaged over episodes in the batch
            - v_target_std: The standard deviation corresponding to v_target
            - v_gain: v_target / max(v_behavior, 1e-8)
            - v_delta: The difference between v_target and v_behavior.
        """
        batch = convert_ma_batch_to_sample_batch(batch)
        self.check_action_prob_in_batch(batch)
        if split_batch_by_episode:
            batch = self.on_before_split_batch_by_episode(batch)
            all_episodes = self.on_after_split_batch_by_episode(
                batch.split_by_episode()
            )
            # First pass: sanity-check episode boundaries and let subclasses
            # peek at the full dataset (e.g. to compute normalizers).
            for episode in all_episodes:
                assert len(set(episode[SampleBatch.EPS_ID])) == 1, (
                    "The episode must contain only one episode id. For some reason "
                    "the split_by_episode() method could not successfully split "
                    "the batch by episodes. Each row in the dataset should be "
                    "one episode. Check your evaluation dataset for errors."
                )
                self.peek_on_single_episode(episode)

            # Second pass: run the actual per-episode estimation.
            per_episode_results = [
                self.estimate_on_single_episode(episode) for episode in all_episodes
            ]
            # Turn a list of identically-keyed dicts into a dict of lists.
            per_episode_results = tree.map_structure(
                lambda *x: list(x), *per_episode_results
            )
        else:
            # The returned dict maps strings to a list of floats.
            per_episode_results = self.estimate_on_single_step_samples(batch)

        estimates = {
            "v_behavior": np.mean(per_episode_results["v_behavior"]),
            "v_behavior_std": np.std(per_episode_results["v_behavior"]),
            "v_target": np.mean(per_episode_results["v_target"]),
            "v_target_std": np.std(per_episode_results["v_target"]),
        }
        estimates["v_gain"] = estimates["v_target"] / max(estimates["v_behavior"], 1e-8)
        estimates["v_delta"] = estimates["v_target"] - estimates["v_behavior"]

        return estimates

    @DeveloperAPI
    def check_action_prob_in_batch(self, batch: SampleBatchType) -> None:
        """Checks if we support off policy estimation (OPE) on given batch.

        Args:
            batch: The batch to check.

        Raises:
            ValueError: In case `action_prob` key is not in batch
        """

        if "action_prob" not in batch:
            raise ValueError(
                "Off-policy estimation is not possible unless the inputs "
                "include action probabilities (i.e., the policy is stochastic "
                "and emits the 'action_prob' key). For DQN this means using "
                "`exploration_config: {type: 'SoftQ'}`. You can also set "
                "`off_policy_estimation_methods: {}` to disable estimation."
            )

    @ExperimentalAPI
    def compute_action_probs(self, batch: SampleBatch):
        """Returns the target policy's probability of the logged actions.

        Optionally mixes in epsilon-greedy exploration probability mass.
        """
        log_likelihoods = compute_log_likelihoods_from_input_dict(self.policy, batch)
        new_prob = np.exp(convert_to_numpy(log_likelihoods))

        if self.epsilon_greedy > 0.0:
            if not isinstance(self.policy.action_space, gym.spaces.Discrete):
                raise ValueError(
                    "Evaluation with epsilon-greedy exploration is only supported "
                    "with discrete action spaces."
                )
            eps = self.epsilon_greedy
            new_prob = new_prob * (1 - eps) + eps / self.policy.action_space.n

        return new_prob

    @DeveloperAPI
    def train(self, batch: SampleBatchType) -> Dict[str, Any]:
        """Train a model for Off-Policy Estimation.

        Args:
            batch: SampleBatch to train on

        Returns:
            Any optional metrics to return from the estimator
        """
        return {}

    @Deprecated(
        old="OffPolicyEstimator.action_log_likelihood",
        new="ray.rllib.utils.policy.compute_log_likelihoods_from_input_dict",
        error=True,
    )
    def action_log_likelihood(self, batch: SampleBatchType) -> TensorType:
        log_likelihoods = compute_log_likelihoods_from_input_dict(self.policy, batch)
        return convert_to_numpy(log_likelihoods)
.venv/lib/python3.11/site-packages/ray/rllib/utils/__init__.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import contextlib
2
+ from functools import partial
3
+
4
+ from ray.rllib.utils.annotations import override, PublicAPI, DeveloperAPI
5
+ from ray.rllib.utils.deprecation import deprecation_warning
6
+ from ray.rllib.utils.filter import Filter
7
+ from ray.rllib.utils.filter_manager import FilterManager
8
+ from ray.rllib.utils.framework import (
9
+ try_import_jax,
10
+ try_import_tf,
11
+ try_import_tfp,
12
+ try_import_torch,
13
+ )
14
+ from ray.rllib.utils.numpy import (
15
+ sigmoid,
16
+ softmax,
17
+ relu,
18
+ one_hot,
19
+ fc,
20
+ lstm,
21
+ SMALL_NUMBER,
22
+ LARGE_INTEGER,
23
+ MIN_LOG_NN_OUTPUT,
24
+ MAX_LOG_NN_OUTPUT,
25
+ )
26
+ from ray.rllib.utils.schedules import (
27
+ LinearSchedule,
28
+ PiecewiseSchedule,
29
+ PolynomialSchedule,
30
+ ExponentialSchedule,
31
+ ConstantSchedule,
32
+ )
33
+ from ray.rllib.utils.test_utils import (
34
+ check,
35
+ check_compute_single_action,
36
+ check_train_results,
37
+ )
38
+ from ray.tune.utils import merge_dicts, deep_update
39
+
40
+
41
@DeveloperAPI
def add_mixins(base, mixins, reversed=False):
    """Returns a new class with mixins applied in priority order.

    Mixins are consumed from the end of the list; each one is combined with
    the accumulated base via a fresh subclass. With `reversed=False` the
    mixin takes MRO precedence over the base; with `reversed=True` the base
    takes precedence.
    """
    remaining = list(mixins or [])

    while remaining:
        mixin = remaining.pop()
        # Same class name as the original inline `class new_base` statements.
        if reversed:
            base = type("new_base", (base, mixin), {})
        else:
            base = type("new_base", (mixin, base), {})

    return base
61
+
62
+
63
@DeveloperAPI
def force_list(elements=None, to_tuple=False):
    """
    Makes sure `elements` is returned as a list, whether `elements` is a single
    item, already a list, or a tuple.

    Args:
        elements (Optional[any]): The inputs as single item, list, or tuple to
            be converted into a list/tuple. If None, returns empty list/tuple.
        to_tuple: Whether to use tuple (instead of list).

    Returns:
        Union[list,tuple]: All given elements in a list/tuple depending on
            `to_tuple`'s value. If elements is None,
            returns an empty list/tuple.
    """
    # Only the literal True selects tuple, matching the original `is True` test.
    container = tuple if to_tuple is True else list
    if elements is None:
        return container()
    if type(elements) in [list, set, tuple]:
        return container(elements)
    # Single non-collection item: wrap it.
    return container([elements])
89
+
90
+
91
@DeveloperAPI
class NullContextManager(contextlib.AbstractContextManager):
    """No-op context manager: enters and exits without any side effects."""

    def __init__(self):
        # Nothing to set up.
        pass

    def __enter__(self):
        # Yields nothing to the `with ... as` target.
        pass

    def __exit__(self, *args):
        # Returning None propagates any exception raised in the body.
        pass
103
+
104
+
105
# Tuple-returning variant of `force_list`.
force_tuple = partial(force_list, to_tuple=True)

# Public API of this module.
__all__ = [
    "add_mixins",
    "check",
    "check_compute_single_action",
    "check_train_results",
    "deep_update",
    "deprecation_warning",
    "fc",
    "force_list",
    "force_tuple",
    "lstm",
    "merge_dicts",
    "one_hot",
    "override",
    "relu",
    "sigmoid",
    "softmax",
    "try_import_jax",
    "try_import_tf",
    "try_import_tfp",
    "try_import_torch",
    "ConstantSchedule",
    "DeveloperAPI",
    "ExponentialSchedule",
    "Filter",
    "FilterManager",
    "LARGE_INTEGER",
    "LinearSchedule",
    "MAX_LOG_NN_OUTPUT",
    "MIN_LOG_NN_OUTPUT",
    "PiecewiseSchedule",
    "PolynomialSchedule",
    "PublicAPI",
    "SMALL_NUMBER",
]
.venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/checkpoints.cpython-311.pyc ADDED
Binary file (42.5 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/compression.cpython-311.pyc ADDED
Binary file (4.1 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/deprecation.cpython-311.pyc ADDED
Binary file (5.27 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/from_config.cpython-311.pyc ADDED
Binary file (11.9 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/lambda_defaultdict.cpython-311.pyc ADDED
Binary file (2.79 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/memory.cpython-311.pyc ADDED
Binary file (523 Bytes). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/serialization.cpython-311.pyc ADDED
Binary file (20.9 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/torch_utils.cpython-311.pyc ADDED
Binary file (32 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/utils/actors.py ADDED
@@ -0,0 +1,258 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import defaultdict, deque
2
+ import logging
3
+ import platform
4
+ from typing import Any, Dict, List, Optional, Sequence, Tuple, Type
5
+
6
+ import ray
7
+ from ray.actor import ActorClass, ActorHandle
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
+ class TaskPool:
13
+ """Helper class for tracking the status of many in-flight actor tasks."""
14
+
15
+ def __init__(self):
16
+ self._tasks = {}
17
+ self._objects = {}
18
+ self._fetching = deque()
19
+
20
+ def add(self, worker, all_obj_refs):
21
+ if isinstance(all_obj_refs, list):
22
+ obj_ref = all_obj_refs[0]
23
+ else:
24
+ obj_ref = all_obj_refs
25
+ self._tasks[obj_ref] = worker
26
+ self._objects[obj_ref] = all_obj_refs
27
+
28
+ def completed(self, blocking_wait=False):
29
+ pending = list(self._tasks)
30
+ if pending:
31
+ ready, _ = ray.wait(pending, num_returns=len(pending), timeout=0)
32
+ if not ready and blocking_wait:
33
+ ready, _ = ray.wait(pending, num_returns=1, timeout=10.0)
34
+ for obj_ref in ready:
35
+ yield (self._tasks.pop(obj_ref), self._objects.pop(obj_ref))
36
+
37
+ def completed_prefetch(self, blocking_wait=False, max_yield=999):
38
+ """Similar to completed but only returns once the object is local.
39
+
40
+ Assumes obj_ref only is one id."""
41
+
42
+ for worker, obj_ref in self.completed(blocking_wait=blocking_wait):
43
+ self._fetching.append((worker, obj_ref))
44
+
45
+ for _ in range(max_yield):
46
+ if not self._fetching:
47
+ break
48
+
49
+ yield self._fetching.popleft()
50
+
51
+ def reset_workers(self, workers):
52
+ """Notify that some workers may be removed."""
53
+ for obj_ref, ev in self._tasks.copy().items():
54
+ if ev not in workers:
55
+ del self._tasks[obj_ref]
56
+ del self._objects[obj_ref]
57
+
58
+ # We want to keep the same deque reference so that we don't suffer from
59
+ # stale references in generators that are still in flight
60
+ for _ in range(len(self._fetching)):
61
+ ev, obj_ref = self._fetching.popleft()
62
+ if ev in workers:
63
+ # Re-queue items that are still valid
64
+ self._fetching.append((ev, obj_ref))
65
+
66
+ @property
67
+ def count(self):
68
+ return len(self._tasks)
69
+
70
+
71
+ def create_colocated_actors(
72
+ actor_specs: Sequence[Tuple[Type, Any, Any, int]],
73
+ node: Optional[str] = "localhost",
74
+ max_attempts: int = 10,
75
+ ) -> Dict[Type, List[ActorHandle]]:
76
+ """Create co-located actors of any type(s) on any node.
77
+
78
+ Args:
79
+ actor_specs: Tuple/list with tuples consisting of: 1) The
80
+ (already @ray.remote) class(es) to construct, 2) c'tor args,
81
+ 3) c'tor kwargs, and 4) the number of actors of that class with
82
+ given args/kwargs to construct.
83
+ node: The node to co-locate the actors on. By default ("localhost"),
84
+ place the actors on the node the caller of this function is
85
+ located on. Use None for indicating that any (resource fulfilling)
86
+ node in the cluster may be used.
87
+ max_attempts: The maximum number of co-location attempts to
88
+ perform before throwing an error.
89
+
90
+ Returns:
91
+ A dict mapping the created types to the list of n ActorHandles
92
+ created (and co-located) for that type.
93
+ """
94
+ if node == "localhost":
95
+ node = platform.node()
96
+
97
+ # Maps each entry in `actor_specs` to lists of already co-located actors.
98
+ ok = [[] for _ in range(len(actor_specs))]
99
+
100
+ # Try n times to co-locate all given actor types (`actor_specs`).
101
+ # With each (failed) attempt, increase the number of actors we try to
102
+ # create (on the same node), then kill the ones that have been created in
103
+ # excess.
104
+ for attempt in range(max_attempts):
105
+ # If any attempt to co-locate fails, set this to False and we'll do
106
+ # another attempt.
107
+ all_good = True
108
+ # Process all `actor_specs` in sequence.
109
+ for i, (typ, args, kwargs, count) in enumerate(actor_specs):
110
+ args = args or [] # Allow None.
111
+ kwargs = kwargs or {} # Allow None.
112
+ # We don't have enough actors yet of this spec co-located on
113
+ # the desired node.
114
+ if len(ok[i]) < count:
115
+ co_located = try_create_colocated(
116
+ cls=typ,
117
+ args=args,
118
+ kwargs=kwargs,
119
+ count=count * (attempt + 1),
120
+ node=node,
121
+ )
122
+ # If node did not matter (None), from here on, use the host
123
+ # that the first actor(s) are already co-located on.
124
+ if node is None:
125
+ node = ray.get(co_located[0].get_host.remote())
126
+ # Add the newly co-located actors to the `ok` list.
127
+ ok[i].extend(co_located)
128
+ # If we still don't have enough -> We'll have to do another
129
+ # attempt.
130
+ if len(ok[i]) < count:
131
+ all_good = False
132
+ # We created too many actors for this spec -> Kill/truncate
133
+ # the excess ones.
134
+ if len(ok[i]) > count:
135
+ for a in ok[i][count:]:
136
+ a.__ray_terminate__.remote()
137
+ ok[i] = ok[i][:count]
138
+
139
+ # All `actor_specs` have been fulfilled, return lists of
140
+ # co-located actors.
141
+ if all_good:
142
+ return ok
143
+
144
+ raise Exception("Unable to create enough colocated actors -> aborting.")
145
+
146
+
147
+ def try_create_colocated(
148
+ cls: Type[ActorClass],
149
+ args: List[Any],
150
+ count: int,
151
+ kwargs: Optional[List[Any]] = None,
152
+ node: Optional[str] = "localhost",
153
+ ) -> List[ActorHandle]:
154
+ """Tries to co-locate (same node) a set of Actors of the same type.
155
+
156
+ Returns a list of successfully co-located actors. All actors that could
157
+ not be co-located (with the others on the given node) will not be in this
158
+ list.
159
+
160
+ Creates each actor via it's remote() constructor and then checks, whether
161
+ it has been co-located (on the same node) with the other (already created)
162
+ ones. If not, terminates the just created actor.
163
+
164
+ Args:
165
+ cls: The Actor class to use (already @ray.remote "converted").
166
+ args: List of args to pass to the Actor's constructor. One item
167
+ per to-be-created actor (`count`).
168
+ count: Number of actors of the given `cls` to construct.
169
+ kwargs: Optional list of kwargs to pass to the Actor's constructor.
170
+ One item per to-be-created actor (`count`).
171
+ node: The node to co-locate the actors on. By default ("localhost"),
172
+ place the actors on the node the caller of this function is
173
+ located on. If None, will try to co-locate all actors on
174
+ any available node.
175
+
176
+ Returns:
177
+ List containing all successfully co-located actor handles.
178
+ """
179
+ if node == "localhost":
180
+ node = platform.node()
181
+
182
+ kwargs = kwargs or {}
183
+ actors = [cls.remote(*args, **kwargs) for _ in range(count)]
184
+ co_located, non_co_located = split_colocated(actors, node=node)
185
+ logger.info("Got {} colocated actors of {}".format(len(co_located), count))
186
+ for a in non_co_located:
187
+ a.__ray_terminate__.remote()
188
+ return co_located
189
+
190
+
191
+ def split_colocated(
192
+ actors: List[ActorHandle],
193
+ node: Optional[str] = "localhost",
194
+ ) -> Tuple[List[ActorHandle], List[ActorHandle]]:
195
+ """Splits up given actors into colocated (on same node) and non colocated.
196
+
197
+ The co-location criterion depends on the `node` given:
198
+ If given (or default: platform.node()): Consider all actors that are on
199
+ that node "colocated".
200
+ If None: Consider the largest sub-set of actors that are all located on
201
+ the same node (whatever that node is) as "colocated".
202
+
203
+ Args:
204
+ actors: The list of actor handles to split into "colocated" and
205
+ "non colocated".
206
+ node: The node defining "colocation" criterion. If provided, consider
207
+ thos actors "colocated" that sit on this node. If None, use the
208
+ largest subset within `actors` that are sitting on the same
209
+ (any) node.
210
+
211
+ Returns:
212
+ Tuple of two lists: 1) Co-located ActorHandles, 2) non co-located
213
+ ActorHandles.
214
+ """
215
+ if node == "localhost":
216
+ node = platform.node()
217
+
218
+ # Get nodes of all created actors.
219
+ hosts = ray.get([a.get_host.remote() for a in actors])
220
+
221
+ # If `node` not provided, use the largest group of actors that sit on the
222
+ # same node, regardless of what that node is.
223
+ if node is None:
224
+ node_groups = defaultdict(set)
225
+ for host, actor in zip(hosts, actors):
226
+ node_groups[host].add(actor)
227
+ max_ = -1
228
+ largest_group = None
229
+ for host in node_groups:
230
+ if max_ < len(node_groups[host]):
231
+ max_ = len(node_groups[host])
232
+ largest_group = host
233
+ non_co_located = []
234
+ for host in node_groups:
235
+ if host != largest_group:
236
+ non_co_located.extend(list(node_groups[host]))
237
+ return list(node_groups[largest_group]), non_co_located
238
+ # Node provided (or default: localhost): Consider those actors "colocated"
239
+ # that were placed on `node`.
240
+ else:
241
+ # Split into co-located (on `node) and non-co-located (not on `node`).
242
+ co_located = []
243
+ non_co_located = []
244
+ for host, a in zip(hosts, actors):
245
+ # This actor has been placed on the correct node.
246
+ if host == node:
247
+ co_located.append(a)
248
+ # This actor has been placed on a different node.
249
+ else:
250
+ non_co_located.append(a)
251
+ return co_located, non_co_located
252
+
253
+
254
+ def drop_colocated(actors: List[ActorHandle]) -> List[ActorHandle]:
255
+ colocated, non_colocated = split_colocated(actors)
256
+ for a in colocated:
257
+ a.__ray_terminate__.remote()
258
+ return non_colocated
.venv/lib/python3.11/site-packages/ray/rllib/utils/annotations.py ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from ray.rllib.utils.deprecation import Deprecated
2
+ from ray.util.annotations import _mark_annotated
3
+
4
+
5
+ def override(parent_cls):
6
+ """Decorator for documenting method overrides.
7
+
8
+ Args:
9
+ parent_cls: The superclass that provides the overridden method. If
10
+ `parent_class` does not actually have the method or the class, in which
11
+ method is defined is not a subclass of `parent_class`, an error is raised.
12
+
13
+ .. testcode::
14
+ :skipif: True
15
+
16
+ from ray.rllib.policy import Policy
17
+ class TorchPolicy(Policy):
18
+ ...
19
+ # Indicates that `TorchPolicy.loss()` overrides the parent
20
+ # Policy class' own `loss method. Leads to an error if Policy
21
+ # does not have a `loss` method.
22
+
23
+ @override(Policy)
24
+ def loss(self, model, action_dist, train_batch):
25
+ ...
26
+
27
+ """
28
+
29
+ class OverrideCheck:
30
+ def __init__(self, func, expected_parent_cls):
31
+ self.func = func
32
+ self.expected_parent_cls = expected_parent_cls
33
+
34
+ def __set_name__(self, owner, name):
35
+ # Check if the owner (the class) is a subclass of the expected base class
36
+ if not issubclass(owner, self.expected_parent_cls):
37
+ raise TypeError(
38
+ f"When using the @override decorator, {owner.__name__} must be a "
39
+ f"subclass of {parent_cls.__name__}!"
40
+ )
41
+ # Set the function as a regular method on the class.
42
+ setattr(owner, name, self.func)
43
+
44
+ def decorator(method):
45
+ # Check, whether `method` is actually defined by the parent class.
46
+ if method.__name__ not in dir(parent_cls):
47
+ raise NameError(
48
+ f"When using the @override decorator, {method.__name__} must override "
49
+ f"the respective method (with the same name) of {parent_cls.__name__}!"
50
+ )
51
+
52
+ # Check if the class is a subclass of the expected base class
53
+ OverrideCheck(method, parent_cls)
54
+ return method
55
+
56
+ return decorator
57
+
58
+
59
+ def PublicAPI(obj):
60
+ """Decorator for documenting public APIs.
61
+
62
+ Public APIs are classes and methods exposed to end users of RLlib. You
63
+ can expect these APIs to remain stable across RLlib releases.
64
+
65
+ Subclasses that inherit from a ``@PublicAPI`` base class can be
66
+ assumed part of the RLlib public API as well (e.g., all Algorithm classes
67
+ are in public API because Algorithm is ``@PublicAPI``).
68
+
69
+ In addition, you can assume all algo configurations are part of their
70
+ public API as well.
71
+
72
+ .. testcode::
73
+ :skipif: True
74
+
75
+ # Indicates that the `Algorithm` class is exposed to end users
76
+ # of RLlib and will remain stable across RLlib releases.
77
+ from ray import tune
78
+ @PublicAPI
79
+ class Algorithm(tune.Trainable):
80
+ ...
81
+ """
82
+
83
+ _mark_annotated(obj)
84
+ return obj
85
+
86
+
87
+ def DeveloperAPI(obj):
88
+ """Decorator for documenting developer APIs.
89
+
90
+ Developer APIs are classes and methods explicitly exposed to developers
91
+ for the purposes of building custom algorithms or advanced training
92
+ strategies on top of RLlib internals. You can generally expect these APIs
93
+ to be stable sans minor changes (but less stable than public APIs).
94
+
95
+ Subclasses that inherit from a ``@DeveloperAPI`` base class can be
96
+ assumed part of the RLlib developer API as well.
97
+
98
+ .. testcode::
99
+ :skipif: True
100
+
101
+ # Indicates that the `TorchPolicy` class is exposed to end users
102
+ # of RLlib and will remain (relatively) stable across RLlib
103
+ # releases.
104
+ from ray.rllib.policy import Policy
105
+ @DeveloperAPI
106
+ class TorchPolicy(Policy):
107
+ ...
108
+ """
109
+
110
+ _mark_annotated(obj)
111
+ return obj
112
+
113
+
114
+ def ExperimentalAPI(obj):
115
+ """Decorator for documenting experimental APIs.
116
+
117
+ Experimental APIs are classes and methods that are in development and may
118
+ change at any time in their development process. You should not expect
119
+ these APIs to be stable until their tag is changed to `DeveloperAPI` or
120
+ `PublicAPI`.
121
+
122
+ Subclasses that inherit from a ``@ExperimentalAPI`` base class can be
123
+ assumed experimental as well.
124
+
125
+ .. testcode::
126
+ :skipif: True
127
+
128
+ from ray.rllib.policy import Policy
129
+ class TorchPolicy(Policy):
130
+ ...
131
+ # Indicates that the `TorchPolicy.loss` method is a new and
132
+ # experimental API and may change frequently in future
133
+ # releases.
134
+ @ExperimentalAPI
135
+ def loss(self, model, action_dist, train_batch):
136
+ ...
137
+ """
138
+
139
+ _mark_annotated(obj)
140
+ return obj
141
+
142
+
143
+ def OldAPIStack(obj):
144
+ """Decorator for classes/methods/functions belonging to the old API stack.
145
+
146
+ These should be deprecated at some point after Ray 3.0 (RLlib GA).
147
+ It is recommended for users to start exploring (and coding against) the new API
148
+ stack instead.
149
+ """
150
+ # No effect yet.
151
+
152
+ _mark_annotated(obj)
153
+ return obj
154
+
155
+
156
+ def OverrideToImplementCustomLogic(obj):
157
+ """Users should override this in their sub-classes to implement custom logic.
158
+
159
+ Used in Algorithm and Policy to tag methods that need overriding, e.g.
160
+ `Policy.loss()`.
161
+
162
+ .. testcode::
163
+ :skipif: True
164
+
165
+ from ray.rllib.policy.torch_policy import TorchPolicy
166
+ @overrides(TorchPolicy)
167
+ @OverrideToImplementCustomLogic
168
+ def loss(self, ...):
169
+ # implement custom loss function here ...
170
+ # ... w/o calling the corresponding `super().loss()` method.
171
+ ...
172
+
173
+ """
174
+ obj.__is_overridden__ = False
175
+ return obj
176
+
177
+
178
+ def OverrideToImplementCustomLogic_CallToSuperRecommended(obj):
179
+ """Users should override this in their sub-classes to implement custom logic.
180
+
181
+ Thereby, it is recommended (but not required) to call the super-class'
182
+ corresponding method.
183
+
184
+ Used in Algorithm and Policy to tag methods that need overriding, but the
185
+ super class' method should still be called, e.g.
186
+ `Algorithm.setup()`.
187
+
188
+ .. testcode::
189
+ :skipif: True
190
+
191
+ from ray import tune
192
+ @overrides(tune.Trainable)
193
+ @OverrideToImplementCustomLogic_CallToSuperRecommended
194
+ def setup(self, config):
195
+ # implement custom setup logic here ...
196
+ super().setup(config)
197
+ # ... or here (after having called super()'s setup method.
198
+ """
199
+ obj.__is_overridden__ = False
200
+ return obj
201
+
202
+
203
+ def is_overridden(obj):
204
+ """Check whether a function has been overridden.
205
+
206
+ Note, this only works for API calls decorated with OverrideToImplementCustomLogic
207
+ or OverrideToImplementCustomLogic_CallToSuperRecommended.
208
+ """
209
+ return getattr(obj, "__is_overridden__", True)
210
+
211
+
212
+ # Backward compatibility.
213
+ Deprecated = Deprecated
.venv/lib/python3.11/site-packages/ray/rllib/utils/checkpoints.py ADDED
@@ -0,0 +1,1045 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import abc
2
+ import inspect
3
+ import json
4
+ import logging
5
+ import os
6
+ from packaging import version
7
+ import pathlib
8
+ import re
9
+ import tempfile
10
+ from types import MappingProxyType
11
+ from typing import Any, Collection, Dict, List, Optional, Tuple, Union
12
+
13
+ import pyarrow.fs
14
+
15
+ import ray
16
+ import ray.cloudpickle as pickle
17
+ from ray.rllib.core import (
18
+ COMPONENT_LEARNER,
19
+ COMPONENT_LEARNER_GROUP,
20
+ COMPONENT_RL_MODULE,
21
+ )
22
+ from ray.rllib.utils import force_list
23
+ from ray.rllib.utils.actor_manager import FaultTolerantActorManager
24
+ from ray.rllib.utils.annotations import (
25
+ OldAPIStack,
26
+ OverrideToImplementCustomLogic_CallToSuperRecommended,
27
+ )
28
+ from ray.rllib.utils.serialization import NOT_SERIALIZABLE, serialize_type
29
+ from ray.rllib.utils.typing import StateDict
30
+ from ray.train import Checkpoint
31
+ from ray.tune.utils.file_transfer import sync_dir_between_nodes
32
+ from ray.util import log_once
33
+ from ray.util.annotations import PublicAPI
34
+
35
+ logger = logging.getLogger(__name__)
36
+
37
+ # The current checkpoint version used by RLlib for Algorithm and Policy checkpoints.
38
+ # History:
39
+ # 0.1: Ray 2.0.0
40
+ # A single `checkpoint-[iter num]` file for Algorithm checkpoints
41
+ # within the checkpoint directory. Policy checkpoints not supported across all
42
+ # DL frameworks.
43
+
44
+ # 1.0: Ray >=2.1.0
45
+ # An algorithm_state.pkl file for the state of the Algorithm (excluding
46
+ # individual policy states).
47
+ # One sub-dir inside the "policies" sub-dir for each policy with a
48
+ # dedicated policy_state.pkl in it for the policy state.
49
+
50
+ # 1.1: Same as 1.0, but has a new "format" field in the rllib_checkpoint.json file
51
+ # indicating, whether the checkpoint is `cloudpickle` (default) or `msgpack`.
52
+
53
+ # 1.2: Introduces the checkpoint for the new Learner API if the Learner API is enabled.
54
+
55
+ # 2.0: Introduces the Checkpointable API for all components on the new API stack
56
+ # (if the Learner-, RLModule, EnvRunner, and ConnectorV2 APIs are enabled).
57
+
58
+ CHECKPOINT_VERSION = version.Version("1.1")
59
+ CHECKPOINT_VERSION_LEARNER_AND_ENV_RUNNER = version.Version("2.1")
60
+
61
+
62
+ @PublicAPI(stability="alpha")
63
+ class Checkpointable(abc.ABC):
64
+ """Abstract base class for a component of RLlib that can be checkpointed to disk.
65
+
66
+ Subclasses must implement the following APIs:
67
+ - save_to_path()
68
+ - restore_from_path()
69
+ - from_checkpoint()
70
+ - get_state()
71
+ - set_state()
72
+ - get_ctor_args_and_kwargs()
73
+ - get_metadata()
74
+ - get_checkpointable_components()
75
+ """
76
+
77
+ # The state file for the implementing class.
78
+ # This file contains any state information that does NOT belong to any subcomponent
79
+ # of the implementing class (which are `Checkpointable` themselves and thus should
80
+ # have their own state- and metadata files).
81
+ # After a `save_to_path([path])` this file can be found directly in: `path/`.
82
+ STATE_FILE_NAME = "state"
83
+
84
+ # The filename of the pickle file that contains the class information of the
85
+ # Checkpointable as well as all constructor args to be passed to such a class in
86
+ # order to construct a new instance.
87
+ CLASS_AND_CTOR_ARGS_FILE_NAME = "class_and_ctor_args.pkl"
88
+
89
+ # Subclasses may set this to their own metadata filename.
90
+ # The dict returned by self.get_metadata() is stored in this JSON file.
91
+ METADATA_FILE_NAME = "metadata.json"
92
+
93
+ def save_to_path(
94
+ self,
95
+ path: Optional[Union[str, pathlib.Path]] = None,
96
+ *,
97
+ state: Optional[StateDict] = None,
98
+ filesystem: Optional["pyarrow.fs.FileSystem"] = None,
99
+ use_msgpack: bool = False,
100
+ ) -> str:
101
+ """Saves the state of the implementing class (or `state`) to `path`.
102
+
103
+ The state of the implementing class is always saved in the following format:
104
+
105
+ .. testcode::
106
+ :skipif: True
107
+
108
+ path/
109
+ [component1]/
110
+ [component1 subcomponentA]/
111
+ ...
112
+ [component1 subcomponentB]/
113
+ ...
114
+ [component2]/
115
+ ...
116
+ [cls.METADATA_FILE_NAME] (json)
117
+ [cls.STATE_FILE_NAME] (pkl|msgpack)
118
+
119
+ The main logic is to loop through all subcomponents of this Checkpointable
120
+ and call their respective `save_to_path` methods. Then save the remaining
121
+ (non subcomponent) state to this Checkpointable's STATE_FILE_NAME.
122
+ In the exception that a component is a FaultTolerantActorManager instance,
123
+ instead of calling `save_to_path` directly on that manager, the first healthy
124
+ actor is interpreted as the component and its `save_to_path` method is called.
125
+ Even if that actor is located on another node, the created file is automatically
126
+ synced to the local node.
127
+
128
+ Args:
129
+ path: The path to the directory to save the state of the implementing class
130
+ to. If `path` doesn't exist or is None, then a new directory will be
131
+ created (and returned).
132
+ state: An optional state dict to be used instead of getting a new state of
133
+ the implementing class through `self.get_state()`.
134
+ filesystem: PyArrow FileSystem to use to access data at the `path`.
135
+ If not specified, this is inferred from the URI scheme of `path`.
136
+ use_msgpack: Whether the state file should be written using msgpack and
137
+ msgpack_numpy (file extension is `.msgpack`), rather than pickle (file
138
+ extension is `.pkl`).
139
+
140
+ Returns:
141
+ The path (str) where the state has been saved.
142
+ """
143
+
144
+ # If no path is given create a local temporary directory.
145
+ if path is None:
146
+ import uuid
147
+
148
+ # Get the location of the temporary directory on the OS.
149
+ tmp_dir = pathlib.Path(tempfile.gettempdir())
150
+ # Create a random directory name.
151
+ random_dir_name = str(uuid.uuid4())
152
+ # Create the path, but do not craet the directory on the
153
+ # filesystem, yet. This is done by `PyArrow`.
154
+ path = path or tmp_dir / random_dir_name
155
+
156
+ # We need a string path for `pyarrow.fs.FileSystem.from_uri`.
157
+ path = path if isinstance(path, str) else path.as_posix()
158
+
159
+ # If we have no filesystem, figure it out.
160
+ if path and not filesystem:
161
+ # Note the path needs to be a path that is relative to the
162
+ # filesystem (e.g. `gs://tmp/...` -> `tmp/...`).
163
+ filesystem, path = pyarrow.fs.FileSystem.from_uri(path)
164
+
165
+ # Make sure, path exists.
166
+ filesystem.create_dir(path, recursive=True)
167
+
168
+ # Convert to `pathlib.Path` for easy handling.
169
+ path = pathlib.Path(path)
170
+
171
+ # Write metadata file to disk.
172
+ metadata = self.get_metadata()
173
+ if "checkpoint_version" not in metadata:
174
+ metadata["checkpoint_version"] = str(
175
+ CHECKPOINT_VERSION_LEARNER_AND_ENV_RUNNER
176
+ )
177
+ with filesystem.open_output_stream(
178
+ (path / self.METADATA_FILE_NAME).as_posix()
179
+ ) as f:
180
+ f.write(json.dumps(metadata).encode("utf-8"))
181
+
182
+ # Write the class and constructor args information to disk. Always use pickle
183
+ # for this, because this information contains classes and maybe other
184
+ # non-serializable data.
185
+ with filesystem.open_output_stream(
186
+ (path / self.CLASS_AND_CTOR_ARGS_FILE_NAME).as_posix()
187
+ ) as f:
188
+ pickle.dump(
189
+ {
190
+ "class": type(self),
191
+ "ctor_args_and_kwargs": self.get_ctor_args_and_kwargs(),
192
+ },
193
+ f,
194
+ )
195
+
196
+ # Get the entire state of this Checkpointable, or use provided `state`.
197
+ _state_provided = state is not None
198
+ state = state or self.get_state(
199
+ not_components=[c[0] for c in self.get_checkpointable_components()]
200
+ )
201
+
202
+ # Write components of `self` that themselves are `Checkpointable`.
203
+ for comp_name, comp in self.get_checkpointable_components():
204
+ # If subcomponent's name is not in `state`, ignore it and don't write this
205
+ # subcomponent's state to disk.
206
+ if _state_provided and comp_name not in state:
207
+ continue
208
+ comp_path = path / comp_name
209
+
210
+ # If component is an ActorManager, save the manager's first healthy
211
+ # actor's state to disk (even if it's on another node, in which case, we'll
212
+ # sync the generated file(s) back to this node).
213
+ if isinstance(comp, FaultTolerantActorManager):
214
+ actor_to_use = comp.healthy_actor_ids()[0]
215
+
216
+ def _get_ip(_=None):
217
+ import ray
218
+
219
+ return ray.util.get_node_ip_address()
220
+
221
+ _result = next(
222
+ iter(
223
+ comp.foreach_actor(
224
+ _get_ip,
225
+ remote_actor_ids=[actor_to_use],
226
+ )
227
+ )
228
+ )
229
+ if not _result.ok:
230
+ raise _result.get()
231
+ worker_ip_addr = _result.get()
232
+ self_ip_addr = _get_ip()
233
+
234
+ # Save the state to a temporary location on the `actor_to_use`'s
235
+ # node.
236
+ comp_state_ref = None
237
+ if _state_provided:
238
+ comp_state_ref = ray.put(state.pop(comp_name))
239
+
240
+ if worker_ip_addr == self_ip_addr:
241
+ comp.foreach_actor(
242
+ lambda w, _path=comp_path, _state=comp_state_ref, _use_msgpack=use_msgpack: ( # noqa
243
+ w.save_to_path(
244
+ _path,
245
+ state=(
246
+ ray.get(_state)
247
+ if _state is not None
248
+ else w.get_state()
249
+ ),
250
+ use_msgpack=_use_msgpack,
251
+ )
252
+ ),
253
+ remote_actor_ids=[actor_to_use],
254
+ )
255
+ else:
256
+ # Save the checkpoint to the temporary directory on the worker.
257
+ def _save(w, _state=comp_state_ref, _use_msgpack=use_msgpack):
258
+ import tempfile
259
+
260
+ # Create a temporary directory on the worker.
261
+ tmpdir = tempfile.mkdtemp()
262
+ w.save_to_path(
263
+ tmpdir,
264
+ state=(
265
+ ray.get(_state) if _state is not None else w.get_state()
266
+ ),
267
+ use_msgpack=_use_msgpack,
268
+ )
269
+ return tmpdir
270
+
271
+ _result = next(
272
+ iter(comp.foreach_actor(_save, remote_actor_ids=[actor_to_use]))
273
+ )
274
+ if not _result.ok:
275
+ raise _result.get()
276
+ worker_temp_dir = _result.get()
277
+
278
+ # Sync the temporary directory from the worker to this node.
279
+ sync_dir_between_nodes(
280
+ worker_ip_addr,
281
+ worker_temp_dir,
282
+ self_ip_addr,
283
+ str(comp_path),
284
+ )
285
+
286
+ # Remove the temporary directory on the worker.
287
+ def _rmdir(_, _dir=worker_temp_dir):
288
+ import shutil
289
+
290
+ shutil.rmtree(_dir)
291
+
292
+ comp.foreach_actor(_rmdir, remote_actor_ids=[actor_to_use])
293
+
294
+ # Local component (instance stored in a property of `self`).
295
+ else:
296
+ if _state_provided:
297
+ comp_state = state.pop(comp_name)
298
+ else:
299
+ comp_state = self.get_state(components=comp_name)[comp_name]
300
+ # By providing the `state` arg, we make sure that the component does not
301
+ # have to call its own `get_state()` anymore, but uses what's provided
302
+ # here.
303
+ comp.save_to_path(
304
+ comp_path,
305
+ filesystem=filesystem,
306
+ state=comp_state,
307
+ use_msgpack=use_msgpack,
308
+ )
309
+
310
+ # Write all the remaining state to disk.
311
+ filename = path / (
312
+ self.STATE_FILE_NAME + (".msgpack" if use_msgpack else ".pkl")
313
+ )
314
+ with filesystem.open_output_stream(filename.as_posix()) as f:
315
+ if use_msgpack:
316
+ msgpack = try_import_msgpack(error=True)
317
+ msgpack.dump(state, f)
318
+ else:
319
+ pickle.dump(state, f)
320
+
321
+ return str(path)
322
+
323
def restore_from_path(
    self,
    path: Union[str, pathlib.Path],
    *,
    component: Optional[str] = None,
    filesystem: Optional["pyarrow.fs.FileSystem"] = None,
    **kwargs,
) -> None:
    """Restores the state of the implementing class from the given path.

    If the `component` arg is provided, `path` refers to a checkpoint of a
    subcomponent of `self`, thus allowing the user to load only the subcomponent's
    state into `self` without affecting any of the other state information (for
    example, loading only the NN state into a Checkpointable, which contains such
    an NN, but also has other state information that should NOT be changed by
    calling this method).

    The given `path` should have the following structure and contain the following
    files:

    .. testcode::
        :skipif: True

        path/
            [component1]/
                [component1 subcomponentA]/
                    ...
                [component1 subcomponentB]/
                    ...
            [component2]/
                ...
            [cls.METADATA_FILE_NAME] (json)
            [cls.STATE_FILE_NAME] (pkl|msgpack)

    Note that the self.METADATA_FILE_NAME file is not required to restore the state.

    Args:
        path: The path to load the implementing class' state from or to load the
            state of only one subcomponent's state of the implementing class (if
            `component` is provided).
        component: If provided, `path` is interpreted as the checkpoint path of only
            the subcomponent and thus, only that subcomponent's state is
            restored/loaded. All other state of `self` remains unchanged in this
            case.
        filesystem: PyArrow FileSystem to use to access data at the `path`. If not
            specified, this is inferred from the URI scheme of `path`.
        **kwargs: Forward compatibility kwargs.

    Raises:
        FileNotFoundError: If `path` does not exist in the (inferred) filesystem.
    """
    # We need a string path for `pyarrow.fs.FileSystem.from_uri()`.
    path = path if isinstance(path, str) else path.as_posix()

    if path and not filesystem:
        # Note the path needs to be a path that is relative to the
        # filesystem (e.g. `gs://tmp/...` -> `tmp/...`).
        filesystem, path = pyarrow.fs.FileSystem.from_uri(path)
    # Only here convert to a `Path` instance b/c otherwise the cloud path gets
    # broken (i.e. 'gs://' -> 'gs:/'). BUGFIX: This conversion must also happen
    # when a `filesystem` WAS passed in; previously `path` then remained a plain
    # `str` and the `/`-joins and `.as_posix()` calls below raised TypeError/
    # AttributeError.
    path = pathlib.Path(path)

    if not _exists_at_fs_path(filesystem, path.as_posix()):
        raise FileNotFoundError(f"`path` ({path}) not found!")

    # Restore components of `self` that themselves are `Checkpointable`.
    # Remember the current set of component names so we can detect components
    # newly created by `set_state()` further below.
    orig_comp_names = {c[0] for c in self.get_checkpointable_components()}
    self._restore_all_subcomponents_from_path(
        path, filesystem, component=component, **kwargs
    )

    # Restore the "base" state (not individual subcomponents).
    if component is None:
        filename = path / self.STATE_FILE_NAME
        # Prefer a msgpack state file, if present; otherwise fall back to pickle.
        if filename.with_suffix(".msgpack").is_file():
            msgpack = try_import_msgpack(error=True)
            with filesystem.open_input_stream(
                filename.with_suffix(".msgpack").as_posix()
            ) as f:
                state = msgpack.load(f, strict_map_key=False)
        else:
            with filesystem.open_input_stream(
                filename.with_suffix(".pkl").as_posix()
            ) as f:
                state = pickle.load(f)
        self.set_state(state)

        # `set_state()` may have created new subcomponents (e.g. a new module
        # added to a MultiRLModule). Restore these newly appeared components
        # from disk as well.
        new_comp_names = {c[0] for c in self.get_checkpointable_components()}
        diff_comp_names = new_comp_names - orig_comp_names
        if diff_comp_names:
            self._restore_all_subcomponents_from_path(
                path, filesystem, only_comp_names=diff_comp_names, **kwargs
            )
412
+
413
@classmethod
def from_checkpoint(
    cls,
    path: Union[str, pathlib.Path],
    filesystem: Optional["pyarrow.fs.FileSystem"] = None,
    **kwargs,
) -> "Checkpointable":
    """Creates a new Checkpointable instance from the given location and returns it.

    Args:
        path: The checkpoint path to load (a) the information on how to construct
            a new instance of the implementing class and (b) the state to restore
            the created instance to.
        filesystem: PyArrow FileSystem to use to access data at the `path`. If not
            specified, this is inferred from the URI scheme of `path`.
        kwargs: Forward compatibility kwargs. Note that these kwargs are sent to
            each subcomponent's `from_checkpoint()` call.

    Returns:
        A new instance of the implementing class, already set to the state stored
        under `path`.

    Raises:
        ValueError: If the class stored in the checkpoint is not a subclass of
            `cls` or not a `Checkpointable` implementer.
    """
    # We need a string path for the `PyArrow` filesystem.
    path = path if isinstance(path, str) else path.as_posix()

    # If no filesystem is passed in create one.
    if path and not filesystem:
        # Note the path needs to be a path that is relative to the
        # filesystem (e.g. `gs://tmp/...` -> `tmp/...`).
        filesystem, path = pyarrow.fs.FileSystem.from_uri(path)
    # Only here convert to a `Path` instance b/c otherwise the cloud path gets
    # broken (i.e. 'gs://' -> 'gs:/'). BUGFIX: Also convert when a `filesystem`
    # WAS passed in; previously `path` then remained a plain `str` and the
    # `/`-join below raised a TypeError.
    path = pathlib.Path(path)

    # Get the class constructor to call and its args/kwargs.
    # Try reading the pickle file first.
    try:
        with filesystem.open_input_stream(
            (path / cls.CLASS_AND_CTOR_ARGS_FILE_NAME).as_posix()
        ) as f:
            ctor_info = pickle.load(f)
        ctor = ctor_info["class"]
        ctor_args = force_list(ctor_info["ctor_args_and_kwargs"][0])
        ctor_kwargs = ctor_info["ctor_args_and_kwargs"][1]

        # Inspect the ctor to see, which arguments in ctor_info should be replaced
        # with the user provided **kwargs.
        for i, (param_name, param) in enumerate(
            inspect.signature(ctor).parameters.items()
        ):
            if param_name in kwargs:
                val = kwargs.pop(param_name)
                # Positional-or-keyword params that were stored positionally get
                # overridden in place; everything else goes through kwargs.
                if (
                    param.kind == inspect._ParameterKind.POSITIONAL_OR_KEYWORD
                    and len(ctor_args) > i
                ):
                    ctor_args[i] = val
                else:
                    ctor_kwargs[param_name] = val

    # If the pickle file is from another python version (or unreadable), use
    # the user-provided args instead.
    except Exception:
        # Use class that this method was called on.
        ctor = cls
        # Use only user provided **kwargs.
        ctor_args = []
        ctor_kwargs = kwargs

    # Check, whether the constructor actually goes together with `cls`.
    if not issubclass(ctor, cls):
        raise ValueError(
            f"The class ({ctor}) stored in checkpoint ({path}) does not seem to be "
            f"a subclass of `cls` ({cls})!"
        )
    elif not issubclass(ctor, Checkpointable):
        raise ValueError(
            f"The class ({ctor}) stored in checkpoint ({path}) does not seem to be "
            "an implementer of the `Checkpointable` API!"
        )

    # Construct the initial object (without any particular state).
    obj = ctor(*ctor_args, **ctor_kwargs)
    # Restore the state of the constructed object.
    obj.restore_from_path(path, filesystem=filesystem, **kwargs)
    # Return the new object.
    return obj
500
+
501
@abc.abstractmethod
def get_state(
    self,
    components: Optional[Union[str, Collection[str]]] = None,
    *,
    not_components: Optional[Union[str, Collection[str]]] = None,
    **kwargs,
) -> StateDict:
    """Returns the implementing class's current state as a dict.

    The returned dict must only contain msgpack-serializable data if you want to
    use the `AlgorithmConfig._msgpack_checkpoints` option. Consider returning your
    non msgpack-serializable data from the `Checkpointable.get_ctor_args_and_kwargs`
    method, instead.

    Args:
        components: An optional collection of string keys to be included in the
            returned state. This might be useful, if getting certain components
            of the state is expensive (e.g. reading/compiling the weights of a large
            NN) and at the same time, these components are not required by the
            caller. Nested components are addressed via "/"-separated paths,
            e.g. "comp/subcomp" (see `_check_component`/`_get_subcomponents`).
        not_components: An optional list of string keys to be excluded in the
            returned state, even if the same string is part of `components`.
            This is useful to get the complete state of the class, except
            one or a few components.
        kwargs: Forward-compatibility kwargs.

    Returns:
        The current state of the implementing class (or only the `components`
        specified, w/o those in `not_components`).
    """
532
+
533
@abc.abstractmethod
def set_state(self, state: StateDict) -> None:
    """Sets the implementing class' state to the given state dict.

    If component keys are missing in `state`, these components of the implementing
    class will not be updated/set (partial updates are therefore allowed).

    Args:
        state: The state dict to restore the state from. Maps component keys
            to the corresponding subcomponent's own state.
    """
544
+
545
@abc.abstractmethod
def get_ctor_args_and_kwargs(self) -> Tuple[Tuple, Dict[str, Any]]:
    """Returns the args/kwargs used to create `self` from its constructor.

    These are pickled into `self.CLASS_AND_CTOR_ARGS_FILE_NAME` by
    `save_to_path()` and used by `from_checkpoint()` to re-instantiate the class.

    Returns:
        A tuple of the args (as a tuple) and kwargs (as a Dict[str, Any]) used to
        construct `self` from its class constructor.
    """
553
+
554
@OverrideToImplementCustomLogic_CallToSuperRecommended
def get_metadata(self) -> Dict:
    """Returns JSON writable metadata further describing the implementing class.

    Note that this metadata is NOT part of any state and is thus NOT needed to
    restore the state of a Checkpointable instance from a directory. Rather, the
    metadata is written into `self.METADATA_FILE_NAME` when calling
    `self.save_to_path()`, purely for the user's convenience.

    Returns:
        A JSON-encodable dict of metadata information.
    """
    # Record the file names used by this checkpoint plus the exact Ray build
    # that produced it.
    metadata = {
        "class_and_ctor_args_file": self.CLASS_AND_CTOR_ARGS_FILE_NAME,
        "state_file": self.STATE_FILE_NAME,
        "ray_version": ray.__version__,
        "ray_commit": ray.__commit__,
    }
    return metadata
572
+
573
def get_checkpointable_components(self) -> List[Tuple[str, "Checkpointable"]]:
    """Returns the implementing class's own Checkpointable subcomponents.

    Returns:
        A list of 2-tuples (name, subcomponent) describing the implementing class'
        subcomponents, all of which have to be `Checkpointable` themselves and
        whose state is therefore written into subdirectories (rather than the main
        state file (self.STATE_FILE_NAME) when calling `self.save_to_path()`).
        The default implementation declares no subcomponents.
    """
    return []
583
+
584
def _check_component(self, name, components, not_components) -> bool:
    """Decides whether component `name` passes the include/exclude filters.

    A component is included if `components` is None (no filter), if its name
    appears literally in `components`, or if any entry in `components` addresses
    one of its subcomponents via a "name/..." path -- unless the name is listed
    in `not_components`.
    """
    include_list = force_list(components)
    exclude_list = force_list(not_components)

    # Explicit exclusion wins.
    if not_components is not None and name in exclude_list:
        return False
    # No include filter -> everything (not excluded) passes.
    if components is None:
        return True
    # Included either directly or through a "name/sub..." path.
    return name in include_list or any(
        c.startswith(name + "/") for c in include_list
    )
594
+
595
def _get_subcomponents(self, name, components):
    """Extracts all entries of `components` addressing subcomponents of `name`.

    Entries of the form "name/rest" are returned with the "name/" prefix
    stripped. Returns None if `components` is None or no entry matches.
    """
    if components is None:
        return None

    prefix = name + "/"
    matches = [
        c[len(prefix):] for c in force_list(components) if c.startswith(prefix)
    ]
    return matches or None
606
+
607
def _restore_all_subcomponents_from_path(
    self, path, filesystem, only_comp_names=None, component=None, **kwargs
):
    """Restores all (or selected) Checkpointable subcomponents from `path`.

    Args:
        path: Root checkpoint directory (a pathlib.Path).
        filesystem: PyArrow FileSystem used to access `path`.
        only_comp_names: If given, only components whose names are in this
            collection are restored (used for components newly created by
            `set_state()`).
        component: If given, restore only this (possibly "/"-nested) component;
            `path` is then interpreted as that component's own checkpoint dir.
        **kwargs: Forwarded to each subcomponent's `restore_from_path()`.
    """
    for comp_name, comp in self.get_checkpointable_components():
        if only_comp_names is not None and comp_name not in only_comp_names:
            continue

        # The value of the `component` argument for the upcoming
        # `[subcomponent].restore_from_path(.., component=..)` call.
        comp_arg = None

        if component is None:
            comp_dir = path / comp_name
            # If subcomponent's dir is not in path, ignore it and don't restore this
            # subcomponent's state from disk.
            if not _exists_at_fs_path(filesystem, comp_dir.as_posix()):
                continue
        else:
            comp_dir = path

            # `component` is a path that starts with `comp` -> Remove the name of
            # `comp` from the `component` arg in the upcoming call to `restore_..`.
            if component.startswith(comp_name + "/"):
                comp_arg = component[len(comp_name) + 1 :]
            # `component` has nothing to do with `comp` -> Skip.
            elif component != comp_name:
                continue

        # If component is an ActorManager, restore all the manager's healthy
        # actors' states from disk (even if they are on another node, in which case,
        # we'll sync checkpoint file(s) to the respective node).
        if isinstance(comp, FaultTolerantActorManager):
            head_node_ip = ray.util.get_node_ip_address()
            all_healthy_actors = comp.healthy_actor_ids()

            # NOTE: All per-call values are bound as defaults so the closure is
            # safe to ship to remote actors; `kwargs` is wrapped in a read-only
            # MappingProxyType to avoid accidental mutation on the workers.
            def _restore(
                w,
                _kwargs=MappingProxyType(kwargs),
                _path=comp_dir,
                _head_ip=head_node_ip,
                _comp_arg=comp_arg,
            ):
                # Imports done remotely (this function runs on the actor).
                import ray
                import tempfile

                worker_node_ip = ray.util.get_node_ip_address()
                # If the worker is on the same node as the head, load the checkpoint
                # directly from the path otherwise sync the checkpoint from the head
                # to the worker and load it from there.
                if worker_node_ip == _head_ip:
                    w.restore_from_path(_path, component=_comp_arg, **_kwargs)
                else:
                    with tempfile.TemporaryDirectory() as temp_dir:
                        sync_dir_between_nodes(
                            _head_ip, _path, worker_node_ip, temp_dir
                        )
                        w.restore_from_path(
                            temp_dir, component=_comp_arg, **_kwargs
                        )

            comp.foreach_actor(_restore, remote_actor_ids=all_healthy_actors)

        # Call `restore_from_path()` on local subcomponent, thereby passing in the
        # **kwargs.
        else:
            comp.restore_from_path(
                comp_dir, filesystem=filesystem, component=comp_arg, **kwargs
            )
675
+
676
+
677
def _exists_at_fs_path(fs: pyarrow.fs.FileSystem, path: str) -> bool:
    """Returns `True` if the path can be found in the filesystem."""
    # `get_file_info()` reports FileType.NotFound (rather than raising) for
    # missing paths.
    return fs.get_file_info(path).type != pyarrow.fs.FileType.NotFound
681
+
682
+
683
def _is_dir(file_info: pyarrow.fs.FileInfo) -> bool:
    """Returns `True`, if the given file info describes a directory."""
    return file_info.type == pyarrow.fs.FileType.Directory
686
+
687
+
688
@OldAPIStack
def get_checkpoint_info(
    checkpoint: Union[str, Checkpoint],
    filesystem: Optional["pyarrow.fs.FileSystem"] = None,
) -> Dict[str, Any]:
    """Returns a dict with information about an Algorithm/Policy checkpoint.

    If the given checkpoint is a >=v1.0 checkpoint directory, try reading all
    information from the contained `rllib_checkpoint.json` file.

    Args:
        checkpoint: The checkpoint directory (str) or an AIR Checkpoint object.
        filesystem: PyArrow FileSystem to use to access data at the `checkpoint`. If not
            specified, this is inferred from the URI scheme provided by `checkpoint`.

    Returns:
        A dict containing the keys:
        "type": One of "Policy" or "Algorithm".
        "checkpoint_version": A version tuple, e.g. v1.0, indicating the checkpoint
        version. This will help RLlib to remain backward compatible wrt. future
        Ray and checkpoint versions.
        "checkpoint_dir": The directory with all the checkpoint files in it. This might
        be the same as the incoming `checkpoint` arg.
        "state_file": The main file with the Algorithm/Policy's state information in it.
        This is usually a pickle-encoded file.
        "policy_ids": An optional set of PolicyIDs in case we are dealing with an
        Algorithm checkpoint. None if `checkpoint` is a Policy checkpoint.

    Raises:
        ValueError: If `checkpoint` does not exist or contains no recognizable
            state file.
    """
    # Default checkpoint info.
    info = {
        "type": "Algorithm",
        "format": "cloudpickle",
        "checkpoint_version": CHECKPOINT_VERSION,
        "checkpoint_dir": None,
        "state_file": None,
        "policy_ids": None,
        "module_ids": None,
    }

    # `checkpoint` is a Checkpoint instance: Translate to directory and continue.
    if isinstance(checkpoint, Checkpoint):
        checkpoint = checkpoint.to_directory()

    if checkpoint and not filesystem:
        # Note the path needs to be a path that is relative to the
        # filesystem (e.g. `gs://tmp/...` -> `tmp/...`).
        filesystem, checkpoint = pyarrow.fs.FileSystem.from_uri(checkpoint)
    # Only here convert to a `Path` instance b/c otherwise the cloud path gets
    # broken (i.e. 'gs://' -> 'gs:/'). BUGFIX: Also convert when a `filesystem`
    # WAS passed in; previously `checkpoint` then remained a plain `str` and the
    # `.as_posix()` calls below raised AttributeError.
    checkpoint = pathlib.Path(checkpoint)

    # Checkpoint is dir.
    if _exists_at_fs_path(filesystem, checkpoint.as_posix()) and _is_dir(
        filesystem.get_file_info(checkpoint.as_posix())
    ):
        info.update({"checkpoint_dir": str(checkpoint)})

        # Figure out whether this is an older checkpoint format
        # (with a `checkpoint-\d+` file in it).
        file_info_list = filesystem.get_file_info(
            pyarrow.fs.FileSelector(checkpoint.as_posix(), recursive=False)
        )
        for file_info in file_info_list:
            if file_info.is_file:
                if re.match("checkpoint-\\d+", file_info.base_name):
                    info.update(
                        {
                            "checkpoint_version": version.Version("0.1"),
                            "state_file": str(file_info.base_name),
                        }
                    )
                    return info

        # No old checkpoint file found.

        # If rllib_checkpoint.json file present, read available information from it
        # and then continue with the checkpoint analysis (possibly overriding further
        # information).
        if _exists_at_fs_path(
            filesystem, (checkpoint / "rllib_checkpoint.json").as_posix()
        ):
            with filesystem.open_input_stream(
                (checkpoint / "rllib_checkpoint.json").as_posix()
            ) as f:
                rllib_checkpoint_info = json.load(fp=f)
            # Stored version is a plain string -> parse into a Version object.
            if "checkpoint_version" in rllib_checkpoint_info:
                rllib_checkpoint_info["checkpoint_version"] = version.Version(
                    rllib_checkpoint_info["checkpoint_version"]
                )
            info.update(rllib_checkpoint_info)
        else:
            # No rllib_checkpoint.json file present: Warn and continue trying to figure
            # out checkpoint info ourselves.
            if log_once("no_rllib_checkpoint_json_file"):
                logger.warning(
                    "No `rllib_checkpoint.json` file found in checkpoint directory "
                    f"{checkpoint}! Trying to extract checkpoint info from other files "
                    f"found in that dir."
                )

        # Policy checkpoint file found.
        for extension in ["pkl", "msgpck"]:
            if _exists_at_fs_path(
                filesystem, (checkpoint / ("policy_state." + extension)).as_posix()
            ):
                info.update(
                    {
                        "type": "Policy",
                        "format": "cloudpickle" if extension == "pkl" else "msgpack",
                        "checkpoint_version": CHECKPOINT_VERSION,
                        "state_file": str(checkpoint / f"policy_state.{extension}"),
                    }
                )
                return info

        # Valid Algorithm checkpoint >v0 file found?
        format = None
        for extension in ["pkl", "msgpck", "msgpack"]:
            state_file = checkpoint / f"algorithm_state.{extension}"
            if (
                _exists_at_fs_path(filesystem, state_file.as_posix())
                and filesystem.get_file_info(state_file.as_posix()).is_file
            ):
                format = "cloudpickle" if extension == "pkl" else "msgpack"
                break
        if format is None:
            raise ValueError(
                "Given checkpoint does not seem to be valid! No file with the name "
                "`algorithm_state.[pkl|msgpack|msgpck]` (or `checkpoint-[0-9]+`) found."
            )

        info.update(
            {
                "format": format,
                "state_file": str(state_file),
            }
        )

        # Collect all policy IDs in the sub-dir "policies/".
        policies_dir = checkpoint / "policies"
        if _exists_at_fs_path(filesystem, policies_dir.as_posix()) and _is_dir(
            filesystem.get_file_info(policies_dir.as_posix())
        ):
            policy_ids = set()
            file_info_list = filesystem.get_file_info(
                pyarrow.fs.FileSelector(policies_dir.as_posix(), recursive=False)
            )
            for file_info in file_info_list:
                policy_ids.add(file_info.base_name)
            info.update({"policy_ids": policy_ids})

        # Collect all module IDs in the sub-dir "learner/module_state/".
        modules_dir = (
            checkpoint
            / COMPONENT_LEARNER_GROUP
            / COMPONENT_LEARNER
            / COMPONENT_RL_MODULE
        )
        # BUGFIX: Check existence of `modules_dir` (not of `checkpoint`, which
        # trivially exists at this point), mirroring the `policies_dir` check
        # above.
        if _exists_at_fs_path(filesystem, modules_dir.as_posix()) and _is_dir(
            filesystem.get_file_info(modules_dir.as_posix())
        ):
            module_ids = set()
            file_info_list = filesystem.get_file_info(
                pyarrow.fs.FileSelector(modules_dir.as_posix(), recursive=False)
            )
            for file_info in file_info_list:
                # Only add subdirs (those are the ones where the RLModule data
                # is stored, not files (could be json metadata files).
                module_dir = modules_dir / file_info.base_name
                if _is_dir(filesystem.get_file_info(module_dir.as_posix())):
                    module_ids.add(file_info.base_name)
            info.update({"module_ids": module_ids})

    # Checkpoint is a file: Use as-is (interpreting it as old Algorithm checkpoint
    # version).
    elif (
        _exists_at_fs_path(filesystem, checkpoint.as_posix())
        and filesystem.get_file_info(checkpoint.as_posix()).is_file
    ):
        info.update(
            {
                "checkpoint_version": version.Version("0.1"),
                "checkpoint_dir": str(checkpoint.parent),
                "state_file": str(checkpoint),
            }
        )

    else:
        raise ValueError(
            f"Given checkpoint ({str(checkpoint)}) not found! Must be a "
            "checkpoint directory (or a file for older checkpoint versions)."
        )

    return info
885
+
886
+
887
@OldAPIStack
def convert_to_msgpack_checkpoint(
    checkpoint: Union[str, Checkpoint],
    msgpack_checkpoint_dir: str,
) -> str:
    """Converts an Algorithm checkpoint (pickle based) to a msgpack based one.

    Msgpack has the advantage of being python version independent.

    Args:
        checkpoint: The directory, in which to find the Algorithm checkpoint (pickle
            based).
        msgpack_checkpoint_dir: The directory, in which to create the new msgpack
            based checkpoint. Created (incl. parents) if it does not exist yet.

    Returns:
        The directory in which the msgpack checkpoint has been created. Note that
        this is the same as `msgpack_checkpoint_dir`.
    """
    from ray.rllib.algorithms import Algorithm
    from ray.rllib.algorithms.algorithm_config import AlgorithmConfig
    from ray.rllib.core.rl_module import validate_module_id

    # Try to import msgpack and msgpack_numpy.
    msgpack = try_import_msgpack(error=True)

    # Restore the Algorithm using the python version dependent checkpoint.
    algo = Algorithm.from_checkpoint(checkpoint)
    state = algo.__getstate__()

    # Convert all code in state into serializable data.
    # Serialize the algorithm class.
    state["algorithm_class"] = serialize_type(state["algorithm_class"])
    # Serialize the algorithm's config object.
    if not isinstance(state["config"], dict):
        state["config"] = state["config"].serialize()
    else:
        state["config"] = AlgorithmConfig._serialize_dict(state["config"])

    # Extract policy states from worker state (Policies get their own
    # checkpoint sub-dirs).
    policy_states = {}
    if "worker" in state and "policy_states" in state["worker"]:
        policy_states = state["worker"].pop("policy_states", {})

    # BUGFIX: Only blank out the worker's callables if a "worker" entry exists
    # at all. Previously these two assignments were unconditional and raised a
    # KeyError for states without a "worker" key (the guarded pop above shows
    # that such states are expected).
    if "worker" in state:
        # Policy mapping fn.
        state["worker"]["policy_mapping_fn"] = NOT_SERIALIZABLE
        # Is Policy to train function.
        state["worker"]["is_policy_to_train"] = NOT_SERIALIZABLE

    # Add RLlib checkpoint version (as string).
    state["checkpoint_version"] = str(CHECKPOINT_VERSION)

    # BUGFIX: Make sure the target directory exists before writing into it
    # (matches `convert_to_msgpack_policy_checkpoint`, which already does this).
    os.makedirs(msgpack_checkpoint_dir, exist_ok=True)

    # Write state (w/o policies) to disk.
    state_file = os.path.join(msgpack_checkpoint_dir, "algorithm_state.msgpck")
    with open(state_file, "wb") as f:
        msgpack.dump(state, f)

    # Write rllib_checkpoint.json.
    with open(os.path.join(msgpack_checkpoint_dir, "rllib_checkpoint.json"), "w") as f:
        json.dump(
            {
                "type": "Algorithm",
                "checkpoint_version": state["checkpoint_version"],
                "format": "msgpack",
                "state_file": state_file,
                "policy_ids": list(policy_states.keys()),
                "ray_version": ray.__version__,
                "ray_commit": ray.__commit__,
            },
            f,
        )

    # Write individual policies to disk, each in their own subdirectory.
    for pid, policy_state in policy_states.items():
        # From here on, disallow policyIDs that would not work as directory names.
        validate_module_id(pid, error=True)
        policy_dir = os.path.join(msgpack_checkpoint_dir, "policies", pid)
        os.makedirs(policy_dir, exist_ok=True)
        policy = algo.get_policy(pid)
        policy.export_checkpoint(
            policy_dir,
            policy_state=policy_state,
            checkpoint_format="msgpack",
        )

    # Release all resources used by the Algorithm.
    algo.stop()

    return msgpack_checkpoint_dir
977
+
978
+
979
@OldAPIStack
def convert_to_msgpack_policy_checkpoint(
    policy_checkpoint: Union[str, Checkpoint],
    msgpack_checkpoint_dir: str,
) -> str:
    """Converts a Policy checkpoint (pickle based) to a msgpack based one.

    Msgpack has the advantage of being python version independent.

    Args:
        policy_checkpoint: The directory, in which to find the Policy checkpoint (pickle
            based).
        msgpack_checkpoint_dir: The directory, in which to create the new msgpack
            based checkpoint. Created (incl. parents) if it does not exist yet.

    Returns:
        The directory in which the msgpack checkpoint has been created. Note that
        this is the same as `msgpack_checkpoint_dir`.
    """
    # Local import to avoid a circular dependency at module-import time.
    from ray.rllib.policy.policy import Policy

    # Rebuild the Policy from the (pickle-based) checkpoint, then re-export it
    # in msgpack format.
    policy = Policy.from_checkpoint(policy_checkpoint)

    os.makedirs(msgpack_checkpoint_dir, exist_ok=True)
    policy.export_checkpoint(
        msgpack_checkpoint_dir,
        policy_state=policy.get_state(),
        checkpoint_format="msgpack",
    )

    # Release all resources used by the Policy.
    del policy

    return msgpack_checkpoint_dir
1013
+
1014
+
1015
@PublicAPI
def try_import_msgpack(error: bool = False):
    """Tries importing msgpack and msgpack_numpy and returns the patched msgpack module.

    Returns None if error is False and msgpack or msgpack_numpy is not installed.
    Raises an error, if error is True and the modules could not be imported.

    Args:
        error: Whether to raise an error if msgpack/msgpack_numpy cannot be imported.

    Returns:
        The `msgpack` module (with numpy support patched in), or None.

    Raises:
        ImportError: If error=True and msgpack/msgpack_numpy is not installed.
    """
    try:
        import msgpack
        import msgpack_numpy

        # Teach msgpack how to (de)serialize numpy arrays.
        msgpack_numpy.patch()
        return msgpack
    except Exception:
        # Import or patching failed: either raise or signal "unavailable".
        if not error:
            return None
        raise ImportError(
            "Could not import or setup msgpack and msgpack_numpy! "
            "Try running `pip install msgpack msgpack_numpy` first."
        )
.venv/lib/python3.11/site-packages/ray/rllib/utils/deprecation.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import inspect
2
+ import logging
3
+ from typing import Optional, Union
4
+
5
+ from ray.util import log_once
6
+ from ray.util.annotations import _mark_annotated
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
+ # A constant to use for any configuration that should be deprecated
11
+ # (to check, whether this config has actually been assigned a proper value or
12
+ # not).
13
+ DEPRECATED_VALUE = -1
14
+
15
+
16
def deprecation_warning(
    old: str,
    new: Optional[str] = None,
    *,
    help: Optional[str] = None,
    error: Optional[Union[bool, Exception]] = None,
) -> None:
    """Warns (via the `logger` object) or throws a deprecation warning/error.

    Args:
        old: A description of the "thing" that is to be deprecated.
        new: A description of the new "thing" that replaces it.
        help: An optional help text to tell the user, what to
            do instead of using `old`.
        error: Whether or which exception to raise. If True, raise ValueError.
            If False, just warn. If `error` is-a subclass of Exception,
            raise that Exception.

    Raises:
        ValueError: If `error=True`.
        Exception: Of type `error`, iff `error` is a sub-class of `Exception`.
    """
    # Build the message: prefer the `new` hint over the free-form `help` text.
    if new:
        suffix = " Use `{}` instead.".format(new)
    elif help:
        suffix = f" {help}"
    else:
        suffix = ""
    msg = "`{}` has been deprecated.{}".format(old, suffix)

    # No error requested -> just log a warning and return.
    if not error:
        logger.warning(
            "DeprecationWarning: " + msg + " This will raise an error in the future!"
        )
        return

    # A custom exception class was handed in -> raise that one.
    if not isinstance(error, bool) and issubclass(error, Exception):
        raise error(msg)
    # error is a boolean, construct ValueError ourselves.
    raise ValueError(msg)
53
+
54
+
55
def Deprecated(old=None, *, new=None, help=None, error):
    """Decorator for documenting a deprecated class, method, or function.

    Automatically adds a `deprecation.deprecation_warning(old=...,
    error=False)` to not break existing code at this point to the decorated
    class' constructor, method, or function.

    In a next major release, this warning should then be made an error
    (by setting error=True), which means at this point that the
    class/method/function is no longer supported, but will still inform
    the user about the deprecation event.

    In a further major release, the class, method, function should be erased
    entirely from the codebase.

    Args:
        old: Name of the deprecated API (defaults to the decorated object's
            `__name__`).
        new: Name of the replacement API (if any).
        help: Free-form help text used when there is no direct replacement.
        error: Whether (or which exception) to raise when the deprecated API
            is used. See `deprecation_warning()`.

    .. testcode::
        :skipif: True

        from ray.rllib.utils.deprecation import Deprecated
        # Deprecated class: Patches the constructor to warn if the class is
        # used.
        @Deprecated(new="NewAndMuchCoolerClass", error=False)
        class OldAndUncoolClass:
            ...

        # Deprecated class method: Patches the method to warn if called.
        class StillCoolClass:
            ...
            @Deprecated(new="StillCoolClass.new_and_much_cooler_method()",
                        error=False)
            def old_and_uncool_method(self, uncool_arg):
                ...

        # Deprecated function: Patches the function to warn if called.
        @Deprecated(new="new_and_much_cooler_function", error=False)
        def old_and_uncool_function(*uncool_args):
            ...
    """
    # Local import: keeps this module's public import surface unchanged.
    from functools import wraps

    def _inner(obj):
        # A deprecated class.
        if inspect.isclass(obj):
            # Patch the class' init method to raise the warning/error.
            obj_init = obj.__init__

            # `wraps` preserves the original __init__'s metadata (name,
            # docstring, signature) on the patched version.
            @wraps(obj_init)
            def patched_init(*args, **kwargs):
                if log_once(old or obj.__name__):
                    deprecation_warning(
                        old=old or obj.__name__,
                        new=new,
                        help=help,
                        error=error,
                    )
                return obj_init(*args, **kwargs)

            obj.__init__ = patched_init
            _mark_annotated(obj)
            # Return the patched class (with the warning/error when
            # instantiated).
            return obj

        # A deprecated class method or function.
        # Patch with the warning/error at the beginning. `wraps` preserves the
        # wrapped callable's metadata so tools like `help()`/`inspect` and
        # docs keep working (previously lost, since the bare wrapper shadowed
        # `__name__`, `__doc__`, etc.).
        @wraps(obj)
        def _ctor(*args, **kwargs):
            if log_once(old or obj.__name__):
                deprecation_warning(
                    old=old or obj.__name__,
                    new=new,
                    help=help,
                    error=error,
                )
            # Call the deprecated method/function.
            return obj(*args, **kwargs)

        # Return the patched class method/function.
        return _ctor

    # Return the prepared decorator.
    return _inner
.venv/lib/python3.11/site-packages/ray/rllib/utils/error.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from ray.rllib.utils.annotations import PublicAPI
2
+
3
+
4
+ @PublicAPI
5
+ class UnsupportedSpaceException(Exception):
6
+ """Error for an unsupported action or observation space."""
7
+
8
+ pass
9
+
10
+
11
+ @PublicAPI
12
+ class EnvError(Exception):
13
+ """Error if we encounter an error during RL environment validation."""
14
+
15
+ pass
16
+
17
+
18
+ @PublicAPI
19
+ class MultiAgentEnvError(Exception):
20
+ """Error if we encounter an error during MultiAgentEnv stepping/validation."""
21
+
22
+ pass
23
+
24
+
25
+ @PublicAPI
26
+ class NotSerializable(Exception):
27
+ """Error if we encounter objects that can't be serialized by ray."""
28
+
29
+ pass
30
+
31
+
32
+ # -------
33
+ # Error messages
34
+ # -------
35
+
36
+ # Message explaining there are no GPUs available for the
37
+ # num_gpus=n or num_gpus_per_env_runner=m settings.
38
+ ERR_MSG_NO_GPUS = """Found {} GPUs on your machine (GPU devices found: {})! If your
39
+ machine does not have any GPUs, you should set the config keys
40
+ `num_gpus_per_learner` and `num_gpus_per_env_runner` to 0. They may be set to
41
+ 1 by default for your particular RL algorithm."""
42
+
43
+ ERR_MSG_INVALID_ENV_DESCRIPTOR = """The env string you provided ('{}') is:
44
+ a) Not a supported or -installed environment.
45
+ b) Not a tune-registered environment creator.
46
+ c) Not a valid env class string.
47
+
48
+ Try one of the following:
49
+ a) For Atari support: `pip install gym[atari] autorom[accept-rom-license]`.
50
+ For PyBullet support: `pip install pybullet`.
51
+ b) To register your custom env, do `from ray import tune;
52
+ tune.register('[name]', lambda cfg: [return env obj from here using cfg])`.
53
+ Then in your config, do `config['env'] = [name]`.
54
+ c) Make sure you provide a fully qualified classpath, e.g.:
55
+ `ray.rllib.examples.envs.classes.repeat_after_me_env.RepeatAfterMeEnv`
56
+ """
57
+
58
+
59
+ ERR_MSG_OLD_GYM_API = """Your environment ({}) does not abide to the new gymnasium-style API!
60
+ From Ray 2.3 on, RLlib only supports the new (gym>=0.26 or gymnasium) Env APIs.
61
+ {}
62
+ Learn more about the most important changes here:
63
+ https://github.com/openai/gym and here: https://github.com/Farama-Foundation/Gymnasium
64
+
65
+ In order to fix this problem, do the following:
66
+
67
+ 1) Run `pip install gymnasium` on your command line.
68
+ 2) Change all your import statements in your code from
69
+ `import gym` -> `import gymnasium as gym` OR
70
+ `from gym.spaces import Discrete` -> `from gymnasium.spaces import Discrete`
71
+
72
+ For your custom (single agent) gym.Env classes:
73
+ 3.1) Either wrap your old Env class via the provided `from gymnasium.wrappers import
74
+ EnvCompatibility` wrapper class.
75
+ 3.2) Alternatively to 3.1:
76
+ - Change your `reset()` method to have the call signature 'def reset(self, *,
77
+ seed=None, options=None)'
78
+ - Return an additional info dict (empty dict should be fine) from your `reset()`
79
+ method.
80
+ - Return an additional `truncated` flag from your `step()` method (between `done` and
81
+ `info`). This flag should indicate, whether the episode was terminated prematurely
82
+ due to some time constraint or other kind of horizon setting.
83
+
84
+ For your custom RLlib `MultiAgentEnv` classes:
85
+ 4.1) Either wrap your old MultiAgentEnv via the provided
86
+ `from ray.rllib.env.wrappers.multi_agent_env_compatibility import
87
+ MultiAgentEnvCompatibility` wrapper class.
88
+ 4.2) Alternatively to 4.1:
89
+ - Change your `reset()` method to have the call signature
90
+ 'def reset(self, *, seed=None, options=None)'
91
+ - Return an additional per-agent info dict (empty dict should be fine) from your
92
+ `reset()` method.
93
+ - Rename `dones` into `terminateds` and only set this to True, if the episode is really
94
+ done (as opposed to has been terminated prematurely due to some horizon/time-limit
95
+ setting).
96
+ - Return an additional `truncateds` per-agent dictionary flag from your `step()`
97
+ method, including the `__all__` key (100% analogous to your `dones/terminateds`
98
+ per-agent dict).
99
+ Return this new `truncateds` dict between `dones/terminateds` and `infos`. This
100
+ flag should indicate, whether the episode (for some agent or all agents) was
101
+ terminated prematurely due to some time constraint or other kind of horizon setting.
102
+ """ # noqa
103
+
104
+
105
+ ERR_MSG_TF_POLICY_CANNOT_SAVE_KERAS_MODEL = """Could not save keras model under self[TfPolicy].model.base_model!
106
+ This is either due to ..
107
+ a) .. this Policy's ModelV2 not having any `base_model` (tf.keras.Model) property
108
+ b) .. the ModelV2's `base_model` not being used by the Algorithm and thus its
109
+ variables not being properly initialized.
110
+ """ # noqa
111
+
112
+ ERR_MSG_TORCH_POLICY_CANNOT_SAVE_MODEL = """Could not save torch model under self[TorchPolicy].model!
113
+ This is most likely due to the fact that you are using an Algorithm that
114
+ uses a Catalog-generated TorchModelV2 subclass, which is torch.save() cannot pickle.
115
+ """ # noqa
116
+
117
+ # -------
118
+ # HOWTO_ strings can be added to any error/warning/into message
119
+ # to eplain to the user, how to actually fix the encountered problem.
120
+ # -------
121
+
122
+ # HOWTO change the RLlib config, depending on how user runs the job.
123
+ HOWTO_CHANGE_CONFIG = """
124
+ To change the config for `tune.Tuner().fit()` in a script: Modify the python dict
125
+ passed to `tune.Tuner(param_space=[...]).fit()`.
126
+ To change the config for an RLlib Algorithm instance: Modify the python dict
127
+ passed to the Algorithm's constructor, e.g. `PPO(config=[...])`.
128
+ """
.venv/lib/python3.11/site-packages/ray/rllib/utils/filter_manager.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from typing import Optional
3
+
4
+ import ray
5
+ from ray.rllib.utils.annotations import OldAPIStack
6
+
7
+ logger = logging.getLogger(__name__)
8
+
9
+
10
+ @OldAPIStack
11
+ class FilterManager:
12
+ """Manages filters and coordination across remote evaluators that expose
13
+ `get_filters` and `sync_filters`.
14
+ """
15
+
16
+ @staticmethod
17
+ def synchronize(
18
+ local_filters,
19
+ worker_set,
20
+ update_remote=True,
21
+ timeout_seconds: Optional[float] = None,
22
+ use_remote_data_for_update: bool = True,
23
+ ):
24
+ """Aggregates filters from remote workers (if use_remote_data_for_update=True).
25
+
26
+ Local copy is updated and then broadcasted to all remote evaluators
27
+ (if `update_remote` is True).
28
+
29
+ Args:
30
+ local_filters: Filters to be synchronized.
31
+ worker_set: EnvRunnerGroup with remote EnvRunners with filters.
32
+ update_remote: Whether to push updates from the local filters to the remote
33
+ workers' filters.
34
+ timeout_seconds: How long to wait for filter to get or set filters
35
+ use_remote_data_for_update: Whether to use the `worker_set`'s remote workers
36
+ to update the local filters. If False, stats from the remote workers
37
+ will not be used and discarded.
38
+ """
39
+ # No sync/update required in either direction -> Early out.
40
+ if not (update_remote or use_remote_data_for_update):
41
+ return
42
+
43
+ logger.debug(f"Synchronizing filters: {local_filters}")
44
+
45
+ # Get the filters from the remote workers.
46
+ remote_filters = worker_set.foreach_env_runner(
47
+ func=lambda worker: worker.get_filters(flush_after=True),
48
+ local_env_runner=False,
49
+ timeout_seconds=timeout_seconds,
50
+ )
51
+ if len(remote_filters) != worker_set.num_healthy_remote_workers():
52
+ logger.error(
53
+ "Failed to get remote filters from a rollout worker in "
54
+ "FilterManager! "
55
+ "Filtered metrics may be computed, but filtered wrong."
56
+ )
57
+
58
+ # Should we utilize the remote workers' filter stats to update the local
59
+ # filters?
60
+ if use_remote_data_for_update:
61
+ for rf in remote_filters:
62
+ for k in local_filters:
63
+ local_filters[k].apply_changes(rf[k], with_buffer=False)
64
+
65
+ # Should we update the remote workers' filters from the (now possibly synched)
66
+ # local filters?
67
+ if update_remote:
68
+ copies = {k: v.as_serializable() for k, v in local_filters.items()}
69
+ remote_copy = ray.put(copies)
70
+
71
+ logger.debug("Updating remote filters ...")
72
+ results = worker_set.foreach_env_runner(
73
+ func=lambda worker: worker.sync_filters(ray.get(remote_copy)),
74
+ local_env_runner=False,
75
+ timeout_seconds=timeout_seconds,
76
+ )
77
+ if len(results) != worker_set.num_healthy_remote_workers():
78
+ logger.error(
79
+ "Failed to set remote filters to a rollout worker in "
80
+ "FilterManager. "
81
+ "Filtered metrics may be computed, but filtered wrong."
82
+ )