diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/evaluation/__pycache__/observation_function.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/evaluation/__pycache__/observation_function.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d3e28fd77a6d874c21c8deb9faf988e8e0eb53db Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/evaluation/__pycache__/observation_function.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/evaluation/__pycache__/sample_batch_builder.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/evaluation/__pycache__/sample_batch_builder.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1ada321da8979f0a32c555dc1c986b5ad3a5da21 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/evaluation/__pycache__/sample_batch_builder.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/evaluation/__pycache__/worker_set.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/evaluation/__pycache__/worker_set.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7791fc17feb7f573f5f82c2a9288b420b27fe891 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/evaluation/__pycache__/worker_set.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/evaluation/collectors/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/evaluation/collectors/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/evaluation/collectors/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/evaluation/collectors/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..327801a0ffdc19da86cc7e6c3f90170694c09118 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/evaluation/collectors/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/evaluation/collectors/__pycache__/agent_collector.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/evaluation/collectors/__pycache__/agent_collector.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4b22db4ec1586e5ac1f371538235c6d7b24462b3 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/evaluation/collectors/__pycache__/agent_collector.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/evaluation/collectors/__pycache__/sample_collector.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/evaluation/collectors/__pycache__/sample_collector.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..41ca589ef40746112ca7cd6f0612dab60b48720a Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/evaluation/collectors/__pycache__/sample_collector.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/evaluation/collectors/__pycache__/simple_list_collector.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/evaluation/collectors/__pycache__/simple_list_collector.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1301013fd4111457d9606dcb95081296346cb0a4 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/evaluation/collectors/__pycache__/simple_list_collector.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/evaluation/collectors/agent_collector.py b/.venv/lib/python3.11/site-packages/ray/rllib/evaluation/collectors/agent_collector.py new file mode 100644 index 
0000000000000000000000000000000000000000..0628cbcb9718cbae1e4539ca07f0f2f7b25a0989 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/evaluation/collectors/agent_collector.py @@ -0,0 +1,688 @@ +import copy +import logging +import math +from typing import Any, Dict, List, Optional + +import numpy as np +import tree # pip install dm_tree +from gymnasium.spaces import Space + +from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.policy.view_requirement import ViewRequirement +from ray.rllib.utils.annotations import OldAPIStack +from ray.rllib.utils.framework import try_import_torch +from ray.rllib.utils.spaces.space_utils import ( + flatten_to_single_ndarray, + get_dummy_batch_for_space, +) +from ray.rllib.utils.typing import ( + EpisodeID, + EnvID, + TensorType, + ViewRequirementsDict, +) + +logger = logging.getLogger(__name__) + +torch, _ = try_import_torch() + + +def _to_float_np_array(v: List[Any]) -> np.ndarray: + if torch and torch.is_tensor(v[0]): + raise ValueError + arr = np.array(v) + if arr.dtype == np.float64: + return arr.astype(np.float32) # save some memory + return arr + + +def _get_buffered_slice_with_paddings(d, inds): + element_at_t = [] + for index in inds: + if index < len(d): + element_at_t.append(d[index]) + else: + # zero pad similar to the last element. + element_at_t.append(tree.map_structure(np.zeros_like, d[-1])) + return element_at_t + + +@OldAPIStack +class AgentCollector: + """Collects samples for one agent in one trajectory (episode). + + The agent may be part of a multi-agent environment. Samples are stored in + lists including some possible automatic "shift" buffer at the beginning to + be able to save memory when storing things like NEXT_OBS, PREV_REWARDS, + etc.., which are specified using the trajectory view API. + """ + + _next_unroll_id = 0 # disambiguates unrolls within a single episode + + # TODO: @kourosh add different types of padding. e.g. zeros vs. 
same + def __init__( + self, + view_reqs: ViewRequirementsDict, + *, + max_seq_len: int = 1, + disable_action_flattening: bool = True, + intial_states: Optional[List[TensorType]] = None, + is_policy_recurrent: bool = False, + is_training: bool = True, + _enable_new_api_stack: bool = False, + ): + """Initialize an AgentCollector. + + Args: + view_reqs: A dict of view requirements for the agent. + max_seq_len: The maximum sequence length to store. + disable_action_flattening: If True, don't flatten the action. + intial_states: The initial states from the policy.get_initial_states() + is_policy_recurrent: If True, the policy is recurrent. + is_training: Sets the is_training flag for the buffers. if True, all the + timesteps are stored in the buffers until explictly build_for_training + () is called. if False, only the content required for the last time + step is stored in the buffers. This will save memory during inference. + You can change the behavior at runtime by calling is_training(mode). + """ + self.max_seq_len = max_seq_len + self.disable_action_flattening = disable_action_flattening + self.view_requirements = view_reqs + # The initial_states can be an np array + self.initial_states = intial_states if intial_states is not None else [] + self.is_policy_recurrent = is_policy_recurrent + self._is_training = is_training + self._enable_new_api_stack = _enable_new_api_stack + + # Determine the size of the buffer we need for data before the actual + # episode starts. This is used for 0-buffering of e.g. prev-actions, + # or internal state inputs. + view_req_shifts = [ + min(vr.shift_arr) + - int((vr.data_col or k) in [SampleBatch.OBS, SampleBatch.INFOS]) + for k, vr in view_reqs.items() + ] + self.shift_before = -min(view_req_shifts) + + # The actual data buffers. Keys are column names, values are lists + # that contain the sub-components (e.g. for complex obs spaces) with + # each sub-component holding a list of per-timestep tensors. 
+ # E.g.: obs-space = Dict(a=Discrete(2), b=Box((2,))) + # buffers["obs"] = [ + # [0, 1], # <- 1st sub-component of observation + # [np.array([.2, .3]), np.array([.0, -.2])] # <- 2nd sub-component + # ] + # NOTE: infos and state_out... are not flattened due to them often + # using custom dict values whose structure may vary from timestep to + # timestep. + self.buffers: Dict[str, List[List[TensorType]]] = {} + # Maps column names to an example data item, which may be deeply + # nested. These are used such that we'll know how to unflatten + # the flattened data inside self.buffers when building the + # SampleBatch. + self.buffer_structs: Dict[str, Any] = {} + # The episode ID for the agent for which we collect data. + self.episode_id = None + # The unroll ID, unique across all rollouts (within a RolloutWorker). + self.unroll_id = None + # The simple timestep count for this agent. Gets increased by one + # each time a (non-initial!) observation is added. + self.agent_steps = 0 + # Keep track of view requirements that have a view on columns that we gain from + # inference and also need for inference. These have dummy values appended in + # buffers to account for the missing value when building for inference + # Example: We have one 'state_in' view requirement that has a view on our + # state_outs at t=[-10, ..., -1]. At any given build_for_inference()-call, + # the buffer must contain eleven values from t=[-10, ..., 0] for us to index + # properly. Since state_out at t=0 is missing, we substitute it with a buffer + # value that should never make it into batches built for training. 
+ self.data_cols_with_dummy_values = set() + + @property + def training(self) -> bool: + return self._is_training + + def is_training(self, is_training: bool) -> None: + self._is_training = is_training + + def is_empty(self) -> bool: + """Returns True if this collector has no data.""" + return not self.buffers or all(len(item) == 0 for item in self.buffers.values()) + + def add_init_obs( + self, + episode_id: EpisodeID, + agent_index: int, + env_id: EnvID, + init_obs: TensorType, + init_infos: Optional[Dict[str, TensorType]] = None, + t: int = -1, + ) -> None: + """Adds an initial observation (after reset) to the Agent's trajectory. + + Args: + episode_id: Unique ID for the episode we are adding the + initial observation for. + agent_index: Unique int index (starting from 0) for the agent + within its episode. Not to be confused with AGENT_ID (Any). + env_id: The environment index (in a vectorized setup). + init_obs: The initial observation tensor (after `env.reset()`). + init_infos: The initial infos dict (after `env.reset()`). + t: The time step (episode length - 1). The initial obs has + ts=-1(!), then an action/reward/next-obs at t=0, etc.. + """ + # Store episode ID + unroll ID, which will be constant throughout this + # AgentCollector's lifecycle. + self.episode_id = episode_id + if self.unroll_id is None: + self.unroll_id = AgentCollector._next_unroll_id + AgentCollector._next_unroll_id += 1 + + # convert init_obs to np.array (in case it is a list) + if isinstance(init_obs, list): + init_obs = np.array(init_obs) + + if SampleBatch.OBS not in self.buffers: + single_row = { + SampleBatch.OBS: init_obs, + SampleBatch.INFOS: init_infos or {}, + SampleBatch.AGENT_INDEX: agent_index, + SampleBatch.ENV_ID: env_id, + SampleBatch.T: t, + SampleBatch.EPS_ID: self.episode_id, + SampleBatch.UNROLL_ID: self.unroll_id, + } + + # TODO (Artur): Remove when PREV_ACTIONS and PREV_REWARDS get deprecated. 
+ # Note (Artur): As long as we have these in our default view requirements, + # we should build buffers with neutral elements instead of building them + # on the first AgentCollector.build_for_inference call if present. + # This prevents us from accidentally building buffers with duplicates of + # the first incoming value. + if SampleBatch.PREV_REWARDS in self.view_requirements: + single_row[SampleBatch.REWARDS] = get_dummy_batch_for_space( + space=self.view_requirements[SampleBatch.REWARDS].space, + batch_size=0, + fill_value=0.0, + ) + if SampleBatch.PREV_ACTIONS in self.view_requirements: + potentially_flattened_batch = get_dummy_batch_for_space( + space=self.view_requirements[SampleBatch.ACTIONS].space, + batch_size=0, + fill_value=0.0, + ) + if not self.disable_action_flattening: + potentially_flattened_batch = flatten_to_single_ndarray( + potentially_flattened_batch + ) + single_row[SampleBatch.ACTIONS] = potentially_flattened_batch + self._build_buffers(single_row) + + # Append data to existing buffers. + flattened = tree.flatten(init_obs) + for i, sub_obs in enumerate(flattened): + self.buffers[SampleBatch.OBS][i].append(sub_obs) + self.buffers[SampleBatch.INFOS][0].append(init_infos or {}) + self.buffers[SampleBatch.AGENT_INDEX][0].append(agent_index) + self.buffers[SampleBatch.ENV_ID][0].append(env_id) + self.buffers[SampleBatch.T][0].append(t) + self.buffers[SampleBatch.EPS_ID][0].append(self.episode_id) + self.buffers[SampleBatch.UNROLL_ID][0].append(self.unroll_id) + + def add_action_reward_next_obs(self, input_values: Dict[str, TensorType]) -> None: + """Adds the given dictionary (row) of values to the Agent's trajectory. + + Args: + values: Data dict (interpreted as a single row) to be added to buffer. + Must contain keys: + SampleBatch.ACTIONS, REWARDS, TERMINATEDS, TRUNCATEDS, and NEXT_OBS. + """ + if self.unroll_id is None: + self.unroll_id = AgentCollector._next_unroll_id + AgentCollector._next_unroll_id += 1 + + # Next obs -> obs. 
+ values = copy.copy(input_values) + assert SampleBatch.OBS not in values + values[SampleBatch.OBS] = values[SampleBatch.NEXT_OBS] + del values[SampleBatch.NEXT_OBS] + + # convert obs to np.array (in case it is a list) + if isinstance(values[SampleBatch.OBS], list): + values[SampleBatch.OBS] = np.array(values[SampleBatch.OBS]) + + # Default to next timestep if not provided in input values + if SampleBatch.T not in input_values: + values[SampleBatch.T] = self.buffers[SampleBatch.T][0][-1] + 1 + + # Make sure EPS_ID/UNROLL_ID stay the same for this agent. + if SampleBatch.EPS_ID in values: + assert values[SampleBatch.EPS_ID] == self.episode_id + del values[SampleBatch.EPS_ID] + self.buffers[SampleBatch.EPS_ID][0].append(self.episode_id) + if SampleBatch.UNROLL_ID in values: + assert values[SampleBatch.UNROLL_ID] == self.unroll_id + del values[SampleBatch.UNROLL_ID] + self.buffers[SampleBatch.UNROLL_ID][0].append(self.unroll_id) + + for k, v in values.items(): + if k not in self.buffers: + if self.training and k.startswith("state_out"): + vr = self.view_requirements[k] + data_col = vr.data_col or k + self._fill_buffer_with_initial_values( + data_col, vr, build_for_inference=False + ) + else: + self._build_buffers({k: v}) + # Do not flatten infos, state_out and (if configured) actions. + # Infos/state-outs may be structs that change from timestep to + # timestep. 
+ should_flatten_action_key = ( + k == SampleBatch.ACTIONS and not self.disable_action_flattening + ) + # Note (Artur) RL Modules's states need no flattening + should_flatten_state_key = ( + k.startswith("state_out") and not self._enable_new_api_stack + ) + if ( + k == SampleBatch.INFOS + or should_flatten_state_key + or should_flatten_action_key + ): + if should_flatten_action_key: + v = flatten_to_single_ndarray(v) + # Briefly remove dummy value to add to buffer + if k in self.data_cols_with_dummy_values: + dummy = self.buffers[k][0].pop(-1) + self.buffers[k][0].append(v) + # Add back dummy value + if k in self.data_cols_with_dummy_values: + self.buffers[k][0].append(dummy) + # Flatten all other columns. + else: + flattened = tree.flatten(v) + for i, sub_list in enumerate(self.buffers[k]): + # Briefly remove dummy value to add to buffer + if k in self.data_cols_with_dummy_values: + dummy = sub_list.pop(-1) + sub_list.append(flattened[i]) + # Add back dummy value + if k in self.data_cols_with_dummy_values: + sub_list.append(dummy) + + # In inference mode, we don't need to keep all of trajectory in memory + # we only need to keep the steps required. We can pop from the beginning to + # create room for new data. + if not self.training: + for k in self.buffers: + for sub_list in self.buffers[k]: + if sub_list: + sub_list.pop(0) + + self.agent_steps += 1 + + def build_for_inference(self) -> SampleBatch: + """During inference, we will build a SampleBatch with a batch size of 1 that + can then be used to run the forward pass of a policy. This data will only + include the enviornment context for running the policy at the last timestep. + + Returns: + A SampleBatch with a batch size of 1. + """ + + batch_data = {} + np_data = {} + for view_col, view_req in self.view_requirements.items(): + # Create the batch of data from the different buffers. + data_col = view_req.data_col or view_col + + # if this view is not for inference, skip it. 
+ if not view_req.used_for_compute_actions: + continue + + if np.any(view_req.shift_arr > 0): + raise ValueError( + f"During inference the agent can only use past observations to " + f"respect causality. However, view_col = {view_col} seems to " + f"depend on future indices {view_req.shift_arr}, while the " + f"used_for_compute_actions flag is set to True. Please fix the " + f"discrepancy. Hint: If you are using a custom model make sure " + f"the view_requirements are initialized properly and is point " + f"only refering to past timesteps during inference." + ) + + # Some columns don't exist yet + # (get created during postprocessing or depend on state_out). + if data_col not in self.buffers: + self._fill_buffer_with_initial_values( + data_col, view_req, build_for_inference=True + ) + self._prepare_for_data_cols_with_dummy_values(data_col) + + # Keep an np-array cache, so we don't have to regenerate the + # np-array for different view_cols using to the same data_col. + self._cache_in_np(np_data, data_col) + + data = [] + for d in np_data[data_col]: + # if shift_arr = [0] the data will be just the last time step + # (len(d) - 1), if shift_arr = [-1] the data will be just the timestep + # before the last one (len(d) - 2) and so on. + element_at_t = d[view_req.shift_arr + len(d) - 1] + if element_at_t.shape[0] == 1: + # We'd normally squeeze here to remove the time dim, but we'll + # simply use the time dim as the batch dim. + data.append(element_at_t) + continue + # add the batch dimension with [None] + data.append(element_at_t[None]) + + # We unflatten even if data is empty here, because the structure might be + # nested with empty leafs and so we still need to reconstruct it. + # This is useful because we spec-check states in RLModules and these + # states can sometimes be nested dicts with empty leafs. 
+ batch_data[view_col] = self._unflatten_as_buffer_struct(data, data_col) + + batch = self._get_sample_batch(batch_data) + return batch + + # TODO: @kouorsh we don't really need view_requirements anymore since it's already + # an attribute of the class + def build_for_training( + self, view_requirements: ViewRequirementsDict + ) -> SampleBatch: + """Builds a SampleBatch from the thus-far collected agent data. + + If the episode/trajectory has no TERMINATED|TRUNCATED=True at the end, will + copy the necessary n timesteps at the end of the trajectory back to the + beginning of the buffers and wait for new samples coming in. + SampleBatches created by this method will be ready for postprocessing + by a Policy. + + Args: + view_requirements: The viewrequirements dict needed to build the + SampleBatch from the raw buffers (which may have data shifts as well as + mappings from view-col to data-col in them). + + Returns: + SampleBatch: The built SampleBatch for this agent, ready to go into + postprocessing. + """ + batch_data = {} + np_data = {} + for view_col, view_req in view_requirements.items(): + # Create the batch of data from the different buffers. + data_col = view_req.data_col or view_col + + if data_col not in self.buffers: + is_state = self._fill_buffer_with_initial_values( + data_col, view_req, build_for_inference=False + ) + + # We need to skip this view_col if it does not exist in the buffers and + # is not an RNN state because it could be the special keys that gets + # added by policy's postprocessing function for training. + if not is_state: + continue + + # OBS and INFOS are already shifted by -1 (the initial obs/info starts one + # ts before all other data columns). + obs_shift = -1 if data_col in [SampleBatch.OBS, SampleBatch.INFOS] else 0 + + # Keep an np-array cache so we don't have to regenerate the + # np-array for different view_cols using to the same data_col. 
+ self._cache_in_np(np_data, data_col) + + # Go through each time-step in the buffer and construct the view + # accordingly. + data = [] + for d in np_data[data_col]: + shifted_data = [] + + # batch_repeat_value determines how many time steps should we skip + # before we repeat indexing the data. + # Example: batch_repeat_value=10, shift_arr = [-3, -2, -1], + # shift_before = 3 + # buffer = [-3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] + # resulting_data = [[-3, -2, -1], [7, 8, 9]] + # explanation: For t=0, we output [-3, -2, -1]. We then skip 10 time + # steps ahead and get to t=10. For t=10, we output [7, 8, 9]. We skip + # 10 more time steps and get to t=20. but since t=20 is out of bound we + # stop. + + # count computes the number of time steps that we need to consider. + # if batch_repeat_value = 1, this number should be the length of + # episode so far, which is len(buffer) - shift_before (-1 if this + # value was gained during inference. This is because we keep a dummy + # value at the last position of the buffer that makes it one longer). + count = int( + math.ceil( + ( + len(d) + - int(data_col in self.data_cols_with_dummy_values) + - self.shift_before + ) + / view_req.batch_repeat_value + ) + ) + for i in range(count): + # the indices for time step t + inds = ( + self.shift_before + + obs_shift + + view_req.shift_arr + + (i * view_req.batch_repeat_value) + ) + + # handle the case where the inds are out of bounds from the end. + # if during the indexing any of the indices are out of bounds, we + # need to use padding on the end to fill in the missing indices. + # Create padding first time we encounter data + if max(inds) < len(d): + # Simple case where we can simply pick slices from buffer + element_at_t = d[inds] + else: + # Case in which we have to pad because buffer has insufficient + # length. This branch takes more time than simply picking + # slices we try to avoid it. 
+ element_at_t = _get_buffered_slice_with_paddings(d, inds) + element_at_t = np.stack(element_at_t) + + if element_at_t.shape[0] == 1: + # Remove the T dimension if it is 1. + element_at_t = element_at_t[0] + shifted_data.append(element_at_t) + + # in some multi-agent cases shifted_data may be an empty list. + # In this case we should just create an empty array and return it. + if shifted_data: + shifted_data_np = np.stack(shifted_data, 0) + else: + shifted_data_np = np.array(shifted_data) + data.append(shifted_data_np) + + # We unflatten even if data is empty here, because the structure might be + # nested with empty leafs and so we still need to reconstruct it. + # This is useful because we spec-check states in RLModules and these + # states can sometimes be nested dicts with empty leafs. + batch_data[view_col] = self._unflatten_as_buffer_struct(data, data_col) + + batch = self._get_sample_batch(batch_data) + + # This trajectory is continuing -> Copy data at the end (in the size of + # self.shift_before) to the beginning of buffers and erase everything + # else. + if ( + SampleBatch.TERMINATEDS in self.buffers + and not self.buffers[SampleBatch.TERMINATEDS][0][-1] + and SampleBatch.TRUNCATEDS in self.buffers + and not self.buffers[SampleBatch.TRUNCATEDS][0][-1] + ): + # Copy data to beginning of buffer and cut lists. + if self.shift_before > 0: + for k, data in self.buffers.items(): + # Loop through + for i in range(len(data)): + self.buffers[k][i] = data[i][-self.shift_before :] + self.agent_steps = 0 + + # Reset our unroll_id. + self.unroll_id = None + + return batch + + def _build_buffers(self, single_row: Dict[str, TensorType]) -> None: + """Builds the buffers for sample collection, given an example data row. + + Args: + single_row (Dict[str, TensorType]): A single row (keys=column + names) of data to base the buffers on. 
+ """ + for col, data in single_row.items(): + if col in self.buffers: + continue + + shift = self.shift_before - ( + 1 + if col + in [ + SampleBatch.OBS, + SampleBatch.INFOS, + SampleBatch.EPS_ID, + SampleBatch.AGENT_INDEX, + SampleBatch.ENV_ID, + SampleBatch.T, + SampleBatch.UNROLL_ID, + ] + else 0 + ) + + # Store all data as flattened lists, except INFOS and state-out + # lists. These are monolithic items (infos is a dict that + # should not be further split, same for state-out items, which + # could be custom dicts as well). + should_flatten_action_key = ( + col == SampleBatch.ACTIONS and not self.disable_action_flattening + ) + # Note (Artur) RL Modules's states need no flattening + should_flatten_state_key = ( + col.startswith("state_out") and not self._enable_new_api_stack + ) + if ( + col == SampleBatch.INFOS + or should_flatten_state_key + or should_flatten_action_key + ): + if should_flatten_action_key: + data = flatten_to_single_ndarray(data) + self.buffers[col] = [[data for _ in range(shift)]] + else: + self.buffers[col] = [ + [v for _ in range(shift)] for v in tree.flatten(data) + ] + # Store an example data struct so we know, how to unflatten + # each data col. + self.buffer_structs[col] = data + + def _get_sample_batch(self, batch_data: Dict[str, TensorType]) -> SampleBatch: + """Returns a SampleBatch from the given data dictionary. Also updates the + sequence information based on the max_seq_len.""" + + # Due to possible batch-repeats > 1, columns in the resulting batch + # may not all have the same batch size. + batch = SampleBatch(batch_data, is_training=self.training) + + # Adjust the seq-lens array depending on the incoming agent sequences. 
+ if self.is_policy_recurrent: + seq_lens = [] + max_seq_len = self.max_seq_len + count = batch.count + while count > 0: + seq_lens.append(min(count, max_seq_len)) + count -= max_seq_len + batch["seq_lens"] = np.array(seq_lens) + batch.max_seq_len = max_seq_len + + return batch + + def _cache_in_np(self, cache_dict: Dict[str, List[np.ndarray]], key: str) -> None: + """Caches the numpy version of the key in the buffer dict.""" + if key not in cache_dict: + cache_dict[key] = [_to_float_np_array(d) for d in self.buffers[key]] + + def _unflatten_as_buffer_struct( + self, data: List[np.ndarray], key: str + ) -> np.ndarray: + """Unflattens the given to match the buffer struct format for that key.""" + if key not in self.buffer_structs: + return data[0] + + return tree.unflatten_as(self.buffer_structs[key], data) + + def _fill_buffer_with_initial_values( + self, + data_col: str, + view_requirement: ViewRequirement, + build_for_inference: bool = False, + ) -> bool: + """Fills the buffer with the initial values for the given data column. + for dat_col starting with `state_out`, use the initial states of the policy, + but for other data columns, create a dummy value based on the view requirement + space. + + Args: + data_col: The data column to fill the buffer with. + view_requirement: The view requirement for the view_col. Normally the view + requirement for the data column is used and if it does not exist for + some reason the view requirement for view column is used instead. + build_for_inference: Whether this is getting called for inference or not. + + returns: + is_state: True if the data_col is an RNN state, False otherwise. 
+ """ + try: + space = self.view_requirements[data_col].space + except KeyError: + space = view_requirement.space + + # special treatment for state_out + # add them to the buffer in case they don't exist yet + is_state = True + if data_col.startswith("state_out"): + if self._enable_new_api_stack: + self._build_buffers({data_col: self.initial_states}) + else: + if not self.is_policy_recurrent: + raise ValueError( + f"{data_col} is not available, because the given policy is" + f"not recurrent according to the input model_inital_states." + f"Have you forgotten to return non-empty lists in" + f"policy.get_initial_states()?" + ) + state_ind = int(data_col.split("_")[-1]) + self._build_buffers({data_col: self.initial_states[state_ind]}) + else: + is_state = False + # only create dummy data during inference + if build_for_inference: + if isinstance(space, Space): + # state_out assumes the values do not have a batch dimension + # (i.e. instead of being (1, d) it is of shape (d,). + fill_value = get_dummy_batch_for_space( + space, + batch_size=0, + ) + else: + fill_value = space + + self._build_buffers({data_col: fill_value}) + + return is_state + + def _prepare_for_data_cols_with_dummy_values(self, data_col): + self.data_cols_with_dummy_values.add(data_col) + # For items gained during inference, we append a dummy value here so + # that view requirements viewing these is not shifted by 1 + for b in self.buffers[data_col]: + b.append(b[-1]) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/evaluation/collectors/sample_collector.py b/.venv/lib/python3.11/site-packages/ray/rllib/evaluation/collectors/sample_collector.py new file mode 100644 index 0000000000000000000000000000000000000000..75dbb5d040a5a11e042a8d0cd18ca640f828eefd --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/evaluation/collectors/sample_collector.py @@ -0,0 +1,298 @@ +import logging +from abc import ABCMeta, abstractmethod +from typing import TYPE_CHECKING, Dict, List, Optional, Union 
+ +from ray.rllib.policy.policy_map import PolicyMap +from ray.rllib.policy.sample_batch import MultiAgentBatch, SampleBatch +from ray.rllib.utils.annotations import OldAPIStack +from ray.rllib.utils.typing import AgentID, EnvID, EpisodeID, PolicyID, TensorType + +if TYPE_CHECKING: + from ray.rllib.callbacks.callbacks import RLlibCallback + +logger = logging.getLogger(__name__) + + +# fmt: off +# __sphinx_doc_begin__ +@OldAPIStack +class SampleCollector(metaclass=ABCMeta): + """Collects samples for all policies and agents from a multi-agent env. + + This API is controlled by RolloutWorker objects to store all data + generated by Environments and Policies/Models during rollout and + postprocessing. It's purposes are to a) make data collection and + SampleBatch/input_dict generation from this data faster, b) to unify + the way we collect samples from environments and model (outputs), thereby + allowing for possible user customizations, c) to allow for more complex + inputs fed into different policies (e.g. multi-agent case with inter-agent + communication channel). + """ + + def __init__(self, + policy_map: PolicyMap, + clip_rewards: Union[bool, float], + callbacks: "RLlibCallback", + multiple_episodes_in_batch: bool = True, + rollout_fragment_length: int = 200, + count_steps_by: str = "env_steps"): + """Initializes a SampleCollector instance. + + Args: + policy_map: Maps policy ids to policy instances. + clip_rewards (Union[bool, float]): Whether to clip rewards before + postprocessing (at +/-1.0) or the actual value to +/- clip. + callbacks: RLlib callbacks. + multiple_episodes_in_batch: Whether it's allowed to pack + multiple episodes into the same built batch. 
+ rollout_fragment_length: The + + """ + + self.policy_map = policy_map + self.clip_rewards = clip_rewards + self.callbacks = callbacks + self.multiple_episodes_in_batch = multiple_episodes_in_batch + self.rollout_fragment_length = rollout_fragment_length + self.count_steps_by = count_steps_by + + @abstractmethod + def add_init_obs( + self, + *, + episode, + agent_id: AgentID, + policy_id: PolicyID, + init_obs: TensorType, + init_infos: Optional[Dict[str, TensorType]] = None, + t: int = -1, + ) -> None: + """Adds an initial obs (after reset) to this collector. + + Since the very first observation in an environment is collected w/o + additional data (w/o actions, w/o reward) after env.reset() is called, + this method initializes a new trajectory for a given agent. + `add_init_obs()` has to be called first for each agent/episode-ID + combination. After this, only `add_action_reward_next_obs()` must be + called for that same agent/episode-pair. + + Args: + episode: The Episode, for which we + are adding an Agent's initial observation. + agent_id: Unique id for the agent we are adding + values for. + env_id: The environment index (in a vectorized setup). + policy_id: Unique id for policy controlling the agent. + init_obs: Initial observation (after env.reset()). + init_obs: Initial observation (after env.reset()). + init_infos: Initial infos dict (after env.reset()). + t: The time step (episode length - 1). The initial obs has + ts=-1(!), then an action/reward/next-obs at t=0, etc.. + + .. 
testcode:: + :skipif: True + + obs, infos = env.reset() + collector.add_init_obs( + episode=my_episode, + agent_id=0, + policy_id="pol0", + t=-1, + init_obs=obs, + init_infos=infos, + ) + obs, r, terminated, truncated, info = env.step(action) + collector.add_action_reward_next_obs(12345, 0, "pol0", False, { + "action": action, "obs": obs, "reward": r, "terminated": terminated, + "truncated": truncated, "info": info + }) + """ + raise NotImplementedError + + @abstractmethod + def add_action_reward_next_obs( + self, + episode_id: EpisodeID, + agent_id: AgentID, + env_id: EnvID, + policy_id: PolicyID, + agent_done: bool, + values: Dict[str, TensorType], + ) -> None: + """Add the given dictionary (row) of values to this collector. + + The incoming data (`values`) must include action, reward, terminated, truncated, + and next_obs information and may include any other information. + For the initial observation (after Env.reset()) of the given agent/ + episode-ID combination, `add_initial_obs()` must be called instead. + + Args: + episode_id: Unique id for the episode we are adding + values for. + agent_id: Unique id for the agent we are adding + values for. + env_id: The environment index (in a vectorized setup). + policy_id: Unique id for policy controlling the agent. + agent_done: Whether the given agent is done (terminated or truncated) with + its trajectory (the multi-agent episode may still be ongoing). + values (Dict[str, TensorType]): Row of values to add for this + agent. This row must contain the keys SampleBatch.ACTION, + REWARD, NEW_OBS, TERMINATED, and TRUNCATED. + + .. 
testcode:: + :skipif: True + + obs, info = env.reset() + collector.add_init_obs(12345, 0, "pol0", obs) + obs, r, terminated, truncated, info = env.step(action) + collector.add_action_reward_next_obs( + 12345, + 0, + "pol0", + agent_done=False, + values={ + "action": action, "obs": obs, "reward": r, + "terminated": terminated, "truncated": truncated + }, + ) + """ + raise NotImplementedError + + @abstractmethod + def episode_step(self, episode) -> None: + """Increases the episode step counter (across all agents) by one. + + Args: + episode: Episode we are stepping through. + Useful for handling counting b/c it is called once across + all agents that are inside this episode. + """ + raise NotImplementedError + + @abstractmethod + def total_env_steps(self) -> int: + """Returns total number of env-steps taken so far. + + Thereby, a step in an N-agent multi-agent environment counts as only 1 + for this metric. The returned count contains everything that has not + been built yet (and returned as MultiAgentBatches by the + `try_build_truncated_episode_multi_agent_batch` or + `postprocess_episode(build=True)` methods). After such build, this + counter is reset to 0. + + Returns: + int: The number of env-steps taken in total in the environment(s) + so far. + """ + raise NotImplementedError + + @abstractmethod + def total_agent_steps(self) -> int: + """Returns total number of (individual) agent-steps taken so far. + + Thereby, a step in an N-agent multi-agent environment counts as N. + If less than N agents have stepped (because some agents were not + required to send actions), the count will be increased by less than N. + The returned count contains everything that has not been built yet + (and returned as MultiAgentBatches by the + `try_build_truncated_episode_multi_agent_batch` or + `postprocess_episode(build=True)` methods). After such build, this + counter is reset to 0. 
+ + Returns: + int: The number of (individual) agent-steps taken in total in the + environment(s) so far. + """ + raise NotImplementedError + + # TODO(jungong) : Remove this API call once we completely move to + # connector based sample collection. + @abstractmethod + def get_inference_input_dict(self, policy_id: PolicyID) -> \ + Dict[str, TensorType]: + """Returns an input_dict for an (inference) forward pass from our data. + + The input_dict can then be used for action computations inside a + Policy via `Policy.compute_actions_from_input_dict()`. + + Args: + policy_id: The Policy ID to get the input dict for. + + Returns: + Dict[str, TensorType]: The input_dict to be passed into the ModelV2 + for inference/training. + + .. testcode:: + :skipif: True + + obs, r, terminated, truncated, info = env.step(action) + collector.add_action_reward_next_obs(12345, 0, "pol0", False, { + "action": action, "obs": obs, "reward": r, + "terminated": terminated, "truncated", truncated + }) + input_dict = collector.get_inference_input_dict(policy.model) + action = policy.compute_actions_from_input_dict(input_dict) + # repeat + """ + raise NotImplementedError + + @abstractmethod + def postprocess_episode( + self, + episode, + is_done: bool = False, + check_dones: bool = False, + build: bool = False, + ) -> Optional[MultiAgentBatch]: + """Postprocesses all agents' trajectories in a given episode. + + Generates (single-trajectory) SampleBatches for all Policies/Agents and + calls Policy.postprocess_trajectory on each of these. Postprocessing + may happens in-place, meaning any changes to the viewed data columns + are directly reflected inside this collector's buffers. + Also makes sure that additional (newly created) data columns are + correctly added to the buffers. + + Args: + episode: The Episode object for which + to post-process data. + is_done: Whether the given episode is actually terminated + (all agents are terminated OR truncated). 
If True, the + episode will no longer be used/continued and we may need to + recycle/erase it internally. If a soft-horizon is hit, the + episode will continue to be used and `is_done` should be set + to False here. + check_dones: Whether we need to check that all agents' + trajectories have dones=True at the end. + build: Whether to build a MultiAgentBatch from the given + episode (and only that episode!) and return that + MultiAgentBatch. Used for batch_mode=`complete_episodes`. + + Returns: + Optional[MultiAgentBatch]: If `build` is True, the + SampleBatch or MultiAgentBatch built from `episode` (either + just from that episde or from the `_PolicyCollectorGroup` + in the `episode.batch_builder` property). + """ + raise NotImplementedError + + @abstractmethod + def try_build_truncated_episode_multi_agent_batch(self) -> \ + List[Union[MultiAgentBatch, SampleBatch]]: + """Tries to build an MA-batch, if `rollout_fragment_length` is reached. + + Any unprocessed data will be first postprocessed with a policy + postprocessor. + This is usually called to collect samples for policy training. + If not enough data has been collected yet (`rollout_fragment_length`), + returns an empty list. + + Returns: + List[Union[MultiAgentBatch, SampleBatch]]: Returns a (possibly + empty) list of MultiAgentBatches (containing the accumulated + SampleBatches for each policy or a simple SampleBatch if only + one policy). The list will be empty if + `self.rollout_fragment_length` has not been reached yet. 
+ """ + raise NotImplementedError +# __sphinx_doc_end__ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/__pycache__/complex_input_net.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/__pycache__/complex_input_net.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1a6f376f06cdaa3ca8b2ec20eb536f147a96c860 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/__pycache__/complex_input_net.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/__pycache__/torch_modelv2.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/__pycache__/torch_modelv2.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2a3832b1a02860ce538b733b58a8243803b60c6f Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/__pycache__/torch_modelv2.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/modules/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/modules/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..2585dcc77abe4acd4cd6daf49f902a4117a1438c --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/modules/__init__.py @@ -0,0 +1,13 @@ +from ray.rllib.models.torch.modules.gru_gate import GRUGate +from ray.rllib.models.torch.modules.multi_head_attention import MultiHeadAttention +from ray.rllib.models.torch.modules.relative_multi_head_attention import ( + RelativeMultiHeadAttention, +) +from ray.rllib.models.torch.modules.skip_connection import SkipConnection + +__all__ = [ + "GRUGate", + "RelativeMultiHeadAttention", + "SkipConnection", + "MultiHeadAttention", +] diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/modules/__pycache__/__init__.cpython-311.pyc 
b/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/modules/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9f3d4b594e9b1564ef533d30378699b76a21af9c Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/modules/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/modules/__pycache__/gru_gate.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/modules/__pycache__/gru_gate.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..917dd476eb69d0c42b9edf3afb833d9a3f308e8e Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/modules/__pycache__/gru_gate.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/modules/__pycache__/multi_head_attention.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/modules/__pycache__/multi_head_attention.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1e6602e0c0fcf30bc13ce25abafd1a3e8d2c2916 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/modules/__pycache__/multi_head_attention.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/modules/__pycache__/noisy_layer.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/modules/__pycache__/noisy_layer.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8e985f98126a4489651e70e023beb2962a4e52db Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/modules/__pycache__/noisy_layer.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/modules/__pycache__/relative_multi_head_attention.cpython-311.pyc 
b/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/modules/__pycache__/relative_multi_head_attention.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9bed8beed66cddd8a4948e908f6549159abc9121 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/modules/__pycache__/relative_multi_head_attention.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/modules/__pycache__/skip_connection.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/modules/__pycache__/skip_connection.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ef20490676b30b24725529e07930a564df135ad0 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/modules/__pycache__/skip_connection.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/modules/skip_connection.py b/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/modules/skip_connection.py new file mode 100644 index 0000000000000000000000000000000000000000..444c1680686153b04d44abe61fb581b94fbc49b3 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/modules/skip_connection.py @@ -0,0 +1,43 @@ +from ray.rllib.utils.annotations import OldAPIStack +from ray.rllib.utils.framework import try_import_torch +from ray.rllib.utils.typing import TensorType +from typing import Optional + +torch, nn = try_import_torch() + + +@OldAPIStack +class SkipConnection(nn.Module): + """Skip connection layer. + + Adds the original input to the output (regular residual layer) OR uses + input as hidden state input to a given fan_in_layer. + """ + + def __init__( + self, layer: nn.Module, fan_in_layer: Optional[nn.Module] = None, **kwargs + ): + """Initializes a SkipConnection nn Module object. + + Args: + layer (nn.Module): Any layer processing inputs. 
+ fan_in_layer (Optional[nn.Module]): An optional + layer taking two inputs: The original input and the output + of `layer`. + """ + super().__init__(**kwargs) + self._layer = layer + self._fan_in_layer = fan_in_layer + + def forward(self, inputs: TensorType, **kwargs) -> TensorType: + # del kwargs + outputs = self._layer(inputs, **kwargs) + # Residual case, just add inputs to outputs. + if self._fan_in_layer is None: + outputs = outputs + inputs + # Fan-in e.g. RNN: Call fan-in with `inputs` and `outputs`. + else: + # NOTE: In the GRU case, `inputs` is the state input. + outputs = self._fan_in_layer((inputs, outputs)) + + return outputs diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/offline/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/offline/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1ec50561a0adbcf99b7e44bb331a3f14dc2017c1 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/offline/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/offline/__pycache__/d4rl_reader.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/offline/__pycache__/d4rl_reader.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..45b14dce734bf8f63596dc175b4a48e748938a5a Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/offline/__pycache__/d4rl_reader.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/offline/__pycache__/input_reader.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/offline/__pycache__/input_reader.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c7769c87788d41c1075fc8063ef71dab91397f3a Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/offline/__pycache__/input_reader.cpython-311.pyc differ diff --git 
a/.venv/lib/python3.11/site-packages/ray/rllib/offline/__pycache__/resource.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/offline/__pycache__/resource.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..df7df56ef78ef608fe43ed4c4912a019bfe56a2a Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/offline/__pycache__/resource.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/offline/__pycache__/shuffled_input.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/offline/__pycache__/shuffled_input.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bd269900e67c8d76b975ba8834a9fe7d7d33fd37 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/offline/__pycache__/shuffled_input.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..74131faf3eb6d98c81ea97aa064bc991c01afe96 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/__init__.py @@ -0,0 +1,15 @@ +from ray.rllib.offline.estimators.importance_sampling import ImportanceSampling +from ray.rllib.offline.estimators.weighted_importance_sampling import ( + WeightedImportanceSampling, +) +from ray.rllib.offline.estimators.direct_method import DirectMethod +from ray.rllib.offline.estimators.doubly_robust import DoublyRobust +from ray.rllib.offline.estimators.off_policy_estimator import OffPolicyEstimator + +__all__ = [ + "OffPolicyEstimator", + "ImportanceSampling", + "WeightedImportanceSampling", + "DirectMethod", + "DoublyRobust", +] diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/__pycache__/doubly_robust.cpython-311.pyc 
b/.venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/__pycache__/doubly_robust.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8b9fad3da5c1d272f74dbb85195a700c8e15fe4e Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/__pycache__/doubly_robust.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/__pycache__/feature_importance.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/__pycache__/feature_importance.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9cebb4b9951b3bcb1c53df636016cb1a057e889c Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/__pycache__/feature_importance.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/__pycache__/importance_sampling.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/__pycache__/importance_sampling.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..140e177e18fea63260f0d135c8f74d2205100c43 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/__pycache__/importance_sampling.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/__pycache__/weighted_importance_sampling.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/__pycache__/weighted_importance_sampling.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..905ea71009e637691666e0a28c7d08048492af62 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/__pycache__/weighted_importance_sampling.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/direct_method.py 
b/.venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/direct_method.py new file mode 100644 index 0000000000000000000000000000000000000000..c735b93a5e1b23b5217e8d2f1eec39c58ef4c2c2 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/direct_method.py @@ -0,0 +1,180 @@ +import logging +from typing import Dict, Any, Optional, List +import math +import numpy as np + +from ray.data import Dataset + +from ray.rllib.offline.estimators.off_policy_estimator import OffPolicyEstimator +from ray.rllib.offline.offline_evaluation_utils import compute_q_and_v_values +from ray.rllib.offline.offline_evaluator import OfflineEvaluator +from ray.rllib.offline.estimators.fqe_torch_model import FQETorchModel +from ray.rllib.policy import Policy +from ray.rllib.policy.sample_batch import convert_ma_batch_to_sample_batch +from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.utils.annotations import DeveloperAPI, override +from ray.rllib.utils.typing import SampleBatchType +from ray.rllib.utils.numpy import convert_to_numpy + +logger = logging.getLogger() + + +@DeveloperAPI +class DirectMethod(OffPolicyEstimator): + r"""The Direct Method estimator. + + Let s_t, a_t, and r_t be the state, action, and reward at timestep t. + + This method trains a Q-model for the evaluation policy \pi_e on behavior + data generated by \pi_b. Currently, RLlib implements this using + Fitted-Q Evaluation (FQE). You can also implement your own model + and pass it in as `q_model_config = {"type": your_model_class, **your_kwargs}`. + + This estimator computes the expected return for \pi_e for an episode as: + V^{\pi_e}(s_0) = \sum_{a \in A} \pi_e(a | s_0) Q(s_0, a) + and returns the mean and standard deviation over episodes. 
+ + For more information refer to https://arxiv.org/pdf/1911.06854.pdf""" + + @override(OffPolicyEstimator) + def __init__( + self, + policy: Policy, + gamma: float, + epsilon_greedy: float = 0.0, + q_model_config: Optional[Dict] = None, + ): + """Initializes a Direct Method OPE Estimator. + + Args: + policy: Policy to evaluate. + gamma: Discount factor of the environment. + epsilon_greedy: The probability by which we act acording to a fully random + policy during deployment. With 1-epsilon_greedy we act according the + target policy. + q_model_config: Arguments to specify the Q-model. Must specify + a `type` key pointing to the Q-model class. + This Q-model is trained in the train() method and is used + to compute the state-value estimates for the DirectMethod estimator. + It must implement `train` and `estimate_v`. + TODO (Rohan138): Unify this with RLModule API. + """ + + super().__init__(policy, gamma, epsilon_greedy) + + # Some dummy policies and ones that are not based on a tensor framework + # backend can come without a config or without a framework key. + if hasattr(policy, "config"): + assert ( + policy.config.get("framework", "torch") == "torch" + ), "Framework must be torch to use DirectMethod." + + q_model_config = q_model_config or {} + model_cls = q_model_config.pop("type", FQETorchModel) + self.model = model_cls( + policy=policy, + gamma=gamma, + **q_model_config, + ) + assert hasattr( + self.model, "estimate_v" + ), "self.model must implement `estimate_v`!" 
+ + @override(OffPolicyEstimator) + def estimate_on_single_episode(self, episode: SampleBatch) -> Dict[str, Any]: + estimates_per_epsiode = {} + rewards = episode["rewards"] + + v_behavior = 0.0 + for t in range(episode.count): + v_behavior += rewards[t] * self.gamma**t + + v_target = self._compute_v_target(episode[:1]) + + estimates_per_epsiode["v_behavior"] = v_behavior + estimates_per_epsiode["v_target"] = v_target + + return estimates_per_epsiode + + @override(OffPolicyEstimator) + def estimate_on_single_step_samples( + self, batch: SampleBatch + ) -> Dict[str, List[float]]: + estimates_per_epsiode = {} + rewards = batch["rewards"] + + v_behavior = rewards + v_target = self._compute_v_target(batch) + + estimates_per_epsiode["v_behavior"] = v_behavior + estimates_per_epsiode["v_target"] = v_target + + return estimates_per_epsiode + + def _compute_v_target(self, init_step): + v_target = self.model.estimate_v(init_step) + v_target = convert_to_numpy(v_target) + return v_target + + @override(OffPolicyEstimator) + def train(self, batch: SampleBatchType) -> Dict[str, Any]: + """Trains self.model on the given batch. + + Args: + batch: A SampleBatchType to train on + + Returns: + A dict with key "loss" and value as the mean training loss. + """ + batch = convert_ma_batch_to_sample_batch(batch) + losses = self.model.train(batch) + return {"loss": np.mean(losses)} + + @override(OfflineEvaluator) + def estimate_on_dataset( + self, dataset: Dataset, *, n_parallelism: int = ... + ) -> Dict[str, Any]: + """Calculates the Direct Method estimate on the given dataset. + + Note: This estimate works for only discrete action spaces for now. + + Args: + dataset: Dataset to compute the estimate on. Each record in dataset should + include the following columns: `obs`, `actions`, `action_prob` and + `rewards`. The `obs` on each row shoud be a vector of D dimensions. + n_parallelism: The number of parallel workers to use. 
+ + Returns: + Dictionary with the following keys: + v_target: The estimated value of the target policy. + v_behavior: The estimated value of the behavior policy. + v_gain: The estimated gain of the target policy over the behavior + policy. + v_std: The standard deviation of the estimated value of the target. + """ + # compute v_values + batch_size = max(dataset.count() // n_parallelism, 1) + updated_ds = dataset.map_batches( + compute_q_and_v_values, + batch_size=batch_size, + batch_format="pandas", + fn_kwargs={ + "model_class": self.model.__class__, + "model_state": self.model.get_state(), + "compute_q_values": False, + }, + ) + + v_behavior = updated_ds.mean("rewards") + v_target = updated_ds.mean("v_values") + v_gain_mean = v_target / v_behavior + v_gain_ste = ( + updated_ds.std("v_values") / v_behavior / math.sqrt(dataset.count()) + ) + + return { + "v_behavior": v_behavior, + "v_target": v_target, + "v_gain_mean": v_gain_mean, + "v_gain_ste": v_gain_ste, + } diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/doubly_robust.py b/.venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/doubly_robust.py new file mode 100644 index 0000000000000000000000000000000000000000..d98028023660612e329dc1b555a2ba8151078bc2 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/doubly_robust.py @@ -0,0 +1,253 @@ +import logging +import numpy as np +import math +import pandas as pd + +from typing import Dict, Any, Optional, List + +from ray.data import Dataset + +from ray.rllib.policy import Policy +from ray.rllib.policy.sample_batch import SampleBatch, convert_ma_batch_to_sample_batch +from ray.rllib.utils.annotations import DeveloperAPI, override +from ray.rllib.utils.typing import SampleBatchType +from ray.rllib.utils.numpy import convert_to_numpy + +from ray.rllib.offline.estimators.off_policy_estimator import OffPolicyEstimator +from ray.rllib.offline.estimators.fqe_torch_model import FQETorchModel +from 
ray.rllib.offline.offline_evaluator import OfflineEvaluator +from ray.rllib.offline.offline_evaluation_utils import ( + compute_is_weights, + compute_q_and_v_values, +) + +logger = logging.getLogger() + + +@DeveloperAPI +class DoublyRobust(OffPolicyEstimator): + """The Doubly Robust estimator. + + Let s_t, a_t, and r_t be the state, action, and reward at timestep t. + + This method trains a Q-model for the evaluation policy \pi_e on behavior + data generated by \pi_b. Currently, RLlib implements this using + Fitted-Q Evaluation (FQE). You can also implement your own model + and pass it in as `q_model_config = {"type": your_model_class, **your_kwargs}`. + + For behavior policy \pi_b and evaluation policy \pi_e, define the + cumulative importance ratio at timestep t as: + p_t = \sum_{t'=0}^t (\pi_e(a_{t'} | s_{t'}) / \pi_b(a_{t'} | s_{t'})). + + Consider an episode with length T. Let V_T = 0. + For all t in {0, T - 1}, use the following recursive update: + V_t^DR = (\sum_{a \in A} \pi_e(a | s_t) Q(s_t, a)) + + p_t * (r_t + \gamma * V_{t+1}^DR - Q(s_t, a_t)) + + This estimator computes the expected return for \pi_e for an episode as: + V^{\pi_e}(s_0) = V_0^DR + and returns the mean and standard deviation over episodes. + + For more information refer to https://arxiv.org/pdf/1911.06854.pdf""" + + @override(OffPolicyEstimator) + def __init__( + self, + policy: Policy, + gamma: float, + epsilon_greedy: float = 0.0, + normalize_weights: bool = True, + q_model_config: Optional[Dict] = None, + ): + """Initializes a Doubly Robust OPE Estimator. + + Args: + policy: Policy to evaluate. + gamma: Discount factor of the environment. + epsilon_greedy: The probability by which we act acording to a fully random + policy during deployment. With 1-epsilon_greedy we act + according the target policy. + normalize_weights: If True, the inverse propensity scores are normalized to + their sum across the entire dataset. 
The effect of this is similar to + weighted importance sampling compared to standard importance sampling. + q_model_config: Arguments to specify the Q-model. Must specify + a `type` key pointing to the Q-model class. + This Q-model is trained in the train() method and is used + to compute the state-value and Q-value estimates + for the DoublyRobust estimator. + It must implement `train`, `estimate_q`, and `estimate_v`. + TODO (Rohan138): Unify this with RLModule API. + """ + + super().__init__(policy, gamma, epsilon_greedy) + q_model_config = q_model_config or {} + q_model_config["gamma"] = gamma + + self._model_cls = q_model_config.pop("type", FQETorchModel) + self._model_configs = q_model_config + self._normalize_weights = normalize_weights + + self.model = self._model_cls( + policy=policy, + **q_model_config, + ) + assert hasattr( + self.model, "estimate_v" + ), "self.model must implement `estimate_v`!" + assert hasattr( + self.model, "estimate_q" + ), "self.model must implement `estimate_q`!" 
+ + @override(OffPolicyEstimator) + def estimate_on_single_episode(self, episode: SampleBatch) -> Dict[str, Any]: + estimates_per_epsiode = {} + + rewards, old_prob = episode["rewards"], episode["action_prob"] + new_prob = self.compute_action_probs(episode) + + weight = new_prob / old_prob + + v_behavior = 0.0 + v_target = 0.0 + q_values = self.model.estimate_q(episode) + q_values = convert_to_numpy(q_values) + v_values = self.model.estimate_v(episode) + v_values = convert_to_numpy(v_values) + assert q_values.shape == v_values.shape == (episode.count,) + + for t in reversed(range(episode.count)): + v_behavior = rewards[t] + self.gamma * v_behavior + v_target = v_values[t] + weight[t] * ( + rewards[t] + self.gamma * v_target - q_values[t] + ) + v_target = v_target.item() + + estimates_per_epsiode["v_behavior"] = v_behavior + estimates_per_epsiode["v_target"] = v_target + + return estimates_per_epsiode + + @override(OffPolicyEstimator) + def estimate_on_single_step_samples( + self, batch: SampleBatch + ) -> Dict[str, List[float]]: + estimates_per_epsiode = {} + + rewards, old_prob = batch["rewards"], batch["action_prob"] + new_prob = self.compute_action_probs(batch) + + q_values = self.model.estimate_q(batch) + q_values = convert_to_numpy(q_values) + v_values = self.model.estimate_v(batch) + v_values = convert_to_numpy(v_values) + + v_behavior = rewards + + weight = new_prob / old_prob + v_target = v_values + weight * (rewards - q_values) + + estimates_per_epsiode["v_behavior"] = v_behavior + estimates_per_epsiode["v_target"] = v_target + + return estimates_per_epsiode + + @override(OffPolicyEstimator) + def train(self, batch: SampleBatchType) -> Dict[str, Any]: + """Trains self.model on the given batch. + + Args: + batch: A SampleBatch or MultiAgentbatch to train on + + Returns: + A dict with key "loss" and value as the mean training loss. 
+ """ + batch = convert_ma_batch_to_sample_batch(batch) + losses = self.model.train(batch) + return {"loss": np.mean(losses)} + + @override(OfflineEvaluator) + def estimate_on_dataset( + self, dataset: Dataset, *, n_parallelism: int = ... + ) -> Dict[str, Any]: + """Estimates the policy value using the Doubly Robust estimator. + + The doubly robust estimator uses normalization of importance sampling weights + (aka. propensity ratios) to the average of the importance weights across the + entire dataset. This is done to reduce the variance of the estimate (similar to + weighted importance sampling). You can disable this by setting + `normalize_weights=False` in the constructor. + + Note: This estimate works for only discrete action spaces for now. + + Args: + dataset: Dataset to compute the estimate on. Each record in dataset should + include the following columns: `obs`, `actions`, `action_prob` and + `rewards`. The `obs` on each row shoud be a vector of D dimensions. + n_parallelism: Number of parallelism to use for the computation. + + Returns: + A dict with the following keys: + v_target: The estimated value of the target policy. + v_behavior: The estimated value of the behavior policy. + v_gain: The estimated gain of the target policy over the behavior + policy. + v_std: The standard deviation of the estimated value of the target. 
+ """ + + # step 1: compute the weights and weighted rewards + batch_size = max(dataset.count() // n_parallelism, 1) + updated_ds = dataset.map_batches( + compute_is_weights, + batch_size=batch_size, + batch_format="pandas", + fn_kwargs={ + "policy_state": self.policy.get_state(), + "estimator_class": self.__class__, + }, + ) + + # step 2: compute q_values and v_values + batch_size = max(updated_ds.count() // n_parallelism, 1) + updated_ds = updated_ds.map_batches( + compute_q_and_v_values, + batch_size=batch_size, + batch_format="pandas", + fn_kwargs={ + "model_class": self.model.__class__, + "model_state": self.model.get_state(), + }, + ) + + # step 3: compute the v_target + def compute_v_target(batch: pd.DataFrame, normalizer: float = 1.0): + weights = batch["weights"] / normalizer + batch["v_target"] = batch["v_values"] + weights * ( + batch["rewards"] - batch["q_values"] + ) + batch["v_behavior"] = batch["rewards"] + return batch + + normalizer = updated_ds.mean("weights") if self._normalize_weights else 1.0 + updated_ds = updated_ds.map_batches( + compute_v_target, + batch_size=batch_size, + batch_format="pandas", + fn_kwargs={"normalizer": normalizer}, + ) + + v_behavior = updated_ds.mean("v_behavior") + v_target = updated_ds.mean("v_target") + v_gain_mean = v_target / v_behavior + v_gain_ste = ( + updated_ds.std("v_target") + / normalizer + / v_behavior + / math.sqrt(dataset.count()) + ) + + return { + "v_behavior": v_behavior, + "v_target": v_target, + "v_gain_mean": v_gain_mean, + "v_gain_ste": v_gain_ste, + } diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/fqe_torch_model.py b/.venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/fqe_torch_model.py new file mode 100644 index 0000000000000000000000000000000000000000..f071640a9afd380ad30f549113085f579cd86fa9 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/fqe_torch_model.py @@ -0,0 +1,297 @@ +from typing import Dict, Any +from 
ray.rllib.models.utils import get_initializer +from ray.rllib.policy import Policy + +from ray.rllib.models.catalog import ModelCatalog +from ray.rllib.models.torch.torch_modelv2 import TorchModelV2 +from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.utils.annotations import DeveloperAPI +from ray.rllib.utils.framework import try_import_torch +from ray.rllib.utils.annotations import is_overridden +from ray.rllib.utils.typing import ModelConfigDict, TensorType +from gymnasium.spaces import Discrete + +torch, nn = try_import_torch() + +# TODO: Create a config object for FQE and unify it with the RLModule API + + +@DeveloperAPI +class FQETorchModel: + """Pytorch implementation of the Fitted Q-Evaluation (FQE) model from + https://arxiv.org/abs/1911.06854 + """ + + def __init__( + self, + policy: Policy, + gamma: float, + model_config: ModelConfigDict = None, + n_iters: int = 1, + lr: float = 1e-3, + min_loss_threshold: float = 1e-4, + clip_grad_norm: float = 100.0, + minibatch_size: int = None, + polyak_coef: float = 1.0, + ) -> None: + """ + Args: + policy: Policy to evaluate. + gamma: Discount factor of the environment. + model_config: The ModelConfigDict for self.q_model, defaults to: + { + "fcnet_hiddens": [8, 8], + "fcnet_activation": "relu", + "vf_share_layers": True, + }, + n_iters: Number of gradient steps to run on batch, defaults to 1 + lr: Learning rate for Adam optimizer + min_loss_threshold: Early stopping if mean loss < min_loss_threshold + clip_grad_norm: Clip loss gradients to this maximum value + minibatch_size: Minibatch size for training Q-function; + if None, train on the whole batch + polyak_coef: Polyak averaging factor for target Q-function + """ + self.policy = policy + assert isinstance( + policy.action_space, Discrete + ), f"{self.__class__.__name__} only supports discrete action spaces!" 
+ self.gamma = gamma + self.observation_space = policy.observation_space + self.action_space = policy.action_space + + if model_config is None: + model_config = { + "fcnet_hiddens": [32, 32, 32], + "fcnet_activation": "relu", + "vf_share_layers": True, + } + self.model_config = model_config + + self.device = self.policy.device + self.q_model: TorchModelV2 = ModelCatalog.get_model_v2( + self.observation_space, + self.action_space, + self.action_space.n, + model_config, + framework="torch", + name="TorchQModel", + ).to(self.device) + + self.target_q_model: TorchModelV2 = ModelCatalog.get_model_v2( + self.observation_space, + self.action_space, + self.action_space.n, + model_config, + framework="torch", + name="TargetTorchQModel", + ).to(self.device) + + self.n_iters = n_iters + self.lr = lr + self.min_loss_threshold = min_loss_threshold + self.clip_grad_norm = clip_grad_norm + self.minibatch_size = minibatch_size + self.polyak_coef = polyak_coef + self.optimizer = torch.optim.Adam(self.q_model.variables(), self.lr) + initializer = get_initializer("xavier_uniform", framework="torch") + # Hard update target + self.update_target(polyak_coef=1.0) + + def f(m): + if isinstance(m, nn.Linear): + initializer(m.weight) + + self.initializer = f + + def train(self, batch: SampleBatch) -> TensorType: + """Trains self.q_model using FQE loss on given batch. 
+ + Args: + batch: A SampleBatch of episodes to train on + + Returns: + A list of losses for each training iteration + """ + losses = [] + minibatch_size = self.minibatch_size or batch.count + # Copy batch for shuffling + batch = batch.copy(shallow=True) + for _ in range(self.n_iters): + minibatch_losses = [] + batch.shuffle() + for idx in range(0, batch.count, minibatch_size): + minibatch = batch[idx : idx + minibatch_size] + obs = torch.tensor(minibatch[SampleBatch.OBS], device=self.device) + actions = torch.tensor( + minibatch[SampleBatch.ACTIONS], + device=self.device, + dtype=int, + ) + rewards = torch.tensor( + minibatch[SampleBatch.REWARDS], device=self.device + ) + next_obs = torch.tensor( + minibatch[SampleBatch.NEXT_OBS], device=self.device + ) + dones = torch.tensor( + minibatch[SampleBatch.TERMINATEDS], device=self.device, dtype=float + ) + + # Compute Q-values for current obs + q_values, _ = self.q_model({"obs": obs}, [], None) + q_acts = torch.gather(q_values, -1, actions.unsqueeze(-1)).squeeze(-1) + + next_action_probs = self._compute_action_probs(next_obs) + + # Compute Q-values for next obs + with torch.no_grad(): + next_q_values, _ = self.target_q_model({"obs": next_obs}, [], None) + + # Compute estimated state value next_v = E_{a ~ pi(s)} [Q(next_obs,a)] + next_v = torch.sum(next_q_values * next_action_probs, axis=-1) + targets = rewards + (1 - dones) * self.gamma * next_v + loss = (targets - q_acts) ** 2 + loss = torch.mean(loss) + self.optimizer.zero_grad() + loss.backward() + nn.utils.clip_grad.clip_grad_norm_( + self.q_model.variables(), self.clip_grad_norm + ) + self.optimizer.step() + minibatch_losses.append(loss.item()) + iter_loss = sum(minibatch_losses) / len(minibatch_losses) + losses.append(iter_loss) + if iter_loss < self.min_loss_threshold: + break + self.update_target() + return losses + + def estimate_q(self, batch: SampleBatch) -> TensorType: + obs = torch.tensor(batch[SampleBatch.OBS], device=self.device) + with torch.no_grad(): 
+ q_values, _ = self.q_model({"obs": obs}, [], None) + actions = torch.tensor( + batch[SampleBatch.ACTIONS], device=self.device, dtype=int + ) + q_values = torch.gather(q_values, -1, actions.unsqueeze(-1)).squeeze(-1) + return q_values + + def estimate_v(self, batch: SampleBatch) -> TensorType: + obs = torch.tensor(batch[SampleBatch.OBS], device=self.device) + with torch.no_grad(): + q_values, _ = self.q_model({"obs": obs}, [], None) + # Compute pi(a | s) for each action a in policy.action_space + action_probs = self._compute_action_probs(obs) + v_values = torch.sum(q_values * action_probs, axis=-1) + return v_values + + def update_target(self, polyak_coef=None): + # Update_target will be called periodically to copy Q network to + # target Q network, using (soft) polyak_coef-synching. + polyak_coef = polyak_coef or self.polyak_coef + model_state_dict = self.q_model.state_dict() + # Support partial (soft) synching. + # If polyak_coef == 1.0: Full sync from Q-model to target Q-model. + target_state_dict = self.target_q_model.state_dict() + model_state_dict = { + k: polyak_coef * model_state_dict[k] + (1 - polyak_coef) * v + for k, v in target_state_dict.items() + } + + self.target_q_model.load_state_dict(model_state_dict) + + def _compute_action_probs(self, obs: TensorType) -> TensorType: + """Compute action distribution over the action space. 
+ + Args: + obs: A tensor of observations of shape (batch_size * obs_dim) + + Returns: + action_probs: A tensor of action probabilities + of shape (batch_size * action_dim) + """ + input_dict = {SampleBatch.OBS: obs} + seq_lens = torch.ones(len(obs), device=self.device, dtype=int) + state_batches = [] + if is_overridden(self.policy.action_distribution_fn): + try: + # TorchPolicyV2 function signature + dist_inputs, dist_class, _ = self.policy.action_distribution_fn( + self.policy.model, + obs_batch=input_dict, + state_batches=state_batches, + seq_lens=seq_lens, + explore=False, + is_training=False, + ) + except TypeError: + # TorchPolicyV1 function signature for compatibility with DQN + # TODO: Remove this once DQNTorchPolicy is migrated to PolicyV2 + dist_inputs, dist_class, _ = self.policy.action_distribution_fn( + self.policy, + self.policy.model, + input_dict=input_dict, + state_batches=state_batches, + seq_lens=seq_lens, + explore=False, + is_training=False, + ) + else: + dist_class = self.policy.dist_class + dist_inputs, _ = self.policy.model(input_dict, state_batches, seq_lens) + action_dist = dist_class(dist_inputs, self.policy.model) + assert isinstance( + action_dist.dist, torch.distributions.categorical.Categorical + ), "FQE only supports Categorical or MultiCategorical distributions!" 
+ action_probs = action_dist.dist.probs + return action_probs + + def get_state(self) -> Dict[str, Any]: + """Returns the current state of the FQE Model.""" + return { + "policy_state": self.policy.get_state(), + "model_config": self.model_config, + "n_iters": self.n_iters, + "lr": self.lr, + "min_loss_threshold": self.min_loss_threshold, + "clip_grad_norm": self.clip_grad_norm, + "minibatch_size": self.minibatch_size, + "polyak_coef": self.polyak_coef, + "gamma": self.gamma, + "q_model_state": self.q_model.state_dict(), + "target_q_model_state": self.target_q_model.state_dict(), + } + + def set_state(self, state: Dict[str, Any]) -> None: + """Sets the current state of the FQE Model. + Args: + state: A state dict returned by `get_state()`. + """ + self.n_iters = state["n_iters"] + self.lr = state["lr"] + self.min_loss_threshold = state["min_loss_threshold"] + self.clip_grad_norm = state["clip_grad_norm"] + self.minibatch_size = state["minibatch_size"] + self.polyak_coef = state["polyak_coef"] + self.gamma = state["gamma"] + self.policy.set_state(state["policy_state"]) + self.q_model.load_state_dict(state["q_model_state"]) + self.target_q_model.load_state_dict(state["target_q_model_state"]) + + @classmethod + def from_state(cls, state: Dict[str, Any]) -> "FQETorchModel": + """Creates a FQE Model from a state dict. + + Args: + state: A state dict returned by `get_state`. + + Returns: + An instance of the FQETorchModel. 
+ """ + policy = Policy.from_state(state["policy_state"]) + model = cls( + policy=policy, gamma=state["gamma"], model_config=state["model_config"] + ) + model.set_state(state) + return model diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/importance_sampling.py b/.venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/importance_sampling.py new file mode 100644 index 0000000000000000000000000000000000000000..630859820948b7f1139ab27649f21096ce2f28e9 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/importance_sampling.py @@ -0,0 +1,126 @@ +from typing import Dict, List, Any +import math + +from ray.data import Dataset + +from ray.rllib.utils.annotations import override, DeveloperAPI +from ray.rllib.offline.offline_evaluator import OfflineEvaluator +from ray.rllib.offline.offline_evaluation_utils import ( + remove_time_dim, + compute_is_weights, +) +from ray.rllib.offline.estimators.off_policy_estimator import OffPolicyEstimator +from ray.rllib.policy.sample_batch import SampleBatch + + +@DeveloperAPI +class ImportanceSampling(OffPolicyEstimator): + r"""The step-wise IS estimator. + + Let s_t, a_t, and r_t be the state, action, and reward at timestep t. + + For behavior policy \pi_b and evaluation policy \pi_e, define the + cumulative importance ratio at timestep t as: + p_t = \sum_{t'=0}^t (\pi_e(a_{t'} | s_{t'}) / \pi_b(a_{t'} | s_{t'})). + + This estimator computes the expected return for \pi_e for an episode as: + V^{\pi_e}(s_0) = \sum_t \gamma ^ {t} * p_t * r_t + and returns the mean and standard deviation over episodes. 
+ + For more information refer to https://arxiv.org/pdf/1911.06854.pdf""" + + @override(OffPolicyEstimator) + def estimate_on_single_episode(self, episode: SampleBatch) -> Dict[str, float]: + estimates_per_epsiode = {} + + rewards, old_prob = episode["rewards"], episode["action_prob"] + new_prob = self.compute_action_probs(episode) + + # calculate importance ratios + p = [] + for t in range(episode.count): + if t == 0: + pt_prev = 1.0 + else: + pt_prev = p[t - 1] + p.append(pt_prev * new_prob[t] / old_prob[t]) + + # calculate stepwise IS estimate + v_behavior = 0.0 + v_target = 0.0 + for t in range(episode.count): + v_behavior += rewards[t] * self.gamma**t + v_target += p[t] * rewards[t] * self.gamma**t + + estimates_per_epsiode["v_behavior"] = v_behavior + estimates_per_epsiode["v_target"] = v_target + + return estimates_per_epsiode + + @override(OffPolicyEstimator) + def estimate_on_single_step_samples( + self, batch: SampleBatch + ) -> Dict[str, List[float]]: + estimates_per_epsiode = {} + + rewards, old_prob = batch["rewards"], batch["action_prob"] + new_prob = self.compute_action_probs(batch) + + weights = new_prob / old_prob + v_behavior = rewards + v_target = weights * rewards + + estimates_per_epsiode["v_behavior"] = v_behavior + estimates_per_epsiode["v_target"] = v_target + + return estimates_per_epsiode + + @override(OfflineEvaluator) + def estimate_on_dataset( + self, dataset: Dataset, *, n_parallelism: int = ... + ) -> Dict[str, Any]: + """Computes the Importance sampling estimate on the given dataset. + + Note: This estimate works for both continuous and discrete action spaces. + + Args: + dataset: Dataset to compute the estimate on. Each record in dataset should + include the following columns: `obs`, `actions`, `action_prob` and + `rewards`. The `obs` on each row shoud be a vector of D dimensions. + n_parallelism: The number of parallel workers to use. 
+ + Returns: + A dictionary containing the following keys: + v_target: The estimated value of the target policy. + v_behavior: The estimated value of the behavior policy. + v_gain_mean: The mean of the gain of the target policy over the + behavior policy. + v_gain_ste: The standard error of the gain of the target policy over + the behavior policy. + """ + batch_size = max(dataset.count() // n_parallelism, 1) + dataset = dataset.map_batches( + remove_time_dim, batch_size=batch_size, batch_format="pandas" + ) + updated_ds = dataset.map_batches( + compute_is_weights, + batch_size=batch_size, + batch_format="pandas", + fn_kwargs={ + "policy_state": self.policy.get_state(), + "estimator_class": self.__class__, + }, + ) + v_target = updated_ds.mean("weighted_rewards") + v_behavior = updated_ds.mean("rewards") + v_gain_mean = v_target / v_behavior + v_gain_ste = ( + updated_ds.std("weighted_rewards") / v_behavior / math.sqrt(dataset.count()) + ) + + return { + "v_target": v_target, + "v_behavior": v_behavior, + "v_gain_mean": v_gain_mean, + "v_gain_ste": v_gain_ste, + } diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/off_policy_estimator.py b/.venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/off_policy_estimator.py new file mode 100644 index 0000000000000000000000000000000000000000..0de4f246130ecee290b94e0c4ea5ea5ae6a6d59c --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/offline/estimators/off_policy_estimator.py @@ -0,0 +1,248 @@ +import gymnasium as gym +import numpy as np +import tree +from typing import Dict, Any, List + +import logging +from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.policy import Policy +from ray.rllib.policy.sample_batch import convert_ma_batch_to_sample_batch +from ray.rllib.utils.policy import compute_log_likelihoods_from_input_dict +from ray.rllib.utils.annotations import ( + DeveloperAPI, + ExperimentalAPI, + OverrideToImplementCustomLogic, +) +from 
ray.rllib.utils.deprecation import Deprecated +from ray.rllib.utils.numpy import convert_to_numpy +from ray.rllib.utils.typing import TensorType, SampleBatchType +from ray.rllib.offline.offline_evaluator import OfflineEvaluator + +logger = logging.getLogger(__name__) + + +@DeveloperAPI +class OffPolicyEstimator(OfflineEvaluator): + """Interface for an off policy estimator for counterfactual evaluation.""" + + @DeveloperAPI + def __init__( + self, + policy: Policy, + gamma: float = 0.0, + epsilon_greedy: float = 0.0, + ): + """Initializes an OffPolicyEstimator instance. + + Args: + policy: Policy to evaluate. + gamma: Discount factor of the environment. + epsilon_greedy: The probability by which we act acording to a fully random + policy during deployment. With 1-epsilon_greedy we act according the target + policy. + # TODO (kourosh): convert the input parameters to a config dict. + """ + super().__init__(policy) + self.gamma = gamma + self.epsilon_greedy = epsilon_greedy + + @DeveloperAPI + def estimate_on_single_episode(self, episode: SampleBatch) -> Dict[str, Any]: + """Returns off-policy estimates for the given one episode. + + Args: + batch: The episode to calculate the off-policy estimates (OPE) on. The + episode must be a sample batch type that contains the fields "obs", + "actions", and "action_prob" and it needs to represent a + complete trajectory. + + Returns: + The off-policy estimates (OPE) calculated on the given episode. The returned + dict can be any arbitrary mapping of strings to metrics. + """ + raise NotImplementedError + + @DeveloperAPI + def estimate_on_single_step_samples( + self, + batch: SampleBatch, + ) -> Dict[str, List[float]]: + """Returns off-policy estimates for the batch of single timesteps. This is + highly optimized for bandits assuming each episode is a single timestep. + + Args: + batch: The batch to calculate the off-policy estimates (OPE) on. 
The + batch must be a sample batch type that contains the fields "obs", + "actions", and "action_prob". + + Returns: + The off-policy estimates (OPE) calculated on the given batch of single time + step samples. The returned dict can be any arbitrary mapping of strings to + a list of floats capturing the values per each record. + """ + raise NotImplementedError + + def on_before_split_batch_by_episode( + self, sample_batch: SampleBatch + ) -> SampleBatch: + """Called before the batch is split by episode. You can perform any + preprocessing on the batch that you want here. + e.g. adding done flags to the batch, or reseting some stats that you want to + track per episode later during estimation, .etc. + + Args: + sample_batch: The batch to split by episode. This contains multiple + episodes. + + Returns: + The modified batch before calling split_by_episode(). + """ + return sample_batch + + @OverrideToImplementCustomLogic + def on_after_split_batch_by_episode( + self, all_episodes: List[SampleBatch] + ) -> List[SampleBatch]: + """Called after the batch is split by episode. You can perform any + postprocessing on each episode that you want here. + e.g. computing advantage per episode, .etc. + + Args: + all_episodes: The list of episodes in the original batch. Each element is a + sample batch type that is a single episode. + """ + + return all_episodes + + @OverrideToImplementCustomLogic + def peek_on_single_episode(self, episode: SampleBatch) -> None: + """This is called on each episode before it is passed to + estimate_on_single_episode(). Using this method, you can get a peek at the + entire validation dataset before runnining the estimation. For examlpe if you + need to perform any normalizations of any sorts on the dataset, you can compute + the normalization parameters here. + + Args: + episode: The episode that is split from the original batch. This is a + sample batch type that is a single episode. 
+ """ + pass + + @DeveloperAPI + def estimate( + self, batch: SampleBatchType, split_batch_by_episode: bool = True + ) -> Dict[str, Any]: + """Compute off-policy estimates. + + Args: + batch: The batch to calculate the off-policy estimates (OPE) on. The + batch must contain the fields "obs", "actions", and "action_prob". + split_batch_by_episode: Whether to split the batch by episode. + + Returns: + The off-policy estimates (OPE) calculated on the given batch. The returned + dict can be any arbitrary mapping of strings to metrics. + The dict consists of the following metrics: + - v_behavior: The discounted return averaged over episodes in the batch + - v_behavior_std: The standard deviation corresponding to v_behavior + - v_target: The estimated discounted return for `self.policy`, + averaged over episodes in the batch + - v_target_std: The standard deviation corresponding to v_target + - v_gain: v_target / max(v_behavior, 1e-8) + - v_delta: The difference between v_target and v_behavior. + """ + batch = convert_ma_batch_to_sample_batch(batch) + self.check_action_prob_in_batch(batch) + estimates_per_epsiode = [] + if split_batch_by_episode: + batch = self.on_before_split_batch_by_episode(batch) + all_episodes = batch.split_by_episode() + all_episodes = self.on_after_split_batch_by_episode(all_episodes) + for episode in all_episodes: + assert len(set(episode[SampleBatch.EPS_ID])) == 1, ( + "The episode must contain only one episode id. For some reason " + "the split_by_episode() method could not successfully split " + "the batch by episodes. Each row in the dataset should be " + "one episode. Check your evaluation dataset for errors." 
+ ) + self.peek_on_single_episode(episode) + + for episode in all_episodes: + estimate_step_results = self.estimate_on_single_episode(episode) + estimates_per_epsiode.append(estimate_step_results) + + # turn a list of identical dicts into a dict of lists + estimates_per_epsiode = tree.map_structure( + lambda *x: list(x), *estimates_per_epsiode + ) + else: + # the returned dict is a mapping of strings to a list of floats + estimates_per_epsiode = self.estimate_on_single_step_samples(batch) + + estimates = { + "v_behavior": np.mean(estimates_per_epsiode["v_behavior"]), + "v_behavior_std": np.std(estimates_per_epsiode["v_behavior"]), + "v_target": np.mean(estimates_per_epsiode["v_target"]), + "v_target_std": np.std(estimates_per_epsiode["v_target"]), + } + estimates["v_gain"] = estimates["v_target"] / max(estimates["v_behavior"], 1e-8) + estimates["v_delta"] = estimates["v_target"] - estimates["v_behavior"] + + return estimates + + @DeveloperAPI + def check_action_prob_in_batch(self, batch: SampleBatchType) -> None: + """Checks if we support off policy estimation (OPE) on given batch. + + Args: + batch: The batch to check. + + Raises: + ValueError: In case `action_prob` key is not in batch + """ + + if "action_prob" not in batch: + raise ValueError( + "Off-policy estimation is not possible unless the inputs " + "include action probabilities (i.e., the policy is stochastic " + "and emits the 'action_prob' key). For DQN this means using " + "`exploration_config: {type: 'SoftQ'}`. You can also set " + "`off_policy_estimation_methods: {}` to disable estimation." 
+ ) + + @ExperimentalAPI + def compute_action_probs(self, batch: SampleBatch): + log_likelihoods = compute_log_likelihoods_from_input_dict(self.policy, batch) + new_prob = np.exp(convert_to_numpy(log_likelihoods)) + + if self.epsilon_greedy > 0.0: + if not isinstance(self.policy.action_space, gym.spaces.Discrete): + raise ValueError( + "Evaluation with epsilon-greedy exploration is only supported " + "with discrete action spaces." + ) + eps = self.epsilon_greedy + new_prob = new_prob * (1 - eps) + eps / self.policy.action_space.n + + return new_prob + + @DeveloperAPI + def train(self, batch: SampleBatchType) -> Dict[str, Any]: + """Train a model for Off-Policy Estimation. + + Args: + batch: SampleBatch to train on + + Returns: + Any optional metrics to return from the estimator + """ + return {} + + @Deprecated( + old="OffPolicyEstimator.action_log_likelihood", + new="ray.rllib.utils.policy.compute_log_likelihoods_from_input_dict", + error=True, + ) + def action_log_likelihood(self, batch: SampleBatchType) -> TensorType: + log_likelihoods = compute_log_likelihoods_from_input_dict(self.policy, batch) + return convert_to_numpy(log_likelihoods) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/utils/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..01f8404da2f07ad128cfdb1f7efff6fcebc63e7b --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/utils/__init__.py @@ -0,0 +1,141 @@ +import contextlib +from functools import partial + +from ray.rllib.utils.annotations import override, PublicAPI, DeveloperAPI +from ray.rllib.utils.deprecation import deprecation_warning +from ray.rllib.utils.filter import Filter +from ray.rllib.utils.filter_manager import FilterManager +from ray.rllib.utils.framework import ( + try_import_jax, + try_import_tf, + try_import_tfp, + try_import_torch, +) +from ray.rllib.utils.numpy import ( + sigmoid, + softmax, + relu, + one_hot, + 
fc, + lstm, + SMALL_NUMBER, + LARGE_INTEGER, + MIN_LOG_NN_OUTPUT, + MAX_LOG_NN_OUTPUT, +) +from ray.rllib.utils.schedules import ( + LinearSchedule, + PiecewiseSchedule, + PolynomialSchedule, + ExponentialSchedule, + ConstantSchedule, +) +from ray.rllib.utils.test_utils import ( + check, + check_compute_single_action, + check_train_results, +) +from ray.tune.utils import merge_dicts, deep_update + + +@DeveloperAPI +def add_mixins(base, mixins, reversed=False): + """Returns a new class with mixins applied in priority order.""" + + mixins = list(mixins or []) + + while mixins: + if reversed: + + class new_base(base, mixins.pop()): + pass + + else: + + class new_base(mixins.pop(), base): + pass + + base = new_base + + return base + + +@DeveloperAPI +def force_list(elements=None, to_tuple=False): + """ + Makes sure `elements` is returned as a list, whether `elements` is a single + item, already a list, or a tuple. + + Args: + elements (Optional[any]): The inputs as single item, list, or tuple to + be converted into a list/tuple. If None, returns empty list/tuple. + to_tuple: Whether to use tuple (instead of list). + + Returns: + Union[list,tuple]: All given elements in a list/tuple depending on + `to_tuple`'s value. If elements is None, + returns an empty list/tuple. 
+ """ + ctor = list + if to_tuple is True: + ctor = tuple + return ( + ctor() + if elements is None + else ctor(elements) + if type(elements) in [list, set, tuple] + else ctor([elements]) + ) + + +@DeveloperAPI +class NullContextManager(contextlib.AbstractContextManager): + """No-op context manager""" + + def __init__(self): + pass + + def __enter__(self): + pass + + def __exit__(self, *args): + pass + + +force_tuple = partial(force_list, to_tuple=True) + +__all__ = [ + "add_mixins", + "check", + "check_compute_single_action", + "check_train_results", + "deep_update", + "deprecation_warning", + "fc", + "force_list", + "force_tuple", + "lstm", + "merge_dicts", + "one_hot", + "override", + "relu", + "sigmoid", + "softmax", + "try_import_jax", + "try_import_tf", + "try_import_tfp", + "try_import_torch", + "ConstantSchedule", + "DeveloperAPI", + "ExponentialSchedule", + "Filter", + "FilterManager", + "LARGE_INTEGER", + "LinearSchedule", + "MAX_LOG_NN_OUTPUT", + "MIN_LOG_NN_OUTPUT", + "PiecewiseSchedule", + "PolynomialSchedule", + "PublicAPI", + "SMALL_NUMBER", +] diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/checkpoints.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/checkpoints.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..16a6113923e5e7485700539b5855c8e137d4fc40 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/checkpoints.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/compression.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/compression.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cf3824e39c119e0332a3c8a45cbe069cc9b35ed3 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/compression.cpython-311.pyc differ diff --git 
a/.venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/deprecation.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/deprecation.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f4b8c6bb6d034408d63102dfcbfecdc710c4341c Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/deprecation.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/from_config.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/from_config.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..54b78524e3a32e596d2d2fa82219a28c611c8e83 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/from_config.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/lambda_defaultdict.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/lambda_defaultdict.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..98c7e899f1cb2f49c63c98c467148021f7873f5f Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/lambda_defaultdict.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/memory.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/memory.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c2274fbc35b85a2b14a85579ff1bcd2fc9460432 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/memory.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/serialization.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/serialization.cpython-311.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..0295a6f3b052269b3eb51ab7168996447a1b8871 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/serialization.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/torch_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/torch_utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b7dcee799cb3463e244dbe77135932393a476342 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/utils/__pycache__/torch_utils.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/utils/actors.py b/.venv/lib/python3.11/site-packages/ray/rllib/utils/actors.py new file mode 100644 index 0000000000000000000000000000000000000000..d56dcdbd773f920e6b922eeeceb71e4eb663d68c --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/utils/actors.py @@ -0,0 +1,258 @@ +from collections import defaultdict, deque +import logging +import platform +from typing import Any, Dict, List, Optional, Sequence, Tuple, Type + +import ray +from ray.actor import ActorClass, ActorHandle + +logger = logging.getLogger(__name__) + + +class TaskPool: + """Helper class for tracking the status of many in-flight actor tasks.""" + + def __init__(self): + self._tasks = {} + self._objects = {} + self._fetching = deque() + + def add(self, worker, all_obj_refs): + if isinstance(all_obj_refs, list): + obj_ref = all_obj_refs[0] + else: + obj_ref = all_obj_refs + self._tasks[obj_ref] = worker + self._objects[obj_ref] = all_obj_refs + + def completed(self, blocking_wait=False): + pending = list(self._tasks) + if pending: + ready, _ = ray.wait(pending, num_returns=len(pending), timeout=0) + if not ready and blocking_wait: + ready, _ = ray.wait(pending, num_returns=1, timeout=10.0) + for obj_ref in ready: + yield (self._tasks.pop(obj_ref), self._objects.pop(obj_ref)) + + def 
completed_prefetch(self, blocking_wait=False, max_yield=999): + """Similar to completed but only returns once the object is local. + + Assumes obj_ref only is one id.""" + + for worker, obj_ref in self.completed(blocking_wait=blocking_wait): + self._fetching.append((worker, obj_ref)) + + for _ in range(max_yield): + if not self._fetching: + break + + yield self._fetching.popleft() + + def reset_workers(self, workers): + """Notify that some workers may be removed.""" + for obj_ref, ev in self._tasks.copy().items(): + if ev not in workers: + del self._tasks[obj_ref] + del self._objects[obj_ref] + + # We want to keep the same deque reference so that we don't suffer from + # stale references in generators that are still in flight + for _ in range(len(self._fetching)): + ev, obj_ref = self._fetching.popleft() + if ev in workers: + # Re-queue items that are still valid + self._fetching.append((ev, obj_ref)) + + @property + def count(self): + return len(self._tasks) + + +def create_colocated_actors( + actor_specs: Sequence[Tuple[Type, Any, Any, int]], + node: Optional[str] = "localhost", + max_attempts: int = 10, +) -> Dict[Type, List[ActorHandle]]: + """Create co-located actors of any type(s) on any node. + + Args: + actor_specs: Tuple/list with tuples consisting of: 1) The + (already @ray.remote) class(es) to construct, 2) c'tor args, + 3) c'tor kwargs, and 4) the number of actors of that class with + given args/kwargs to construct. + node: The node to co-locate the actors on. By default ("localhost"), + place the actors on the node the caller of this function is + located on. Use None for indicating that any (resource fulfilling) + node in the cluster may be used. + max_attempts: The maximum number of co-location attempts to + perform before throwing an error. + + Returns: + A dict mapping the created types to the list of n ActorHandles + created (and co-located) for that type. 
+ """ + if node == "localhost": + node = platform.node() + + # Maps each entry in `actor_specs` to lists of already co-located actors. + ok = [[] for _ in range(len(actor_specs))] + + # Try n times to co-locate all given actor types (`actor_specs`). + # With each (failed) attempt, increase the number of actors we try to + # create (on the same node), then kill the ones that have been created in + # excess. + for attempt in range(max_attempts): + # If any attempt to co-locate fails, set this to False and we'll do + # another attempt. + all_good = True + # Process all `actor_specs` in sequence. + for i, (typ, args, kwargs, count) in enumerate(actor_specs): + args = args or [] # Allow None. + kwargs = kwargs or {} # Allow None. + # We don't have enough actors yet of this spec co-located on + # the desired node. + if len(ok[i]) < count: + co_located = try_create_colocated( + cls=typ, + args=args, + kwargs=kwargs, + count=count * (attempt + 1), + node=node, + ) + # If node did not matter (None), from here on, use the host + # that the first actor(s) are already co-located on. + if node is None: + node = ray.get(co_located[0].get_host.remote()) + # Add the newly co-located actors to the `ok` list. + ok[i].extend(co_located) + # If we still don't have enough -> We'll have to do another + # attempt. + if len(ok[i]) < count: + all_good = False + # We created too many actors for this spec -> Kill/truncate + # the excess ones. + if len(ok[i]) > count: + for a in ok[i][count:]: + a.__ray_terminate__.remote() + ok[i] = ok[i][:count] + + # All `actor_specs` have been fulfilled, return lists of + # co-located actors. + if all_good: + return ok + + raise Exception("Unable to create enough colocated actors -> aborting.") + + +def try_create_colocated( + cls: Type[ActorClass], + args: List[Any], + count: int, + kwargs: Optional[List[Any]] = None, + node: Optional[str] = "localhost", +) -> List[ActorHandle]: + """Tries to co-locate (same node) a set of Actors of the same type. 
+ + Returns a list of successfully co-located actors. All actors that could + not be co-located (with the others on the given node) will not be in this + list. + + Creates each actor via it's remote() constructor and then checks, whether + it has been co-located (on the same node) with the other (already created) + ones. If not, terminates the just created actor. + + Args: + cls: The Actor class to use (already @ray.remote "converted"). + args: List of args to pass to the Actor's constructor. One item + per to-be-created actor (`count`). + count: Number of actors of the given `cls` to construct. + kwargs: Optional list of kwargs to pass to the Actor's constructor. + One item per to-be-created actor (`count`). + node: The node to co-locate the actors on. By default ("localhost"), + place the actors on the node the caller of this function is + located on. If None, will try to co-locate all actors on + any available node. + + Returns: + List containing all successfully co-located actor handles. + """ + if node == "localhost": + node = platform.node() + + kwargs = kwargs or {} + actors = [cls.remote(*args, **kwargs) for _ in range(count)] + co_located, non_co_located = split_colocated(actors, node=node) + logger.info("Got {} colocated actors of {}".format(len(co_located), count)) + for a in non_co_located: + a.__ray_terminate__.remote() + return co_located + + +def split_colocated( + actors: List[ActorHandle], + node: Optional[str] = "localhost", +) -> Tuple[List[ActorHandle], List[ActorHandle]]: + """Splits up given actors into colocated (on same node) and non colocated. + + The co-location criterion depends on the `node` given: + If given (or default: platform.node()): Consider all actors that are on + that node "colocated". + If None: Consider the largest sub-set of actors that are all located on + the same node (whatever that node is) as "colocated". + + Args: + actors: The list of actor handles to split into "colocated" and + "non colocated". 
+ node: The node defining "colocation" criterion. If provided, consider + thos actors "colocated" that sit on this node. If None, use the + largest subset within `actors` that are sitting on the same + (any) node. + + Returns: + Tuple of two lists: 1) Co-located ActorHandles, 2) non co-located + ActorHandles. + """ + if node == "localhost": + node = platform.node() + + # Get nodes of all created actors. + hosts = ray.get([a.get_host.remote() for a in actors]) + + # If `node` not provided, use the largest group of actors that sit on the + # same node, regardless of what that node is. + if node is None: + node_groups = defaultdict(set) + for host, actor in zip(hosts, actors): + node_groups[host].add(actor) + max_ = -1 + largest_group = None + for host in node_groups: + if max_ < len(node_groups[host]): + max_ = len(node_groups[host]) + largest_group = host + non_co_located = [] + for host in node_groups: + if host != largest_group: + non_co_located.extend(list(node_groups[host])) + return list(node_groups[largest_group]), non_co_located + # Node provided (or default: localhost): Consider those actors "colocated" + # that were placed on `node`. + else: + # Split into co-located (on `node) and non-co-located (not on `node`). + co_located = [] + non_co_located = [] + for host, a in zip(hosts, actors): + # This actor has been placed on the correct node. + if host == node: + co_located.append(a) + # This actor has been placed on a different node. 
+ else: + non_co_located.append(a) + return co_located, non_co_located + + +def drop_colocated(actors: List[ActorHandle]) -> List[ActorHandle]: + colocated, non_colocated = split_colocated(actors) + for a in colocated: + a.__ray_terminate__.remote() + return non_colocated diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/utils/annotations.py b/.venv/lib/python3.11/site-packages/ray/rllib/utils/annotations.py new file mode 100644 index 0000000000000000000000000000000000000000..6824412b354f1f18df9d7a663e99471835680994 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/utils/annotations.py @@ -0,0 +1,213 @@ +from ray.rllib.utils.deprecation import Deprecated +from ray.util.annotations import _mark_annotated + + +def override(parent_cls): + """Decorator for documenting method overrides. + + Args: + parent_cls: The superclass that provides the overridden method. If + `parent_class` does not actually have the method or the class, in which + method is defined is not a subclass of `parent_class`, an error is raised. + + .. testcode:: + :skipif: True + + from ray.rllib.policy import Policy + class TorchPolicy(Policy): + ... + # Indicates that `TorchPolicy.loss()` overrides the parent + # Policy class' own `loss method. Leads to an error if Policy + # does not have a `loss` method. + + @override(Policy) + def loss(self, model, action_dist, train_batch): + ... + + """ + + class OverrideCheck: + def __init__(self, func, expected_parent_cls): + self.func = func + self.expected_parent_cls = expected_parent_cls + + def __set_name__(self, owner, name): + # Check if the owner (the class) is a subclass of the expected base class + if not issubclass(owner, self.expected_parent_cls): + raise TypeError( + f"When using the @override decorator, {owner.__name__} must be a " + f"subclass of {parent_cls.__name__}!" + ) + # Set the function as a regular method on the class. 
+ setattr(owner, name, self.func) + + def decorator(method): + # Check, whether `method` is actually defined by the parent class. + if method.__name__ not in dir(parent_cls): + raise NameError( + f"When using the @override decorator, {method.__name__} must override " + f"the respective method (with the same name) of {parent_cls.__name__}!" + ) + + # Check if the class is a subclass of the expected base class + OverrideCheck(method, parent_cls) + return method + + return decorator + + +def PublicAPI(obj): + """Decorator for documenting public APIs. + + Public APIs are classes and methods exposed to end users of RLlib. You + can expect these APIs to remain stable across RLlib releases. + + Subclasses that inherit from a ``@PublicAPI`` base class can be + assumed part of the RLlib public API as well (e.g., all Algorithm classes + are in public API because Algorithm is ``@PublicAPI``). + + In addition, you can assume all algo configurations are part of their + public API as well. + + .. testcode:: + :skipif: True + + # Indicates that the `Algorithm` class is exposed to end users + # of RLlib and will remain stable across RLlib releases. + from ray import tune + @PublicAPI + class Algorithm(tune.Trainable): + ... + """ + + _mark_annotated(obj) + return obj + + +def DeveloperAPI(obj): + """Decorator for documenting developer APIs. + + Developer APIs are classes and methods explicitly exposed to developers + for the purposes of building custom algorithms or advanced training + strategies on top of RLlib internals. You can generally expect these APIs + to be stable sans minor changes (but less stable than public APIs). + + Subclasses that inherit from a ``@DeveloperAPI`` base class can be + assumed part of the RLlib developer API as well. + + .. testcode:: + :skipif: True + + # Indicates that the `TorchPolicy` class is exposed to end users + # of RLlib and will remain (relatively) stable across RLlib + # releases. 
+ from ray.rllib.policy import Policy + @DeveloperAPI + class TorchPolicy(Policy): + ... + """ + + _mark_annotated(obj) + return obj + + +def ExperimentalAPI(obj): + """Decorator for documenting experimental APIs. + + Experimental APIs are classes and methods that are in development and may + change at any time in their development process. You should not expect + these APIs to be stable until their tag is changed to `DeveloperAPI` or + `PublicAPI`. + + Subclasses that inherit from a ``@ExperimentalAPI`` base class can be + assumed experimental as well. + + .. testcode:: + :skipif: True + + from ray.rllib.policy import Policy + class TorchPolicy(Policy): + ... + # Indicates that the `TorchPolicy.loss` method is a new and + # experimental API and may change frequently in future + # releases. + @ExperimentalAPI + def loss(self, model, action_dist, train_batch): + ... + """ + + _mark_annotated(obj) + return obj + + +def OldAPIStack(obj): + """Decorator for classes/methods/functions belonging to the old API stack. + + These should be deprecated at some point after Ray 3.0 (RLlib GA). + It is recommended for users to start exploring (and coding against) the new API + stack instead. + """ + # No effect yet. + + _mark_annotated(obj) + return obj + + +def OverrideToImplementCustomLogic(obj): + """Users should override this in their sub-classes to implement custom logic. + + Used in Algorithm and Policy to tag methods that need overriding, e.g. + `Policy.loss()`. + + .. testcode:: + :skipif: True + + from ray.rllib.policy.torch_policy import TorchPolicy + @overrides(TorchPolicy) + @OverrideToImplementCustomLogic + def loss(self, ...): + # implement custom loss function here ... + # ... w/o calling the corresponding `super().loss()` method. + ... + + """ + obj.__is_overridden__ = False + return obj + + +def OverrideToImplementCustomLogic_CallToSuperRecommended(obj): + """Users should override this in their sub-classes to implement custom logic. 
+ + Thereby, it is recommended (but not required) to call the super-class' + corresponding method. + + Used in Algorithm and Policy to tag methods that need overriding, but the + super class' method should still be called, e.g. + `Algorithm.setup()`. + + .. testcode:: + :skipif: True + + from ray import tune + @overrides(tune.Trainable) + @OverrideToImplementCustomLogic_CallToSuperRecommended + def setup(self, config): + # implement custom setup logic here ... + super().setup(config) + # ... or here (after having called super()'s setup method. + """ + obj.__is_overridden__ = False + return obj + + +def is_overridden(obj): + """Check whether a function has been overridden. + + Note, this only works for API calls decorated with OverrideToImplementCustomLogic + or OverrideToImplementCustomLogic_CallToSuperRecommended. + """ + return getattr(obj, "__is_overridden__", True) + + +# Backward compatibility. +Deprecated = Deprecated diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/utils/checkpoints.py b/.venv/lib/python3.11/site-packages/ray/rllib/utils/checkpoints.py new file mode 100644 index 0000000000000000000000000000000000000000..1c8e9531fc34afbc57675f6d6f61bd137f9a5436 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/utils/checkpoints.py @@ -0,0 +1,1045 @@ +import abc +import inspect +import json +import logging +import os +from packaging import version +import pathlib +import re +import tempfile +from types import MappingProxyType +from typing import Any, Collection, Dict, List, Optional, Tuple, Union + +import pyarrow.fs + +import ray +import ray.cloudpickle as pickle +from ray.rllib.core import ( + COMPONENT_LEARNER, + COMPONENT_LEARNER_GROUP, + COMPONENT_RL_MODULE, +) +from ray.rllib.utils import force_list +from ray.rllib.utils.actor_manager import FaultTolerantActorManager +from ray.rllib.utils.annotations import ( + OldAPIStack, + OverrideToImplementCustomLogic_CallToSuperRecommended, +) +from ray.rllib.utils.serialization import 
NOT_SERIALIZABLE, serialize_type +from ray.rllib.utils.typing import StateDict +from ray.train import Checkpoint +from ray.tune.utils.file_transfer import sync_dir_between_nodes +from ray.util import log_once +from ray.util.annotations import PublicAPI + +logger = logging.getLogger(__name__) + +# The current checkpoint version used by RLlib for Algorithm and Policy checkpoints. +# History: +# 0.1: Ray 2.0.0 +# A single `checkpoint-[iter num]` file for Algorithm checkpoints +# within the checkpoint directory. Policy checkpoints not supported across all +# DL frameworks. + +# 1.0: Ray >=2.1.0 +# An algorithm_state.pkl file for the state of the Algorithm (excluding +# individual policy states). +# One sub-dir inside the "policies" sub-dir for each policy with a +# dedicated policy_state.pkl in it for the policy state. + +# 1.1: Same as 1.0, but has a new "format" field in the rllib_checkpoint.json file +# indicating, whether the checkpoint is `cloudpickle` (default) or `msgpack`. + +# 1.2: Introduces the checkpoint for the new Learner API if the Learner API is enabled. + +# 2.0: Introduces the Checkpointable API for all components on the new API stack +# (if the Learner-, RLModule, EnvRunner, and ConnectorV2 APIs are enabled). + +CHECKPOINT_VERSION = version.Version("1.1") +CHECKPOINT_VERSION_LEARNER_AND_ENV_RUNNER = version.Version("2.1") + + +@PublicAPI(stability="alpha") +class Checkpointable(abc.ABC): + """Abstract base class for a component of RLlib that can be checkpointed to disk. + + Subclasses must implement the following APIs: + - save_to_path() + - restore_from_path() + - from_checkpoint() + - get_state() + - set_state() + - get_ctor_args_and_kwargs() + - get_metadata() + - get_checkpointable_components() + """ + + # The state file for the implementing class. 
+ # This file contains any state information that does NOT belong to any subcomponent + # of the implementing class (which are `Checkpointable` themselves and thus should + # have their own state- and metadata files). + # After a `save_to_path([path])` this file can be found directly in: `path/`. + STATE_FILE_NAME = "state" + + # The filename of the pickle file that contains the class information of the + # Checkpointable as well as all constructor args to be passed to such a class in + # order to construct a new instance. + CLASS_AND_CTOR_ARGS_FILE_NAME = "class_and_ctor_args.pkl" + + # Subclasses may set this to their own metadata filename. + # The dict returned by self.get_metadata() is stored in this JSON file. + METADATA_FILE_NAME = "metadata.json" + + def save_to_path( + self, + path: Optional[Union[str, pathlib.Path]] = None, + *, + state: Optional[StateDict] = None, + filesystem: Optional["pyarrow.fs.FileSystem"] = None, + use_msgpack: bool = False, + ) -> str: + """Saves the state of the implementing class (or `state`) to `path`. + + The state of the implementing class is always saved in the following format: + + .. testcode:: + :skipif: True + + path/ + [component1]/ + [component1 subcomponentA]/ + ... + [component1 subcomponentB]/ + ... + [component2]/ + ... + [cls.METADATA_FILE_NAME] (json) + [cls.STATE_FILE_NAME] (pkl|msgpack) + + The main logic is to loop through all subcomponents of this Checkpointable + and call their respective `save_to_path` methods. Then save the remaining + (non subcomponent) state to this Checkpointable's STATE_FILE_NAME. + In the exception that a component is a FaultTolerantActorManager instance, + instead of calling `save_to_path` directly on that manager, the first healthy + actor is interpreted as the component and its `save_to_path` method is called. + Even if that actor is located on another node, the created file is automatically + synced to the local node. 
+ + Args: + path: The path to the directory to save the state of the implementing class + to. If `path` doesn't exist or is None, then a new directory will be + created (and returned). + state: An optional state dict to be used instead of getting a new state of + the implementing class through `self.get_state()`. + filesystem: PyArrow FileSystem to use to access data at the `path`. + If not specified, this is inferred from the URI scheme of `path`. + use_msgpack: Whether the state file should be written using msgpack and + msgpack_numpy (file extension is `.msgpack`), rather than pickle (file + extension is `.pkl`). + + Returns: + The path (str) where the state has been saved. + """ + + # If no path is given create a local temporary directory. + if path is None: + import uuid + + # Get the location of the temporary directory on the OS. + tmp_dir = pathlib.Path(tempfile.gettempdir()) + # Create a random directory name. + random_dir_name = str(uuid.uuid4()) + # Create the path, but do not craet the directory on the + # filesystem, yet. This is done by `PyArrow`. + path = path or tmp_dir / random_dir_name + + # We need a string path for `pyarrow.fs.FileSystem.from_uri`. + path = path if isinstance(path, str) else path.as_posix() + + # If we have no filesystem, figure it out. + if path and not filesystem: + # Note the path needs to be a path that is relative to the + # filesystem (e.g. `gs://tmp/...` -> `tmp/...`). + filesystem, path = pyarrow.fs.FileSystem.from_uri(path) + + # Make sure, path exists. + filesystem.create_dir(path, recursive=True) + + # Convert to `pathlib.Path` for easy handling. + path = pathlib.Path(path) + + # Write metadata file to disk. 
+ metadata = self.get_metadata() + if "checkpoint_version" not in metadata: + metadata["checkpoint_version"] = str( + CHECKPOINT_VERSION_LEARNER_AND_ENV_RUNNER + ) + with filesystem.open_output_stream( + (path / self.METADATA_FILE_NAME).as_posix() + ) as f: + f.write(json.dumps(metadata).encode("utf-8")) + + # Write the class and constructor args information to disk. Always use pickle + # for this, because this information contains classes and maybe other + # non-serializable data. + with filesystem.open_output_stream( + (path / self.CLASS_AND_CTOR_ARGS_FILE_NAME).as_posix() + ) as f: + pickle.dump( + { + "class": type(self), + "ctor_args_and_kwargs": self.get_ctor_args_and_kwargs(), + }, + f, + ) + + # Get the entire state of this Checkpointable, or use provided `state`. + _state_provided = state is not None + state = state or self.get_state( + not_components=[c[0] for c in self.get_checkpointable_components()] + ) + + # Write components of `self` that themselves are `Checkpointable`. + for comp_name, comp in self.get_checkpointable_components(): + # If subcomponent's name is not in `state`, ignore it and don't write this + # subcomponent's state to disk. + if _state_provided and comp_name not in state: + continue + comp_path = path / comp_name + + # If component is an ActorManager, save the manager's first healthy + # actor's state to disk (even if it's on another node, in which case, we'll + # sync the generated file(s) back to this node). + if isinstance(comp, FaultTolerantActorManager): + actor_to_use = comp.healthy_actor_ids()[0] + + def _get_ip(_=None): + import ray + + return ray.util.get_node_ip_address() + + _result = next( + iter( + comp.foreach_actor( + _get_ip, + remote_actor_ids=[actor_to_use], + ) + ) + ) + if not _result.ok: + raise _result.get() + worker_ip_addr = _result.get() + self_ip_addr = _get_ip() + + # Save the state to a temporary location on the `actor_to_use`'s + # node. 
+ comp_state_ref = None + if _state_provided: + comp_state_ref = ray.put(state.pop(comp_name)) + + if worker_ip_addr == self_ip_addr: + comp.foreach_actor( + lambda w, _path=comp_path, _state=comp_state_ref, _use_msgpack=use_msgpack: ( # noqa + w.save_to_path( + _path, + state=( + ray.get(_state) + if _state is not None + else w.get_state() + ), + use_msgpack=_use_msgpack, + ) + ), + remote_actor_ids=[actor_to_use], + ) + else: + # Save the checkpoint to the temporary directory on the worker. + def _save(w, _state=comp_state_ref, _use_msgpack=use_msgpack): + import tempfile + + # Create a temporary directory on the worker. + tmpdir = tempfile.mkdtemp() + w.save_to_path( + tmpdir, + state=( + ray.get(_state) if _state is not None else w.get_state() + ), + use_msgpack=_use_msgpack, + ) + return tmpdir + + _result = next( + iter(comp.foreach_actor(_save, remote_actor_ids=[actor_to_use])) + ) + if not _result.ok: + raise _result.get() + worker_temp_dir = _result.get() + + # Sync the temporary directory from the worker to this node. + sync_dir_between_nodes( + worker_ip_addr, + worker_temp_dir, + self_ip_addr, + str(comp_path), + ) + + # Remove the temporary directory on the worker. + def _rmdir(_, _dir=worker_temp_dir): + import shutil + + shutil.rmtree(_dir) + + comp.foreach_actor(_rmdir, remote_actor_ids=[actor_to_use]) + + # Local component (instance stored in a property of `self`). + else: + if _state_provided: + comp_state = state.pop(comp_name) + else: + comp_state = self.get_state(components=comp_name)[comp_name] + # By providing the `state` arg, we make sure that the component does not + # have to call its own `get_state()` anymore, but uses what's provided + # here. + comp.save_to_path( + comp_path, + filesystem=filesystem, + state=comp_state, + use_msgpack=use_msgpack, + ) + + # Write all the remaining state to disk. 
+ filename = path / ( + self.STATE_FILE_NAME + (".msgpack" if use_msgpack else ".pkl") + ) + with filesystem.open_output_stream(filename.as_posix()) as f: + if use_msgpack: + msgpack = try_import_msgpack(error=True) + msgpack.dump(state, f) + else: + pickle.dump(state, f) + + return str(path) + + def restore_from_path( + self, + path: Union[str, pathlib.Path], + *, + component: Optional[str] = None, + filesystem: Optional["pyarrow.fs.FileSystem"] = None, + **kwargs, + ) -> None: + """Restores the state of the implementing class from the given path. + + If the `component` arg is provided, `path` refers to a checkpoint of a + subcomponent of `self`, thus allowing the user to load only the subcomponent's + state into `self` without affecting any of the other state information (for + example, loading only the NN state into a Checkpointable, which contains such + an NN, but also has other state information that should NOT be changed by + calling this method). + + The given `path` should have the following structure and contain the following + files: + + .. testcode:: + :skipif: True + + path/ + [component1]/ + [component1 subcomponentA]/ + ... + [component1 subcomponentB]/ + ... + [component2]/ + ... + [cls.METADATA_FILE_NAME] (json) + [cls.STATE_FILE_NAME] (pkl|msgpack) + + Note that the self.METADATA_FILE_NAME file is not required to restore the state. + + Args: + path: The path to load the implementing class' state from or to load the + state of only one subcomponent's state of the implementing class (if + `component` is provided). + component: If provided, `path` is interpreted as the checkpoint path of only + the subcomponent and thus, only that subcomponent's state is + restored/loaded. All other state of `self` remains unchanged in this + case. + filesystem: PyArrow FileSystem to use to access data at the `path`. If not + specified, this is inferred from the URI scheme of `path`. + **kwargs: Forward compatibility kwargs. 
+ """ + path = path if isinstance(path, str) else path.as_posix() + + if path and not filesystem: + # Note the path needs to be a path that is relative to the + # filesystem (e.g. `gs://tmp/...` -> `tmp/...`). + filesystem, path = pyarrow.fs.FileSystem.from_uri(path) + # Only here convert to a `Path` instance b/c otherwise + # cloud path gets broken (i.e. 'gs://' -> 'gs:/'). + path = pathlib.Path(path) + + if not _exists_at_fs_path(filesystem, path.as_posix()): + raise FileNotFoundError(f"`path` ({path}) not found!") + + # Restore components of `self` that themselves are `Checkpointable`. + orig_comp_names = {c[0] for c in self.get_checkpointable_components()} + self._restore_all_subcomponents_from_path( + path, filesystem, component=component, **kwargs + ) + + # Restore the "base" state (not individual subcomponents). + if component is None: + filename = path / self.STATE_FILE_NAME + if filename.with_suffix(".msgpack").is_file(): + msgpack = try_import_msgpack(error=True) + with filesystem.open_input_stream( + filename.with_suffix(".msgpack").as_posix() + ) as f: + state = msgpack.load(f, strict_map_key=False) + else: + with filesystem.open_input_stream( + filename.with_suffix(".pkl").as_posix() + ) as f: + state = pickle.load(f) + self.set_state(state) + + new_comp_names = {c[0] for c in self.get_checkpointable_components()} + diff_comp_names = new_comp_names - orig_comp_names + if diff_comp_names: + self._restore_all_subcomponents_from_path( + path, filesystem, only_comp_names=diff_comp_names, **kwargs + ) + + @classmethod + def from_checkpoint( + cls, + path: Union[str, pathlib.Path], + filesystem: Optional["pyarrow.fs.FileSystem"] = None, + **kwargs, + ) -> "Checkpointable": + """Creates a new Checkpointable instance from the given location and returns it. + + Args: + path: The checkpoint path to load (a) the information on how to construct + a new instance of the implementing class and (b) the state to restore + the created instance to. 
+ filesystem: PyArrow FileSystem to use to access data at the `path`. If not + specified, this is inferred from the URI scheme of `path`. + kwargs: Forward compatibility kwargs. Note that these kwargs are sent to + each subcomponent's `from_checkpoint()` call. + + Returns: + A new instance of the implementing class, already set to the state stored + under `path`. + """ + # We need a string path for the `PyArrow` filesystem. + path = path if isinstance(path, str) else path.as_posix() + + # If no filesystem is passed in create one. + if path and not filesystem: + # Note the path needs to be a path that is relative to the + # filesystem (e.g. `gs://tmp/...` -> `tmp/...`). + filesystem, path = pyarrow.fs.FileSystem.from_uri(path) + # Only here convert to a `Path` instance b/c otherwise + # cloud path gets broken (i.e. 'gs://' -> 'gs:/'). + path = pathlib.Path(path) + + # Get the class constructor to call and its args/kwargs. + # Try reading the pickle file first. + try: + with filesystem.open_input_stream( + (path / cls.CLASS_AND_CTOR_ARGS_FILE_NAME).as_posix() + ) as f: + ctor_info = pickle.load(f) + ctor = ctor_info["class"] + ctor_args = force_list(ctor_info["ctor_args_and_kwargs"][0]) + ctor_kwargs = ctor_info["ctor_args_and_kwargs"][1] + + # Inspect the ctor to see, which arguments in ctor_info should be replaced + # with the user provided **kwargs. + for i, (param_name, param) in enumerate( + inspect.signature(ctor).parameters.items() + ): + if param_name in kwargs: + val = kwargs.pop(param_name) + if ( + param.kind == inspect._ParameterKind.POSITIONAL_OR_KEYWORD + and len(ctor_args) > i + ): + ctor_args[i] = val + else: + ctor_kwargs[param_name] = val + + # If the pickle file is from another python version, use provided + # args instead. + except Exception: + # Use class that this method was called on. + ctor = cls + # Use only user provided **kwargs. + ctor_args = [] + ctor_kwargs = kwargs + + # Check, whether the constructor actually goes together with `cls`. 
+ if not issubclass(ctor, cls): + raise ValueError( + f"The class ({ctor}) stored in checkpoint ({path}) does not seem to be " + f"a subclass of `cls` ({cls})!" + ) + elif not issubclass(ctor, Checkpointable): + raise ValueError( + f"The class ({ctor}) stored in checkpoint ({path}) does not seem to be " + "an implementer of the `Checkpointable` API!" + ) + + # Construct the initial object (without any particular state). + obj = ctor(*ctor_args, **ctor_kwargs) + # Restore the state of the constructed object. + obj.restore_from_path(path, filesystem=filesystem, **kwargs) + # Return the new object. + return obj + + @abc.abstractmethod + def get_state( + self, + components: Optional[Union[str, Collection[str]]] = None, + *, + not_components: Optional[Union[str, Collection[str]]] = None, + **kwargs, + ) -> StateDict: + """Returns the implementing class's current state as a dict. + + The returned dict must only contain msgpack-serializable data if you want to + use the `AlgorithmConfig._msgpack_checkpoints` option. Consider returning your + non msgpack-serializable data from the `Checkpointable.get_ctor_args_and_kwargs` + method, instead. + + Args: + components: An optional collection of string keys to be included in the + returned state. This might be useful, if getting certain components + of the state is expensive (e.g. reading/compiling the weights of a large + NN) and at the same time, these components are not required by the + caller. + not_components: An optional list of string keys to be excluded in the + returned state, even if the same string is part of `components`. + This is useful to get the complete state of the class, except + one or a few components. + kwargs: Forward-compatibility kwargs. + + Returns: + The current state of the implementing class (or only the `components` + specified, w/o those in `not_components`). 
+ """ + + @abc.abstractmethod + def set_state(self, state: StateDict) -> None: + """Sets the implementing class' state to the given state dict. + + If component keys are missing in `state`, these components of the implementing + class will not be updated/set. + + Args: + state: The state dict to restore the state from. Maps component keys + to the corresponding subcomponent's own state. + """ + + @abc.abstractmethod + def get_ctor_args_and_kwargs(self) -> Tuple[Tuple, Dict[str, Any]]: + """Returns the args/kwargs used to create `self` from its constructor. + + Returns: + A tuple of the args (as a tuple) and kwargs (as a Dict[str, Any]) used to + construct `self` from its class constructor. + """ + + @OverrideToImplementCustomLogic_CallToSuperRecommended + def get_metadata(self) -> Dict: + """Returns JSON writable metadata further describing the implementing class. + + Note that this metadata is NOT part of any state and is thus NOT needed to + restore the state of a Checkpointable instance from a directory. Rather, the + metadata will be written into `self.METADATA_FILE_NAME` when calling + `self.save_to_path()` for the user's convenience. + + Returns: + A JSON-encodable dict of metadata information. + """ + return { + "class_and_ctor_args_file": self.CLASS_AND_CTOR_ARGS_FILE_NAME, + "state_file": self.STATE_FILE_NAME, + "ray_version": ray.__version__, + "ray_commit": ray.__commit__, + } + + def get_checkpointable_components(self) -> List[Tuple[str, "Checkpointable"]]: + """Returns the implementing class's own Checkpointable subcomponents. + + Returns: + A list of 2-tuples (name, subcomponent) describing the implementing class' + subcomponents, all of which have to be `Checkpointable` themselves and + whose state is therefore written into subdirectories (rather than the main + state file (self.STATE_FILE_NAME) when calling `self.save_to_path()`). 
+ """ + return [] + + def _check_component(self, name, components, not_components) -> bool: + comp_list = force_list(components) + not_comp_list = force_list(not_components) + if ( + components is None + or any(c.startswith(name + "/") for c in comp_list) + or name in comp_list + ) and (not_components is None or name not in not_comp_list): + return True + return False + + def _get_subcomponents(self, name, components): + if components is None: + return None + + components = force_list(components) + subcomponents = [] + for comp in components: + if comp.startswith(name + "/"): + subcomponents.append(comp[len(name) + 1 :]) + + return None if not subcomponents else subcomponents + + def _restore_all_subcomponents_from_path( + self, path, filesystem, only_comp_names=None, component=None, **kwargs + ): + for comp_name, comp in self.get_checkpointable_components(): + if only_comp_names is not None and comp_name not in only_comp_names: + continue + + # The value of the `component` argument for the upcoming + # `[subcomponent].restore_from_path(.., component=..)` call. + comp_arg = None + + if component is None: + comp_dir = path / comp_name + # If subcomponent's dir is not in path, ignore it and don't restore this + # subcomponent's state from disk. + if not _exists_at_fs_path(filesystem, comp_dir.as_posix()): + continue + else: + comp_dir = path + + # `component` is a path that starts with `comp` -> Remove the name of + # `comp` from the `component` arg in the upcoming call to `restore_..`. + if component.startswith(comp_name + "/"): + comp_arg = component[len(comp_name) + 1 :] + # `component` has nothing to do with `comp` -> Skip. + elif component != comp_name: + continue + + # If component is an ActorManager, restore all the manager's healthy + # actors' states from disk (even if they are on another node, in which case, + # we'll sync checkpoint file(s) to the respective node). 
def _exists_at_fs_path(fs: pyarrow.fs.FileSystem, path: str) -> bool:
    """Returns `True` if the path can be found in the filesystem."""
    file_info = fs.get_file_info(path)
    return file_info.type != pyarrow.fs.FileType.NotFound


def _is_dir(file_info: pyarrow.fs.FileInfo) -> bool:
    """Returns `True`, if the file info is from a directory."""
    return file_info.type == pyarrow.fs.FileType.Directory


@OldAPIStack
def get_checkpoint_info(
    checkpoint: Union[str, Checkpoint],
    filesystem: Optional["pyarrow.fs.FileSystem"] = None,
) -> Dict[str, Any]:
    """Returns a dict with information about an Algorithm/Policy checkpoint.

    If the given checkpoint is a >=v1.0 checkpoint directory, try reading all
    information from the contained `rllib_checkpoint.json` file.

    Args:
        checkpoint: The checkpoint directory (str) or an AIR Checkpoint object.
        filesystem: PyArrow FileSystem to use to access data at the `checkpoint`.
            If not specified, this is inferred from the URI scheme provided by
            `checkpoint`.

    Returns:
        A dict containing the keys:
        "type": One of "Policy" or "Algorithm".
        "format": One of "cloudpickle" or "msgpack".
        "checkpoint_version": A version tuple, e.g. v1.0, indicating the checkpoint
        version. This will help RLlib to remain backward compatible wrt. future
        Ray and checkpoint versions.
        "checkpoint_dir": The directory with all the checkpoint files in it. This might
        be the same as the incoming `checkpoint` arg.
        "state_file": The main file with the Algorithm/Policy's state information in it.
        This is usually a pickle-encoded file.
        "policy_ids": An optional set of PolicyIDs in case we are dealing with an
        Algorithm checkpoint. None if `checkpoint` is a Policy checkpoint.

    Raises:
        ValueError: If no valid checkpoint file/dir can be found at `checkpoint`.
    """
    # Default checkpoint info.
    info = {
        "type": "Algorithm",
        "format": "cloudpickle",
        "checkpoint_version": CHECKPOINT_VERSION,
        "checkpoint_dir": None,
        "state_file": None,
        "policy_ids": None,
        "module_ids": None,
    }

    # `checkpoint` is a Checkpoint instance: Translate to directory and continue.
    if isinstance(checkpoint, Checkpoint):
        checkpoint = checkpoint.to_directory()

    # If no filesystem is passed in, infer it from the path's URI scheme.
    if checkpoint and not filesystem:
        # Note the path needs to be a path that is relative to the
        # filesystem (e.g. `gs://tmp/...` -> `tmp/...`).
        filesystem, checkpoint = pyarrow.fs.FileSystem.from_uri(checkpoint)

    # Bug fix: Always convert to a `pathlib.Path` here, not only in the
    # no-filesystem branch above. Previously, passing a custom `filesystem`
    # together with a string `checkpoint` crashed below on
    # `checkpoint.as_posix()`. Only here convert to a `Path` instance b/c
    # otherwise the cloud path gets broken (i.e. 'gs://' -> 'gs:/').
    checkpoint = pathlib.Path(checkpoint)

    # Checkpoint is dir.
    if _exists_at_fs_path(filesystem, checkpoint.as_posix()) and _is_dir(
        filesystem.get_file_info(checkpoint.as_posix())
    ):
        info.update({"checkpoint_dir": str(checkpoint)})

        # Figure out whether this is an older checkpoint format
        # (with a `checkpoint-\d+` file in it).
        file_info_list = filesystem.get_file_info(
            pyarrow.fs.FileSelector(checkpoint.as_posix(), recursive=False)
        )
        for file_info in file_info_list:
            if file_info.is_file and re.match(r"checkpoint-\d+", file_info.base_name):
                info.update(
                    {
                        "checkpoint_version": version.Version("0.1"),
                        "state_file": str(file_info.base_name),
                    }
                )
                return info

        # No old checkpoint file found.

        # If rllib_checkpoint.json file present, read available information from it
        # and then continue with the checkpoint analysis (possibly overriding
        # further information).
        if _exists_at_fs_path(
            filesystem, (checkpoint / "rllib_checkpoint.json").as_posix()
        ):
            with filesystem.open_input_stream(
                (checkpoint / "rllib_checkpoint.json").as_posix()
            ) as f:
                rllib_checkpoint_info = json.load(fp=f)
            if "checkpoint_version" in rllib_checkpoint_info:
                rllib_checkpoint_info["checkpoint_version"] = version.Version(
                    rllib_checkpoint_info["checkpoint_version"]
                )
            info.update(rllib_checkpoint_info)
        else:
            # No rllib_checkpoint.json file present: Warn and continue trying to
            # figure out checkpoint info ourselves.
            if log_once("no_rllib_checkpoint_json_file"):
                logger.warning(
                    "No `rllib_checkpoint.json` file found in checkpoint directory "
                    f"{checkpoint}! Trying to extract checkpoint info from other files "
                    f"found in that dir."
                )

        # Policy checkpoint file found.
        for extension in ["pkl", "msgpck"]:
            if _exists_at_fs_path(
                filesystem, (checkpoint / ("policy_state." + extension)).as_posix()
            ):
                info.update(
                    {
                        "type": "Policy",
                        "format": "cloudpickle" if extension == "pkl" else "msgpack",
                        "checkpoint_version": CHECKPOINT_VERSION,
                        "state_file": str(checkpoint / f"policy_state.{extension}"),
                    }
                )
                return info

        # Valid Algorithm checkpoint >v0 file found?
        format = None
        for extension in ["pkl", "msgpck", "msgpack"]:
            state_file = checkpoint / f"algorithm_state.{extension}"
            if (
                _exists_at_fs_path(filesystem, state_file.as_posix())
                and filesystem.get_file_info(state_file.as_posix()).is_file
            ):
                format = "cloudpickle" if extension == "pkl" else "msgpack"
                break
        if format is None:
            raise ValueError(
                "Given checkpoint does not seem to be valid! No file with the name "
                "`algorithm_state.[pkl|msgpack|msgpck]` (or `checkpoint-[0-9]+`) found."
            )

        info.update(
            {
                "format": format,
                "state_file": str(state_file),
            }
        )

        # Collect all policy IDs in the sub-dir "policies/".
        policies_dir = checkpoint / "policies"
        if _exists_at_fs_path(filesystem, policies_dir.as_posix()) and _is_dir(
            filesystem.get_file_info(policies_dir.as_posix())
        ):
            policy_ids = set()
            file_info_list = filesystem.get_file_info(
                pyarrow.fs.FileSelector(policies_dir.as_posix(), recursive=False)
            )
            for file_info in file_info_list:
                policy_ids.add(file_info.base_name)
            info.update({"policy_ids": policy_ids})

        # Collect all module IDs in the sub-dir "learner/module_state/".
        modules_dir = (
            checkpoint
            / COMPONENT_LEARNER_GROUP
            / COMPONENT_LEARNER
            / COMPONENT_RL_MODULE
        )
        # Bug fix: Check existence of `modules_dir` here (previously, this
        # re-checked `checkpoint`, which is always True at this point, and only
        # the NotFound semantics of `_is_dir` prevented a wrong result).
        if _exists_at_fs_path(filesystem, modules_dir.as_posix()) and _is_dir(
            filesystem.get_file_info(modules_dir.as_posix())
        ):
            module_ids = set()
            file_info_list = filesystem.get_file_info(
                pyarrow.fs.FileSelector(modules_dir.as_posix(), recursive=False)
            )
            for file_info in file_info_list:
                # Only add subdirs (those are the ones where the RLModule data
                # is stored), not files (could be json metadata files).
                module_dir = modules_dir / file_info.base_name
                if _is_dir(filesystem.get_file_info(module_dir.as_posix())):
                    module_ids.add(file_info.base_name)
            info.update({"module_ids": module_ids})

    # Checkpoint is a file: Use as-is (interpreting it as old Algorithm checkpoint
    # version).
    elif (
        _exists_at_fs_path(filesystem, checkpoint.as_posix())
        and filesystem.get_file_info(checkpoint.as_posix()).is_file
    ):
        info.update(
            {
                "checkpoint_version": version.Version("0.1"),
                "checkpoint_dir": str(checkpoint.parent),
                "state_file": str(checkpoint),
            }
        )

    else:
        raise ValueError(
            f"Given checkpoint ({str(checkpoint)}) not found! Must be a "
            "checkpoint directory (or a file for older checkpoint versions)."
        )

    return info
@OldAPIStack
def convert_to_msgpack_policy_checkpoint(
    policy_checkpoint: Union[str, Checkpoint],
    msgpack_checkpoint_dir: str,
) -> str:
    """Converts a Policy checkpoint (pickle based) to a msgpack based one.

    Msgpack has the advantage of being python version independent.

    Args:
        policy_checkpoint: The directory, in which to find the Policy checkpoint
            (pickle based).
        msgpack_checkpoint_dir: The directory, in which to create the new msgpack
            based checkpoint.

    Returns:
        The directory in which the msgpack checkpoint has been created. Note that
        this is the same as `msgpack_checkpoint_dir`.
    """
    from ray.rllib.policy.policy import Policy

    # Restore the Policy from the (pickle-based) checkpoint.
    restored_policy = Policy.from_checkpoint(policy_checkpoint)

    # Re-export its state into the target dir, this time msgpack-encoded.
    os.makedirs(msgpack_checkpoint_dir, exist_ok=True)
    restored_policy.export_checkpoint(
        msgpack_checkpoint_dir,
        policy_state=restored_policy.get_state(),
        checkpoint_format="msgpack",
    )

    # Release all resources used by the Policy.
    del restored_policy

    return msgpack_checkpoint_dir


@PublicAPI
def try_import_msgpack(error: bool = False):
    """Tries importing msgpack and msgpack_numpy and returns the patched msgpack module.

    Returns None if error is False and msgpack or msgpack_numpy is not installed.
    Raises an error, if error is True and the modules could not be imported.

    Args:
        error: Whether to raise an error if msgpack/msgpack_numpy cannot be imported.

    Returns:
        The `msgpack` module (patched by msgpack_numpy), or None.

    Raises:
        ImportError: If error=True and msgpack/msgpack_numpy is not installed.
    """
    try:
        import msgpack
        import msgpack_numpy

        # Make msgpack_numpy look like msgpack.
        msgpack_numpy.patch()
    except Exception:
        if error:
            raise ImportError(
                "Could not import or setup msgpack and msgpack_numpy! "
                "Try running `pip install msgpack msgpack_numpy` first."
            )
        return None

    return msgpack
# A constant to use for any configuration that should be deprecated
# (to check, whether this config has actually been assigned a proper value or
# not).
DEPRECATED_VALUE = -1


def deprecation_warning(
    old: str,
    new: Optional[str] = None,
    *,
    help: Optional[str] = None,
    error: Optional[Union[bool, Exception]] = None,
) -> None:
    """Warns (via the `logger` object) or throws a deprecation warning/error.

    Args:
        old: A description of the "thing" that is to be deprecated.
        new: A description of the new "thing" that replaces it.
        help: An optional help text to tell the user, what to
            do instead of using `old`.
        error: Whether or which exception to raise. If True, raise ValueError.
            If False, just warn. If `error` is-a subclass of Exception,
            raise that Exception.

    Raises:
        ValueError: If `error=True`.
        Exception: Of type `error`, iff `error` is a sub-class of `Exception`.
    """
    # Build the message: prefer pointing at the replacement, then the help
    # text, then nothing.
    if new:
        suffix = f" Use `{new}` instead."
    elif help:
        suffix = f" {help}"
    else:
        suffix = ""
    msg = f"`{old}` has been deprecated.{suffix}"

    if not error:
        logger.warning(
            "DeprecationWarning: " + msg + " This will raise an error in the future!"
        )
        return

    # `error` is truthy: either a plain flag or an Exception subclass.
    if not isinstance(error, bool) and issubclass(error, Exception):
        raise error(msg)
    raise ValueError(msg)


def Deprecated(old=None, *, new=None, help=None, error):
    """Decorator for documenting a deprecated class, method, or function.

    Patches the decorated class' constructor (or wraps the decorated
    method/function) so that each first use emits a `deprecation_warning`
    (or raises, depending on `error`), then delegates to the original
    implementation.

    Args:
        old: Name of the deprecated "thing"; defaults to the decorated
            object's `__name__`.
        new: Name of the replacement, if any.
        help: Extra help text, shown when there is no replacement.
        error: Whether/what to raise (see `deprecation_warning`).
    """

    def _inner(obj):
        def _warn_once():
            # Warn (or raise) only once per deprecated name.
            if log_once(old or obj.__name__):
                deprecation_warning(
                    old=old or obj.__name__,
                    new=new,
                    help=help,
                    error=error,
                )

        # A deprecated class: patch its constructor to warn on instantiation.
        if inspect.isclass(obj):
            original_init = obj.__init__

            def patched_init(*args, **kwargs):
                _warn_once()
                return original_init(*args, **kwargs)

            obj.__init__ = patched_init
            _mark_annotated(obj)
            # Return the patched class (with the warning/error when
            # instantiated).
            return obj

        # A deprecated class method or function: wrap it to warn when called.
        def _ctor(*args, **kwargs):
            _warn_once()
            # Call the deprecated method/function.
            return obj(*args, **kwargs)

        # Return the patched class method/function.
        return _ctor

    # Return the prepared decorator.
    return _inner
+ +Try one of the following: +a) For Atari support: `pip install gym[atari] autorom[accept-rom-license]`. + For PyBullet support: `pip install pybullet`. +b) To register your custom env, do `from ray import tune; + tune.register('[name]', lambda cfg: [return env obj from here using cfg])`. + Then in your config, do `config['env'] = [name]`. +c) Make sure you provide a fully qualified classpath, e.g.: + `ray.rllib.examples.envs.classes.repeat_after_me_env.RepeatAfterMeEnv` +""" + + +ERR_MSG_OLD_GYM_API = """Your environment ({}) does not abide to the new gymnasium-style API! +From Ray 2.3 on, RLlib only supports the new (gym>=0.26 or gymnasium) Env APIs. +{} +Learn more about the most important changes here: +https://github.com/openai/gym and here: https://github.com/Farama-Foundation/Gymnasium + +In order to fix this problem, do the following: + +1) Run `pip install gymnasium` on your command line. +2) Change all your import statements in your code from + `import gym` -> `import gymnasium as gym` OR + `from gym.spaces import Discrete` -> `from gymnasium.spaces import Discrete` + +For your custom (single agent) gym.Env classes: +3.1) Either wrap your old Env class via the provided `from gymnasium.wrappers import + EnvCompatibility` wrapper class. +3.2) Alternatively to 3.1: + - Change your `reset()` method to have the call signature 'def reset(self, *, + seed=None, options=None)' + - Return an additional info dict (empty dict should be fine) from your `reset()` + method. + - Return an additional `truncated` flag from your `step()` method (between `done` and + `info`). This flag should indicate, whether the episode was terminated prematurely + due to some time constraint or other kind of horizon setting. + +For your custom RLlib `MultiAgentEnv` classes: +4.1) Either wrap your old MultiAgentEnv via the provided + `from ray.rllib.env.wrappers.multi_agent_env_compatibility import + MultiAgentEnvCompatibility` wrapper class. 
ERR_MSG_TF_POLICY_CANNOT_SAVE_KERAS_MODEL = """Could not save keras model under self[TfPolicy].model.base_model!
    This is either due to ..
    a) .. this Policy's ModelV2 not having any `base_model` (tf.keras.Model) property
    b) .. the ModelV2's `base_model` not being used by the Algorithm and thus its
    variables not being properly initialized.
"""  # noqa

# Fixed grammar of the user-facing message below: "which is torch.save() cannot
# pickle" -> "which torch.save() cannot pickle".
ERR_MSG_TORCH_POLICY_CANNOT_SAVE_MODEL = """Could not save torch model under self[TorchPolicy].model!
    This is most likely due to the fact that you are using an Algorithm that
    uses a Catalog-generated TorchModelV2 subclass, which torch.save() cannot pickle.
"""  # noqa

# -------
# HOWTO_ strings can be added to any error/warning/info message
# to explain to the user, how to actually fix the encountered problem.
# -------
@OldAPIStack
class FilterManager:
    """Manages filters and coordination across remote evaluators that expose
    `get_filters` and `sync_filters`.
    """

    @staticmethod
    def synchronize(
        local_filters,
        worker_set,
        update_remote=True,
        timeout_seconds: Optional[float] = None,
        use_remote_data_for_update: bool = True,
    ):
        """Aggregates filters from remote workers (if use_remote_data_for_update=True).

        Local copy is updated and then broadcasted to all remote evaluators
        (if `update_remote` is True).

        Args:
            local_filters: Filters to be synchronized.
            worker_set: EnvRunnerGroup with remote EnvRunners with filters.
            update_remote: Whether to push updates from the local filters to the
                remote workers' filters.
            timeout_seconds: How long to wait for filter to get or set filters.
            use_remote_data_for_update: Whether to use the `worker_set`'s remote
                workers to update the local filters. If False, stats from the
                remote workers will not be used and discarded.
        """
        # Neither pulling nor pushing requested -> Nothing to do.
        if not update_remote and not use_remote_data_for_update:
            return

        logger.debug(f"Synchronizing filters: {local_filters}")

        # Pull the current filter states from all healthy remote workers
        # (flushing their buffers in the process).
        remote_filters = worker_set.foreach_env_runner(
            func=lambda worker: worker.get_filters(flush_after=True),
            local_env_runner=False,
            timeout_seconds=timeout_seconds,
        )
        if len(remote_filters) != worker_set.num_healthy_remote_workers():
            logger.error(
                "Failed to get remote filters from a rollout worker in "
                "FilterManager! "
                "Filtered metrics may be computed, but filtered wrong."
            )

        # Merge the remote workers' filter stats into the local filters.
        if use_remote_data_for_update:
            for remote_filter_dict in remote_filters:
                for key, local_filter in local_filters.items():
                    local_filter.apply_changes(
                        remote_filter_dict[key], with_buffer=False
                    )

        # Broadcast the (now possibly synched) local filters back to all remote
        # workers.
        if update_remote:
            serializable_filters = {
                key: local_filter.as_serializable()
                for key, local_filter in local_filters.items()
            }
            remote_copy = ray.put(serializable_filters)

            logger.debug("Updating remote filters ...")
            results = worker_set.foreach_env_runner(
                func=lambda worker: worker.sync_filters(ray.get(remote_copy)),
                local_env_runner=False,
                timeout_seconds=timeout_seconds,
            )
            if len(results) != worker_set.num_healthy_remote_workers():
                logger.error(
                    "Failed to set remote filters to a rollout worker in "
                    "FilterManager. "
                    "Filtered metrics may be computed, but filtered wrong."
                )
@PublicAPI
def convert_to_tensor(
    data: TensorStructType,
    framework: str,
    device: Optional[str] = None,
):
    """Converts any nested numpy struct into framework-specific tensors.

    Args:
        data: The input data (numpy) to convert to framework-specific tensors.
        framework: The framework to convert to. Only "torch" and "tf2" allowed.
        device: An optional device name (for torch only).

    Returns:
        The converted tensor struct matching the input data.
    """
    if framework == "torch":
        from ray.rllib.utils.torch_utils import convert_to_torch_tensor

        return convert_to_torch_tensor(data, device=device)
    if framework == "tf2":
        _, tf, _ = try_import_tf()

        return tree.map_structure(tf.convert_to_tensor, data)
    raise NotImplementedError(
        f"framework={framework} not supported in `convert_to_tensor()`!"
    )


@PublicAPI
def get_device(config: "AlgorithmConfig", num_gpus_requested: int = 1):
    """Returns a single device (CPU or some GPU) depending on a config.

    Args:
        config: An AlgorithmConfig to extract information from about the device
            to use.
        num_gpus_requested: The number of GPUs actually requested. This may be the
            value of `config.num_gpus_per_env_runner` when for example calling
            this function from an EnvRunner.

    Returns:
        A single device (or name) given `config` and `num_gpus_requested`.
    """
    if config.framework_str != "torch":
        raise NotImplementedError(
            f"`framework_str` {config.framework_str} not supported!"
        )

    torch, _ = try_import_torch()

    # TODO (Kourosh): How do we handle model parallelism?
    # TODO (Kourosh): Instead of using _TorchAccelerator, we should use the public
    #  API in ray.train but allow for session to be None without any errors raised.
    if num_gpus_requested <= 0:
        return torch.device("cpu")

    from ray.air._internal.torch_utils import get_devices

    # `get_devices()` returns a list that contains the 0th device if
    # it is called from outside a Ray Train session. It's necessary to give
    # the user the option to run on the gpu of their choice, so we enable that
    # option here through `config.local_gpu_idx`.
    devices = get_devices()
    if len(devices) == 1:
        return devices[0]

    assert config.local_gpu_idx < torch.cuda.device_count(), (
        f"local_gpu_idx {config.local_gpu_idx} is not a valid GPU ID "
        "or is not available."
    )
    # This is an index into the available CUDA devices. For example, if
    # `os.environ["CUDA_VISIBLE_DEVICES"] = "1"` then
    # `torch.cuda.device_count() = 1` and torch.device(0) maps to that GPU
    # with ID=1 on the node.
    return torch.device(config.local_gpu_idx)


@PublicAPI
def try_import_jax(error: bool = False):
    """Tries importing JAX and FLAX and returns both modules (or Nones).

    Args:
        error: Whether to raise an error if JAX/FLAX cannot be imported.

    Returns:
        Tuple containing the jax- and the flax modules.

    Raises:
        ImportError: If error=True and JAX is not installed.
    """
    if "RLLIB_TEST_NO_JAX_IMPORT" in os.environ:
        logger.warning("Not importing JAX for test purposes.")
        return None, None

    try:
        import flax
        import jax
    except ImportError:
        if error:
            raise ImportError(
                "Could not import JAX! RLlib requires you to "
                "install at least one deep-learning framework: "
                "`pip install [torch|tensorflow|jax]`."
            )
        return None, None

    return jax, flax


@PublicAPI
def try_import_tf(error: bool = False):
    """Tries importing tf and returns the module (or None).

    Args:
        error: Whether to raise an error if tf cannot be imported.

    Returns:
        Tuple containing
        1) tf1.x module (either from tf2.x.compat.v1 OR as tf1.x).
        2) tf module (resulting from `import tensorflow`), either tf1.x or 2.x.
        3) The actually installed tf version as int: 1 or 2.

    Raises:
        ImportError: If error=True and tf is not installed.
    """
    tf_stub = _TFStub()
    # Make sure, these are reset after each test case
    # that uses them: del os.environ["RLLIB_TEST_NO_TF_IMPORT"]
    if "RLLIB_TEST_NO_TF_IMPORT" in os.environ:
        logger.warning("Not importing TensorFlow for test purposes")
        return None, tf_stub, None

    os.environ.setdefault("TF_CPP_MIN_LOG_LEVEL", "3")

    # Try to reuse an already imported tf module. This will avoid going through
    # the initial import steps below and thereby switching off v2_behavior
    # (switching off v2 behavior twice breaks all-framework tests for eager).
    tf_module = sys.modules.get("tensorflow")
    was_imported = tf_module is not None
    if not was_imported:
        try:
            import tensorflow as tf_module
        except ImportError:
            if error:
                raise ImportError(
                    "Could not import TensorFlow! RLlib requires you to "
                    "install at least one deep-learning framework: "
                    "`pip install [torch|tensorflow|jax]`."
                )
            return None, tf_stub, None

    # Try "reducing" tf to tf.compat.v1.
    try:
        tf1_module = tf_module.compat.v1
        tf1_module.logging.set_verbosity(tf1_module.logging.ERROR)
        if not was_imported:
            tf1_module.disable_v2_behavior()
            tf1_module.enable_resource_variables()
        tf1_module.logging.set_verbosity(tf1_module.logging.WARN)
    # No compat.v1 -> return tf as is.
    except AttributeError:
        tf1_module = tf_module

    if not hasattr(tf_module, "__version__"):
        version = 1  # sphinx doc gen
    else:
        version = 2 if tf_module.__version__.startswith("2.") else 1

    return tf1_module, tf_module, version


# Fake module for tf, returned when TensorFlow is not installed.
class _TFStub:
    def __init__(self) -> None:
        self.keras = _KerasStub()

    def __bool__(self):
        # If used as a truth value, the stub reads as "no tf available".
        return False


# Fake module for tf.keras.
class _KerasStub:
    def __init__(self) -> None:
        self.Model = _FakeTfClassStub


# Fake classes under keras (e.g. for tf.keras.Model): raise on instantiation.
class _FakeTfClassStub:
    def __init__(self, *a, **kw):
        raise ImportError("Could not import `tensorflow`. Try pip install tensorflow.")


@DeveloperAPI
def tf_function(tf_module):
    """Conditional decorator for @tf.function.

    Use @tf_function(tf) instead to avoid errors if tf is not installed."""

    # The actual decorator to use (pass in `tf` (which could be None)).
    def decorator(func):
        # If tf not installed (or in eager mode) -> return function as is.
        if tf_module is None or tf_module.executing_eagerly():
            return func
        # If tf installed, return @tf.function-decorated function.
        return tf_module.function(func)

    return decorator
+ """ + if "RLLIB_TEST_NO_TF_IMPORT" in os.environ: + logger.warning("Not importing TensorFlow Probability for test purposes.") + return None + + try: + import tensorflow_probability as tfp + + return tfp + except ImportError as e: + if error: + raise e + return None + + +# Fake module for torch.nn. +class _NNStub: + def __init__(self, *a, **kw): + # Fake nn.functional module within torch.nn. + self.functional = None + self.Module = _FakeTorchClassStub + self.parallel = _ParallelStub() + + +# Fake class for e.g. torch.nn.Module to allow it to be inherited from. +class _FakeTorchClassStub: + def __init__(self, *a, **kw): + raise ImportError("Could not import `torch`. Try pip install torch.") + + +class _ParallelStub: + def __init__(self, *a, **kw): + self.DataParallel = _FakeTorchClassStub + self.DistributedDataParallel = _FakeTorchClassStub + + +@PublicAPI +def try_import_torch(error: bool = False): + """Tries importing torch and returns the module (or None). + + Args: + error: Whether to raise an error if torch cannot be imported. + + Returns: + Tuple consisting of the torch- AND torch.nn modules. + + Raises: + ImportError: If error=True and PyTorch is not installed. + """ + if "RLLIB_TEST_NO_TORCH_IMPORT" in os.environ: + logger.warning("Not importing PyTorch for test purposes.") + return _torch_stubs() + + try: + import torch + import torch.nn as nn + + return torch, nn + except ImportError: + if error: + raise ImportError( + "Could not import PyTorch! RLlib requires you to " + "install at least one deep-learning framework: " + "`pip install [torch|tensorflow|jax]`." 
+ ) + return _torch_stubs() + + +def _torch_stubs(): + nn = _NNStub() + return None, nn + + +@DeveloperAPI +def get_variable( + value: Any, + framework: str = "tf", + trainable: bool = False, + tf_name: str = "unnamed-variable", + torch_tensor: bool = False, + device: Optional[str] = None, + shape: Optional[TensorShape] = None, + dtype: Optional[TensorType] = None, +) -> Any: + """Creates a tf variable, a torch tensor, or a python primitive. + + Args: + value: The initial value to use. In the non-tf case, this will + be returned as is. In the tf case, this could be a tf-Initializer + object. + framework: One of "tf", "torch", or None. + trainable: Whether the generated variable should be + trainable (tf)/require_grad (torch) or not (default: False). + tf_name: For framework="tf": An optional name for the + tf.Variable. + torch_tensor: For framework="torch": Whether to actually create + a torch.tensor, or just a python value (default). + device: An optional torch device to use for + the created torch tensor. + shape: An optional shape to use iff `value` + does not have any (e.g. if it's an initializer w/o explicit value). + dtype: An optional dtype to use iff `value` does + not have any (e.g. if it's an initializer w/o explicit value). + This should always be a numpy dtype (e.g. np.float32, np.int64). + + Returns: + A framework-specific variable (tf.Variable, torch.tensor, or + python primitive). 
+ """ + if framework in ["tf2", "tf"]: + import tensorflow as tf + + dtype = dtype or getattr( + value, + "dtype", + tf.float32 + if isinstance(value, float) + else tf.int32 + if isinstance(value, int) + else None, + ) + return tf.compat.v1.get_variable( + tf_name, + initializer=value, + dtype=dtype, + trainable=trainable, + **({} if shape is None else {"shape": shape}), + ) + elif framework == "torch" and torch_tensor is True: + torch, _ = try_import_torch() + if not isinstance(value, np.ndarray): + value = np.array(value) + var_ = torch.from_numpy(value) + if dtype in [torch.float32, np.float32]: + var_ = var_.float() + elif dtype in [torch.int32, np.int32]: + var_ = var_.int() + elif dtype in [torch.float64, np.float64]: + var_ = var_.double() + + if device: + var_ = var_.to(device) + var_.requires_grad = trainable + return var_ + # torch or None: Return python primitive. + return value + + +@Deprecated( + old="rllib/utils/framework.py::get_activation_fn", + new="rllib/models/utils.py::get_activation_fn", + error=True, +) +def get_activation_fn(name: Optional[str] = None, framework: str = "tf"): + pass diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/utils/from_config.py b/.venv/lib/python3.11/site-packages/ray/rllib/utils/from_config.py new file mode 100644 index 0000000000000000000000000000000000000000..522ba8dd28783f93d41cf257507808f38259c5a4 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/utils/from_config.py @@ -0,0 +1,325 @@ +from copy import deepcopy +from functools import partial +import importlib +import json +import os +import re +import yaml + +from ray.rllib.utils.annotations import DeveloperAPI +from ray.rllib.utils import force_list, merge_dicts + + +@DeveloperAPI +def from_config(cls, config=None, **kwargs): + """Uses the given config to create an object. + + If `config` is a dict, an optional "type" key can be used as a + "constructor hint" to specify a certain class of the object. 
+ If `config` is not a dict, `config`'s value is used directly as this + "constructor hint". + + The rest of `config` (if it's a dict) will be used as kwargs for the + constructor. Additional keys in **kwargs will always have precedence + (overwrite keys in `config` (if a dict)). + Also, if the config-dict or **kwargs contains the special key "_args", + it will be popped from the dict and used as *args list to be passed + separately to the constructor. + + The following constructor hints are valid: + - None: Use `cls` as constructor. + - An already instantiated object: Will be returned as is; no + constructor call. + - A string or an object that is a key in `cls`'s `__type_registry__` + dict: The value in `__type_registry__` for that key will be used + as the constructor. + - A python callable: Use that very callable as constructor. + - A string: Either a json/yaml filename or the name of a python + module+class (e.g. "ray.rllib. [...] .[some class name]") + + Args: + cls: The class to build an instance for (from `config`). + config (Optional[dict, str]): The config dict or type-string or + filename. + + Keyword Args: + kwargs: Optional possibility to pass the constructor arguments in + here and use `config` as the type-only info. Then we can call + this like: from_config([type]?, [**kwargs for constructor]) + If `config` is already a dict, then `kwargs` will be merged + with `config` (overwriting keys in `config`) after "type" has + been popped out of `config`. + If a constructor of a Configurable needs *args, the special + key `_args` can be passed inside `kwargs` with a list value + (e.g. kwargs={"_args": [arg1, arg2, arg3]}). + + Returns: + any: The object generated from the config. + """ + # `cls` is the config (config is None). + if config is None and isinstance(cls, (dict, str)): + config = cls + cls = None + # `config` is already a created object of this class -> + # Take it as is. 
+ elif isinstance(cls, type) and isinstance(config, cls): + return config + + # `type_`: Indicator for the Configurable's constructor. + # `ctor_args`: *args arguments for the constructor. + # `ctor_kwargs`: **kwargs arguments for the constructor. + # Try to copy, so caller can reuse safely. + try: + config = deepcopy(config) + except Exception: + pass + if isinstance(config, dict): + type_ = config.pop("type", None) + if type_ is None and isinstance(cls, str): + type_ = cls + ctor_kwargs = config + # Give kwargs priority over things defined in config dict. + # This way, one can pass a generic `spec` and then override single + # constructor parameters via the kwargs in the call to `from_config`. + ctor_kwargs.update(kwargs) + else: + type_ = config + if type_ is None and "type" in kwargs: + type_ = kwargs.pop("type") + ctor_kwargs = kwargs + # Special `_args` field in kwargs for *args-utilizing constructors. + ctor_args = force_list(ctor_kwargs.pop("_args", [])) + + # Figure out the actual constructor (class) from `type_`. + # None: Try __default__object (if no args/kwargs), only then + # constructor of cls (using args/kwargs). + if type_ is None: + # We have a default constructor that was defined directly by cls + # (not by its children). + if ( + cls is not None + and hasattr(cls, "__default_constructor__") + and cls.__default_constructor__ is not None + and ctor_args == [] + and ( + not hasattr(cls.__bases__[0], "__default_constructor__") + or cls.__bases__[0].__default_constructor__ is None + or cls.__bases__[0].__default_constructor__ + is not cls.__default_constructor__ + ) + ): + constructor = cls.__default_constructor__ + # Default constructor's keywords into ctor_kwargs. + if isinstance(constructor, partial): + kwargs = merge_dicts(ctor_kwargs, constructor.keywords) + constructor = partial(constructor.func, **kwargs) + ctor_kwargs = {} # erase to avoid duplicate kwarg error + # No default constructor -> Try cls itself as constructor. 
+ else: + constructor = cls + # Try the __type_registry__ of this class. + else: + constructor = _lookup_type(cls, type_) + + # Found in cls.__type_registry__. + if constructor is not None: + pass + # type_ is False or None (and this value is not registered) -> + # return value of type_. + elif type_ is False or type_ is None: + return type_ + # Python callable. + elif callable(type_): + constructor = type_ + # A string: Filename or a python module+class or a json/yaml str. + elif isinstance(type_, str): + if re.search("\\.(yaml|yml|json)$", type_): + return from_file(cls, type_, *ctor_args, **ctor_kwargs) + # Try un-json/un-yaml'ing the string into a dict. + obj = yaml.safe_load(type_) + if isinstance(obj, dict): + return from_config(cls, obj) + try: + obj = from_config(cls, json.loads(type_)) + except json.JSONDecodeError: + pass + else: + return obj + + # Test for absolute module.class path specifier. + if type_.find(".") != -1: + module_name, function_name = type_.rsplit(".", 1) + try: + module = importlib.import_module(module_name) + constructor = getattr(module, function_name) + # Module not found. + except (ModuleNotFoundError, ImportError, AttributeError): + pass + + # If constructor still not found, try attaching cls' module, + # then look for type_ in there. + if constructor is None: + if isinstance(cls, str): + # Module found, but doesn't have the specified + # c'tor/function. + raise ValueError( + f"Full classpath specifier ({type_}) must be a valid " + "full [module].[class] string! E.g.: " + "`my.cool.module.MyCoolClass`." + ) + + try: + module = importlib.import_module(cls.__module__) + constructor = getattr(module, type_) + except (ModuleNotFoundError, ImportError, AttributeError): + # Try the package as well. 
+ try: + package_name = importlib.import_module( + cls.__module__ + ).__package__ + module = __import__(package_name, fromlist=[type_]) + constructor = getattr(module, type_) + except (ModuleNotFoundError, ImportError, AttributeError): + pass + + if constructor is None: + raise ValueError( + f"String specifier ({type_}) must be a valid filename, " + f"a [module].[class], a class within '{cls.__module__}', " + f"or a key into {cls.__name__}.__type_registry__!" + ) + + if not constructor: + raise TypeError("Invalid type '{}'. Cannot create `from_config`.".format(type_)) + + # Create object with inferred constructor. + try: + object_ = constructor(*ctor_args, **ctor_kwargs) + # Catch attempts to construct from an abstract class and return None. + except TypeError as e: + if re.match("Can't instantiate abstract class", e.args[0]): + return None + raise e # Re-raise + # No sanity check for fake (lambda)-"constructors". + if type(constructor).__name__ != "function": + assert isinstance( + object_, + constructor.func if isinstance(constructor, partial) else constructor, + ) + + return object_ + + +@DeveloperAPI +def from_file(cls, filename, *args, **kwargs): + """ + Create object from config saved in filename. Expects json or yaml file. + + Args: + filename: File containing the config (json or yaml). + + Returns: + any: The object generated from the file. + """ + path = os.path.join(os.getcwd(), filename) + if not os.path.isfile(path): + raise FileNotFoundError("File '{}' not found!".format(filename)) + + with open(path, "rt") as fp: + if path.endswith(".yaml") or path.endswith(".yml"): + config = yaml.safe_load(fp) + else: + config = json.load(fp) + + # Add possible *args. 
+ config["_args"] = args + return from_config(cls, config=config, **kwargs) + + +def _lookup_type(cls, type_): + if ( + cls is not None + and hasattr(cls, "__type_registry__") + and isinstance(cls.__type_registry__, dict) + and ( + type_ in cls.__type_registry__ + or ( + isinstance(type_, str) + and re.sub("[\\W_]", "", type_.lower()) in cls.__type_registry__ + ) + ) + ): + available_class_for_type = cls.__type_registry__.get(type_) + if available_class_for_type is None: + available_class_for_type = cls.__type_registry__[ + re.sub("[\\W_]", "", type_.lower()) + ] + return available_class_for_type + return None + + +class _NotProvided: + """Singleton class to provide a "not provided" value for AlgorithmConfig signatures. + + Using the only instance of this class indicates that the user does NOT wish to + change the value of some property. + + .. testcode:: + :skipif: True + + from ray.rllib.algorithms.algorithm_config import AlgorithmConfig + config = AlgorithmConfig() + # Print out the default learning rate. + print(config.lr) + + .. testoutput:: + + 0.001 + + .. testcode:: + :skipif: True + + # Print out the default `preprocessor_pref`. + print(config.preprocessor_pref) + + .. testoutput:: + + "deepmind" + + .. testcode:: + :skipif: True + + # Will only set the `preprocessor_pref` property (to None) and leave + # all other properties at their default values. + config.training(preprocessor_pref=None) + config.preprocessor_pref is None + + .. testoutput:: + + True + + .. testcode:: + :skipif: True + + # Still the same value (didn't touch it in the call to `.training()`. + print(config.lr) + + .. testoutput:: + + 0.001 + """ + + class __NotProvided: + pass + + instance = None + + def __init__(self): + if _NotProvided.instance is None: + _NotProvided.instance = _NotProvided.__NotProvided() + + +# Use this object as default values in all method signatures of +# AlgorithmConfig, indicating that the respective property should NOT be touched +# in the call. 
+NotProvided = _NotProvided() diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/utils/images.py b/.venv/lib/python3.11/site-packages/ray/rllib/utils/images.py new file mode 100644 index 0000000000000000000000000000000000000000..7b0f1601d574a5e34805b30cfed7ca9c391c1f0c --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/utils/images.py @@ -0,0 +1,60 @@ +import logging +import importlib + +import numpy as np + +from ray.rllib.utils.annotations import DeveloperAPI + +logger = logging.getLogger(__name__) + + +@DeveloperAPI +def is_package_installed(package_name): + try: + importlib.metadata.version(package_name) + return True + except importlib.metadata.PackageNotFoundError: + return False + + +try: + import cv2 + + cv2.ocl.setUseOpenCL(False) + + logger.debug("CV2 found for image processing.") +except ImportError as e: + if is_package_installed("opencv-python"): + raise ImportError( + f"OpenCV is installed, but we failed to import it. This may be because " + f"you need to install `opencv-python-headless` instead of " + f"`opencv-python`. Error message: {e}", + ) + cv2 = None + + +@DeveloperAPI +def resize(img: np.ndarray, height: int, width: int) -> np.ndarray: + if not cv2: + raise ModuleNotFoundError( + "`opencv` not installed! Do `pip install opencv-python`" + ) + return cv2.resize(img, (width, height), interpolation=cv2.INTER_AREA) + + +@DeveloperAPI +def rgb2gray(img: np.ndarray) -> np.ndarray: + if not cv2: + raise ModuleNotFoundError( + "`opencv` not installed! Do `pip install opencv-python`" + ) + return cv2.cvtColor(img, cv2.COLOR_RGB2GRAY) + + +@DeveloperAPI +def imread(img_file: str) -> np.ndarray: + if not cv2: + raise ModuleNotFoundError( + "`opencv` not installed! 
Do `pip install opencv-python`" + ) + return cv2.imread(img_file).astype(np.float32) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/utils/lambda_defaultdict.py b/.venv/lib/python3.11/site-packages/ray/rllib/utils/lambda_defaultdict.py new file mode 100644 index 0000000000000000000000000000000000000000..ce4653961c565304570da87eea9066dc366643e0 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/utils/lambda_defaultdict.py @@ -0,0 +1,52 @@ +from collections import defaultdict +from typing import Any, Callable + + +class LambdaDefaultDict(defaultdict): + """A defaultdict that creates default values based on the associated key. + + Note that the standard defaultdict can only produce default values (via its factory) + that are independent of the key under which they are stored. + As opposed to that, the lambda functions used as factories for this + `LambdaDefaultDict` class do accept a single argument: The missing key. + If a missing key is accessed by the user, the provided lambda function is called + with this missing key as its argument. The returned value is stored in the + dictionary under that key and returned. + + Example: + + In this example, if you try to access a key that doesn't exist, it will call + the lambda function, passing it the missing key. The function will return a + string, which will be stored in the dictionary under that key. + + .. testcode:: + + from ray.rllib.utils.lambda_defaultdict import LambdaDefaultDict + + default_dict = LambdaDefaultDict(lambda missing_key: f"Value for {missing_key}") + print(default_dict["a"]) + + .. testoutput:: + + Value for a + """ # noqa: E501 + + def __init__(self, default_factory: Callable[[str], Any], *args, **kwargs): + """Initializes a LambdaDefaultDict instance. + + Args: + default_factory: The default factory callable, taking a string (key) + and returning the default value to use for that key. 
+ """ + if not callable(default_factory): + raise TypeError("First argument must be a Callable!") + + # We will handle the factory in __missing__ method. + super().__init__(None, *args, **kwargs) + + self.default_factory = default_factory + + def __missing__(self, key): + # Call default factory with the key as argument. + self[key] = value = self.default_factory(key) + return value diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/utils/memory.py b/.venv/lib/python3.11/site-packages/ray/rllib/utils/memory.py new file mode 100644 index 0000000000000000000000000000000000000000..fe739cc0f99b8f23f68f31021b446cbf06f64d17 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/utils/memory.py @@ -0,0 +1,8 @@ +from ray.rllib.utils.deprecation import deprecation_warning +from ray.rllib.utils.numpy import aligned_array, concat_aligned # noqa + +deprecation_warning( + old="ray.rllib.utils.memory.[...]", + new="ray.rllib.utils.numpy.[...]", + error=True, +) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/utils/numpy.py b/.venv/lib/python3.11/site-packages/ray/rllib/utils/numpy.py new file mode 100644 index 0000000000000000000000000000000000000000..831a4fbcf5365cae130d638569cad20454e5e9fd --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/utils/numpy.py @@ -0,0 +1,606 @@ +from collections import OrderedDict +from gymnasium.spaces import Discrete, MultiDiscrete +import numpy as np +import tree # pip install dm_tree +from types import MappingProxyType +from typing import List, Optional + + +from ray.rllib.utils.annotations import PublicAPI +from ray.rllib.utils.deprecation import Deprecated +from ray.rllib.utils.framework import try_import_tf, try_import_torch +from ray.rllib.utils.typing import SpaceStruct, TensorType, TensorStructType, Union + +tf1, tf, tfv = try_import_tf() +torch, _ = try_import_torch() + +SMALL_NUMBER = 1e-6 +# Some large int number. May be increased here, if needed. 
+LARGE_INTEGER = 100000000 +# Min and Max outputs (clipped) from an NN-output layer interpreted as the +# log(x) of some x (e.g. a stddev of a normal +# distribution). +MIN_LOG_NN_OUTPUT = -5 +MAX_LOG_NN_OUTPUT = 2 + + +@PublicAPI +@Deprecated( + help="RLlib itself has no use for this anymore.", + error=False, +) +def aligned_array(size: int, dtype, align: int = 64) -> np.ndarray: + """Returns an array of a given size that is 64-byte aligned. + + The returned array can be efficiently copied into GPU memory by TensorFlow. + + Args: + size: The size (total number of items) of the array. For example, + array([[0.0, 1.0], [2.0, 3.0]]) would have size=4. + dtype: The numpy dtype of the array. + align: The alignment to use. + + Returns: + A np.ndarray with the given specifications. + """ + n = size * dtype.itemsize + empty = np.empty(n + (align - 1), dtype=np.uint8) + data_align = empty.ctypes.data % align + offset = 0 if data_align == 0 else (align - data_align) + if n == 0: + # stop np from optimising out empty slice reference + output = empty[offset : offset + 1][0:0].view(dtype) + else: + output = empty[offset : offset + n].view(dtype) + + assert len(output) == size, len(output) + assert output.ctypes.data % align == 0, output.ctypes.data + return output + + +@PublicAPI +@Deprecated( + help="RLlib itself has no use for this anymore.", + error=False, +) +def concat_aligned( + items: List[np.ndarray], time_major: Optional[bool] = None +) -> np.ndarray: + """Concatenate arrays, ensuring the output is 64-byte aligned. + + We only align float arrays; other arrays are concatenated as normal. + + This should be used instead of np.concatenate() to improve performance + when the output array is likely to be fed into TensorFlow. + + Args: + items: The list of items to concatenate and align. + time_major: Whether the data in items is time-major, in which + case, we will concatenate along axis=1. + + Returns: + The concat'd and aligned array. 
+ """ + + if len(items) == 0: + return [] + elif len(items) == 1: + # we assume the input is aligned. In any case, it doesn't help + # performance to force align it since that incurs a needless copy. + return items[0] + elif isinstance(items[0], np.ndarray) and items[0].dtype in [ + np.float32, + np.float64, + np.uint8, + ]: + dtype = items[0].dtype + flat = aligned_array(sum(s.size for s in items), dtype) + if time_major is not None: + if time_major is True: + batch_dim = sum(s.shape[1] for s in items) + new_shape = (items[0].shape[0], batch_dim,) + items[ + 0 + ].shape[2:] + else: + batch_dim = sum(s.shape[0] for s in items) + new_shape = (batch_dim, items[0].shape[1],) + items[ + 0 + ].shape[2:] + else: + batch_dim = sum(s.shape[0] for s in items) + new_shape = (batch_dim,) + items[0].shape[1:] + output = flat.reshape(new_shape) + assert output.ctypes.data % 64 == 0, output.ctypes.data + np.concatenate(items, out=output, axis=1 if time_major else 0) + return output + else: + return np.concatenate(items, axis=1 if time_major else 0) + + +@PublicAPI +def convert_to_numpy(x: TensorStructType, reduce_type: bool = True) -> TensorStructType: + """Converts values in `stats` to non-Tensor numpy or python types. + + Args: + x: Any (possibly nested) struct, the values in which will be + converted and returned as a new struct with all torch/tf tensors + being converted to numpy types. + reduce_type: Whether to automatically reduce all float64 and int64 data + into float32 and int32 data, respectively. + + Returns: + A new struct with the same structure as `x`, but with all + values converted to numpy arrays (on CPU). + """ + + # The mapping function used to numpyize torch/tf Tensors (and move them + # to the CPU beforehand). 
+ def mapping(item): + if torch and isinstance(item, torch.Tensor): + ret = ( + item.cpu().item() + if len(item.size()) == 0 + else item.detach().cpu().numpy() + ) + elif ( + tf and isinstance(item, (tf.Tensor, tf.Variable)) and hasattr(item, "numpy") + ): + assert tf.executing_eagerly() + ret = item.numpy() + else: + ret = item + if reduce_type and isinstance(ret, np.ndarray): + if np.issubdtype(ret.dtype, np.floating): + ret = ret.astype(np.float32) + elif np.issubdtype(ret.dtype, int): + ret = ret.astype(np.int32) + return ret + return ret + + return tree.map_structure(mapping, x) + + +@PublicAPI +def fc( + x: np.ndarray, + weights: np.ndarray, + biases: Optional[np.ndarray] = None, + framework: Optional[str] = None, +) -> np.ndarray: + """Calculates FC (dense) layer outputs given weights/biases and input. + + Args: + x: The input to the dense layer. + weights: The weights matrix. + biases: The biases vector. All 0s if None. + framework: An optional framework hint (to figure out, + e.g. whether to transpose torch weight matrices). + + Returns: + The dense layer's output. + """ + + def map_(data, transpose=False): + if torch: + if isinstance(data, torch.Tensor): + data = data.cpu().detach().numpy() + if tf and tf.executing_eagerly(): + if isinstance(data, tf.Variable): + data = data.numpy() + if transpose: + data = np.transpose(data) + return data + + x = map_(x) + # Torch stores matrices in transpose (faster for backprop). + transpose = framework == "torch" and ( + x.shape[1] != weights.shape[0] and x.shape[1] == weights.shape[1] + ) + weights = map_(weights, transpose=transpose) + biases = map_(biases) + + return np.matmul(x, weights) + (0.0 if biases is None else biases) + + +@PublicAPI +def flatten_inputs_to_1d_tensor( + inputs: TensorStructType, + spaces_struct: Optional[SpaceStruct] = None, + time_axis: bool = False, + batch_axis: bool = True, +) -> TensorType: + """Flattens arbitrary input structs according to the given spaces struct. 
+ + Returns a single 1D tensor resulting from the different input + components' values. + + Thereby: + - Boxes (any shape) get flattened to (B, [T]?, -1). Note that image boxes + are not treated differently from other types of Boxes and get + flattened as well. + - Discrete (int) values are one-hot'd, e.g. a batch of [1, 0, 3] (B=3 with + Discrete(4) space) results in [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1]]. + - MultiDiscrete values are multi-one-hot'd, e.g. a batch of + [[0, 2], [1, 4]] (B=2 with MultiDiscrete([2, 5]) space) results in + [[1, 0, 0, 0, 1, 0, 0], [0, 1, 0, 0, 0, 0, 1]]. + + Args: + inputs: The inputs to be flattened. + spaces_struct: The (possibly nested) structure of the spaces that `inputs` + belongs to. + time_axis: Whether all inputs have a time-axis (after the batch axis). + If True, will keep not only the batch axis (0th), but the time axis + (1st) as-is and flatten everything from the 2nd axis up. + batch_axis: Whether all inputs have a batch axis. + If True, will keep that batch axis as-is and flatten everything from the + other dims up. + + Returns: + A single 1D tensor resulting from concatenating all + flattened/one-hot'd input components. Depending on the time_axis flag, + the shape is (B, n) or (B, T, n). + + .. testcode:: + :skipif: True + + # B=2 + from ray.rllib.utils.tf_utils import flatten_inputs_to_1d_tensor + from gymnasium.spaces import Discrete, Box + out = flatten_inputs_to_1d_tensor( + {"a": [1, 0], "b": [[[0.0], [0.1]], [1.0], [1.1]]}, + spaces_struct=dict(a=Discrete(2), b=Box(shape=(2, 1))) + ) + print(out) + + # B=2; T=2 + out = flatten_inputs_to_1d_tensor( + ([[1, 0], [0, 1]], + [[[0.0, 0.1], [1.0, 1.1]], [[2.0, 2.1], [3.0, 3.1]]]), + spaces_struct=tuple([Discrete(2), Box(shape=(2, ))]), + time_axis=True + ) + print(out) + + .. 
testoutput:: + + [[0.0, 1.0, 0.0, 0.1], [1.0, 0.0, 1.0, 1.1]] # B=2 n=4 + [[[0.0, 1.0, 0.0, 0.1], [1.0, 0.0, 1.0, 1.1]], + [[1.0, 0.0, 2.0, 2.1], [0.0, 1.0, 3.0, 3.1]]] # B=2 T=2 n=4 + """ + # `time_axis` must not be True if `batch_axis` is False. + assert not (time_axis and not batch_axis) + + flat_inputs = tree.flatten(inputs) + flat_spaces = ( + tree.flatten(spaces_struct) + if spaces_struct is not None + else [None] * len(flat_inputs) + ) + + B = None + T = None + out = [] + for input_, space in zip(flat_inputs, flat_spaces): + # Store batch and (if applicable) time dimension. + if B is None and batch_axis: + B = input_.shape[0] + if time_axis: + T = input_.shape[1] + + # One-hot encoding. + if isinstance(space, Discrete): + if time_axis: + input_ = np.reshape(input_, [B * T]) + out.append(one_hot(input_, depth=space.n).astype(np.float32)) + # Multi one-hot encoding. + elif isinstance(space, MultiDiscrete): + if time_axis: + input_ = np.reshape(input_, [B * T, -1]) + if batch_axis: + out.append( + np.concatenate( + [ + one_hot(input_[:, i], depth=n).astype(np.float32) + for i, n in enumerate(space.nvec) + ], + axis=-1, + ) + ) + else: + out.append( + np.concatenate( + [ + one_hot(input_[i], depth=n).astype(np.float32) + for i, n in enumerate(space.nvec) + ], + axis=-1, + ) + ) + # Box: Flatten. + else: + # Special case for spaces: Box(.., shape=(), ..) + if isinstance(input_, float): + input_ = np.array([input_]) + + if time_axis: + input_ = np.reshape(input_, [B * T, -1]) + elif batch_axis: + input_ = np.reshape(input_, [B, -1]) + else: + input_ = np.reshape(input_, [-1]) + out.append(input_.astype(np.float32)) + + merged = np.concatenate(out, axis=-1) + # Restore the time-dimension, if applicable. + if time_axis: + merged = np.reshape(merged, [B, T, -1]) + return merged + + +@PublicAPI +def make_action_immutable(obj): + """Flags actions immutable to notify users when trying to change them. 
@PublicAPI
def make_action_immutable(obj):
    """Flags actions immutable to notify users when trying to change them.

    Can also be used with any tree-like structure containing either
    dictionaries, numpy arrays or already immutable objects per se.
    Note, however, that `tree.map_structure()` will in general not
    include the shallow object containing all others and therefore
    immutability will hold only for all objects contained in it.
    Use `tree.traverse(fun, action, top_down=False)` to also include
    the containing object.

    Args:
        obj: The object to be made immutable.

    Returns:
        The immutable object.

    .. testcode::
        :skipif: True

        import tree
        import numpy as np
        from ray.rllib.utils.numpy import make_action_immutable
        arr = np.arange(1,10)
        d = dict(a = 1, b = (arr, arr))
        tree.traverse(make_action_immutable, d, top_down=False)
    """
    if isinstance(obj, np.ndarray):
        # Mark the array read-only in place (no copy is made).
        obj.setflags(write=False)
        return obj
    elif isinstance(obj, OrderedDict):
        # Convert to a plain dict first, then wrap in a read-only view.
        return MappingProxyType(dict(obj))
    elif isinstance(obj, dict):
        return MappingProxyType(obj)
    else:
        # Assumed to already be immutable (e.g. int, float, tuple).
        return obj


@PublicAPI
def huber_loss(x: np.ndarray, delta: float = 1.0) -> np.ndarray:
    """Computes the huber loss: quadratic near zero, linear in the tails.

    Reference: https://en.wikipedia.org/wiki/Huber_loss.

    Args:
        x: The input (error) values.
        delta: The threshold at which the loss switches from quadratic to
            linear behavior.

    Returns:
        The elementwise huber loss of `x`.
    """
    return np.where(
        np.abs(x) < delta, np.power(x, 2.0) * 0.5, delta * (np.abs(x) - 0.5 * delta)
    )


@PublicAPI
def l2_loss(x: np.ndarray) -> np.ndarray:
    """Computes half the L2 norm of a tensor (w/o the sqrt): sum(x**2) / 2.

    Args:
        x: The input tensor.

    Returns:
        The l2-loss output according to the above formula given `x`.
    """
    return np.sum(np.square(x)) / 2.0


@PublicAPI
def lstm(
    x,
    weights: np.ndarray,
    biases: Optional[np.ndarray] = None,
    initial_internal_states: Optional[np.ndarray] = None,
    time_major: bool = False,
    forget_bias: float = 1.0,
):
    """Calculates LSTM layer output given weights/biases, states, and input.

    Args:
        x: The inputs to the LSTM layer including time-rank
            (0th if time-major, else 1st) and the batch-rank
            (1st if time-major, else 0th).
        weights: The weights matrix.
        biases: The biases vector. All 0s if None.
        initial_internal_states: The initial internal
            states to pass into the layer. All 0s if None.
        time_major: Whether to use time-major or not. Default: False.
        forget_bias: Gets added to first sigmoid (forget gate) output.
            Default: 1.0.

    Returns:
        Tuple consisting of 1) The LSTM layer's output and
        2) Tuple: Last (c-state, h-state).
    """
    sequence_length = x.shape[0 if time_major else 1]
    batch_size = x.shape[1 if time_major else 0]
    # The weights matrix stacks all 4 gates along its 2nd axis
    # (3x sigmoid, 1x tanh).
    units = weights.shape[1] // 4

    # Fix: `biases=None` was documented as "all 0s", but the original code
    # crashed (`matmul(...) + None`). Substitute an actual zero vector.
    if biases is None:
        biases = np.zeros(shape=(weights.shape[1],))

    if initial_internal_states is None:
        c_states = np.zeros(shape=(batch_size, units))
        h_states = np.zeros(shape=(batch_size, units))
    else:
        c_states = initial_internal_states[0]
        h_states = initial_internal_states[1]

    # Create a placeholder for all n-time step outputs.
    if time_major:
        unrolled_outputs = np.zeros(shape=(sequence_length, batch_size, units))
    else:
        unrolled_outputs = np.zeros(shape=(batch_size, sequence_length, units))

    # Push the batch through the LSTM cell one timestep at a time and capture
    # the outputs plus the final h- and c-states.
    for t in range(sequence_length):
        input_matrix = x[t, :, :] if time_major else x[:, t, :]
        input_matrix = np.concatenate((input_matrix, h_states), axis=1)
        input_matmul_matrix = np.matmul(input_matrix, weights) + biases
        # Forget gate (3rd slot in tf output matrix). Add static forget bias.
        sigmoid_1 = sigmoid(input_matmul_matrix[:, units * 2 : units * 3] + forget_bias)
        c_states = np.multiply(c_states, sigmoid_1)
        # Add gate (1st and 2nd slots in tf output matrix).
        sigmoid_2 = sigmoid(input_matmul_matrix[:, 0:units])
        tanh_3 = np.tanh(input_matmul_matrix[:, units : units * 2])
        c_states = np.add(c_states, np.multiply(sigmoid_2, tanh_3))
        # Output gate (last slot in tf output matrix).
        sigmoid_4 = sigmoid(input_matmul_matrix[:, units * 3 : units * 4])
        h_states = np.multiply(sigmoid_4, np.tanh(c_states))

        # Store this output time-slice.
        if time_major:
            unrolled_outputs[t, :, :] = h_states
        else:
            unrolled_outputs[:, t, :] = h_states

    return unrolled_outputs, (c_states, h_states)


@PublicAPI
def one_hot(
    x: Union[TensorType, int],
    depth: int = 0,
    on_value: float = 1.0,
    off_value: float = 0.0,
    dtype: type = np.float32,
) -> np.ndarray:
    """One-hot utility function for numpy.

    Thanks to qianyizhang:
    https://gist.github.com/qianyizhang/07ee1c15cad08afb03f5de69349efc30.

    Args:
        x: The input to be one-hot encoded.
        depth: The max. number to be one-hot encoded (size of last rank).
            If 0, infer the depth from the values in `x` (max(x) + 1).
        on_value: The value to use for on. Default: 1.0.
        off_value: The value to use for off. Default: 0.0.
        dtype: The numpy dtype of the returned array. Default: np.float32.

    Returns:
        The one-hot encoded equivalent of the input array.
    """
    # Handle simple ints properly.
    if isinstance(x, int):
        x = np.array(x, dtype=np.int32)
    # Handle torch arrays properly.
    elif torch and isinstance(x, torch.Tensor):
        x = x.numpy()

    # Handle bool arrays correctly: treat as binary (depth=2).
    if x.dtype == np.bool_:
        x = x.astype(np.int_)
        depth = 2

    # If depth is not given, try to infer it from the values in the array.
    if depth == 0:
        depth = np.max(x) + 1
    assert (
        np.max(x) < depth
    ), "ERROR: The max. index of `x` ({}) is larger than depth ({})!".format(
        np.max(x), depth
    )
    shape = x.shape

    # Start with an all-`off_value` output, then scatter `on_value` into the
    # positions given by the (tiled) index grids plus `x` as the last index.
    out = np.ones(shape=(*shape, depth)) * off_value
    indices = []
    for i in range(x.ndim):
        tiles = [1] * x.ndim
        s = [1] * x.ndim
        s[i] = -1
        r = np.arange(shape[i]).reshape(s)
        if i > 0:
            tiles[i - 1] = shape[i - 1]
            r = np.tile(r, tiles)
        indices.append(r)
    indices.append(x)
    out[tuple(indices)] = on_value
    return out.astype(dtype)


@PublicAPI
def one_hot_multidiscrete(x, depths: Optional[List[int]] = None):
    """Multi-one-hot encodes a (batch of) MultiDiscrete sample(s).

    Args:
        x: The input to be encoded: either a single MultiDiscrete sample
            (1D) or a batch thereof (2D, batch axis first).
        depths: The number of categories per MultiDiscrete dimension
            (e.g. `space.nvec`). Must be provided by the caller.

    Returns:
        The concatenation (along the last axis) of the one-hot encodings of
        each MultiDiscrete dimension, as float32.
    """
    # Fix: the original signature was `depths=List[int]`, accidentally using
    # the typing alias as the default *value* instead of an annotation.
    if depths is None:
        depths = []

    # Handle torch arrays properly.
    if torch and isinstance(x, torch.Tensor):
        x = x.numpy()

    shape = x.shape
    return np.concatenate(
        [
            one_hot(x[i] if len(shape) == 1 else x[:, i], depth=n).astype(np.float32)
            for i, n in enumerate(depths)
        ],
        axis=-1,
    )


@PublicAPI
def relu(x: np.ndarray, alpha: float = 0.0) -> np.ndarray:
    """Implementation of the leaky ReLU function.

    y = x * alpha if x < 0 else x

    Args:
        x: The input values.
        alpha: A scaling ("leak") factor to use for negative x.

    Returns:
        The leaky ReLU output for x.
    """
    # Fix: the original passed `x` as the third (``out``) argument of
    # `np.maximum`, silently mutating the caller's array in place.
    return np.maximum(x, x * alpha)


@PublicAPI
def sigmoid(x: np.ndarray, derivative: bool = False) -> np.ndarray:
    """Returns the sigmoid function applied to x.

    Alternatively, can return the derivative of the sigmoid function (in
    which case `x` is expected to be sigmoid(input) already).

    Args:
        x: The input to the sigmoid function.
        derivative: Whether to return the derivative or not.
            Default: False.

    Returns:
        The sigmoid function (or its derivative) applied to x.
    """
    if derivative:
        return x * (1 - x)
    else:
        return 1 / (1 + np.exp(-x))


@PublicAPI
def softmax(
    x: Union[np.ndarray, list], axis: int = -1, epsilon: Optional[float] = None
) -> np.ndarray:
    """Returns the softmax values for x.

    The exact formula used is:
    S(xi) = e^xi / SUMj(e^xj), where j goes over all elements in x.

    Args:
        x: The input to the softmax function.
        axis: The axis along which to softmax.
        epsilon: Optional epsilon as a minimum value. If None, use
            `SMALL_NUMBER`.

    Returns:
        The softmax over x.
    """
    epsilon = epsilon or SMALL_NUMBER
    x_exp = np.exp(x)
    # Clamp from below with `epsilon` to avoid exact zeros in the output.
    return np.maximum(x_exp / np.sum(x_exp, axis, keepdims=True), epsilon)
# ============================================================================
# ray/rllib/utils/postprocessing/episodes.py
# ============================================================================
from typing import List, Tuple

import numpy as np

from ray.rllib.env.single_agent_episode import SingleAgentEpisode
from ray.util.annotations import DeveloperAPI


@DeveloperAPI
def add_one_ts_to_episodes_and_truncate(episodes: List[SingleAgentEpisode]):
    """Adds an artificial timestep to an episode at the end.

    In detail: The last observations, infos, actions, and all
    `extra_model_outputs` will be duplicated and appended to each episode's
    data. An extra 0.0 reward will be appended to the episode's rewards. The
    episode's timestep will be increased by 1. Also, adds the truncated=True
    flag to each episode if the episode is not already done (terminated or
    truncated).

    Useful for value function bootstrapping, where it is required to compute a
    forward pass for the very last timestep within the episode,
    i.e. using the following input dict: {
        obs=[final obs],
        state=[final state output],
        prev. reward=[final reward],
        etc..
    }

    Args:
        episodes: The list of SingleAgentEpisode objects to extend by one
            timestep and add a truncation flag if necessary.

    Returns:
        A list of the original episodes' truncated values (so the episodes can
        be properly restored later into their original states).
    """
    orig_truncateds = []
    for episode in episodes:
        orig_truncateds.append(episode.is_truncated)

        # Add timestep.
        episode.t += 1
        # Use the episode API that allows appending (possibly complex) structs
        # to the data.
        episode.observations.append(episode.observations[-1])
        episode.infos.append(episode.infos[-1])
        episode.actions.append(episode.actions[-1])
        episode.rewards.append(0.0)
        for v in episode.extra_model_outputs.values():
            v.append(v[-1])
        # Artificially make this episode truncated for the upcoming GAE
        # computations.
        if not episode.is_done:
            episode.is_truncated = True
        # Validate to make sure everything is in order.
        episode.validate()

    return orig_truncateds


@DeveloperAPI
def remove_last_ts_from_data(
    episode_lens: List[int],
    *data: np.ndarray,
) -> Tuple[np.ndarray, ...]:
    """Removes the last timesteps from each given data item.

    Each item in data is a concatenated sequence of episodes data.
    For example if `episode_lens` is [2, 4], then data is a shape=(6,)
    ndarray. The returned corresponding value will have shape (4,), meaning
    both episodes have been shortened by exactly one timestep to 1 and 3.

    ..testcode::

        from ray.rllib.utils.postprocessing.episodes import (
            remove_last_ts_from_data
        )
        import numpy as np

        unpadded = remove_last_ts_from_data(
            [5, 3],
            np.array([0, 1, 2, 3, 4, 0, 1, 2]),
        )
        assert (unpadded == [0, 1, 2, 3, 0, 1]).all()

        unpadded = remove_last_ts_from_data(
            [4, 2, 3],
            np.array([0, 1, 2, 3, 0, 1, 0, 1, 2]),
            np.array([4, 5, 6, 7, 2, 3, 3, 4, 5]),
        )
        assert (unpadded[0] == [0, 1, 2, 0, 0, 1]).all()
        assert (unpadded[1] == [4, 5, 6, 2, 3, 4]).all()

    Args:
        episode_lens: A list of current episode lengths. The returned data
            will have the same lengths minus 1 timestep.
        data: A tuple of data items (np.ndarrays) representing concatenated
            episodes to be shortened by one timestep per episode.
            Note that only arrays with `shape=(n,)` are supported! The
            returned data will have `shape=(n-len(episode_lens),)` (each
            episode gets shortened by one timestep).

    Returns:
        A tuple of new data items shortened by one timestep (or the single
        shortened item if only one data item was passed in).
    """
    # Figure out the new slices to apply to each data item based on
    # the given episode_lens. (Local renamed from `sum`, which shadowed the
    # builtin.)
    slices = []
    offset = 0
    for len_ in episode_lens:
        slices.append(slice(offset, offset + len_ - 1))
        offset += len_
    # Compile return data by slicing off one timestep at the end of
    # each episode.
    ret = []
    for d in data:
        ret.append(np.concatenate([d[s] for s in slices]))
    return tuple(ret) if len(ret) > 1 else ret[0]


@DeveloperAPI
def remove_last_ts_from_episodes_and_restore_truncateds(
    episodes: List[SingleAgentEpisode],
    orig_truncateds: List[bool],
) -> None:
    """Reverts the effects of `add_one_ts_to_episodes_and_truncate`.

    Args:
        episodes: The list of SingleAgentEpisode objects to shorten by one
            timestep (removing the artificially added data again).
        orig_truncateds: A list of the original episodes' truncated values to
            be applied to the `episodes`.
    """
    # Fix all episodes.
    for episode, orig_truncated in zip(episodes, orig_truncateds):
        # Reduce timesteps by 1.
        episode.t -= 1
        # Remove all extra timestep data from the episode's buffers.
        episode.observations.pop()
        episode.infos.pop()
        episode.actions.pop()
        episode.rewards.pop()
        for v in episode.extra_model_outputs.values():
            v.pop()
        # Fix the truncateds flag again.
        episode.is_truncated = orig_truncated


# ============================================================================
# ray/rllib/utils/postprocessing/value_predictions.py
# ============================================================================
import numpy as np

from ray.util.annotations import DeveloperAPI


@DeveloperAPI
def compute_value_targets(
    values,
    rewards,
    terminateds,
    truncateds,
    gamma: float,
    lambda_: float,
):
    """Computes value function (vf) targets given vf predictions and rewards.

    Note that advantages can then easily be computed via the formula:
    advantages = targets - vf_predictions

    Args:
        values: The vf predictions, one per timestep.
        rewards: The rewards, one per timestep.
        terminateds: Per-timestep 0/1 flags: was the episode terminated here?
        truncateds: Per-timestep 0/1 flags: was the episode truncated here?
        gamma: The discount factor.
        lambda_: The lambda mixing parameter.

    Returns:
        The float32 value targets, one per timestep.
    """
    # Force-set all values at terminals (not at truncations!) to 0.0.
    orig_values = flat_values = values * (1.0 - terminateds)

    flat_values = np.append(flat_values, 0.0)
    intermediates = rewards + gamma * (1 - lambda_) * flat_values[1:]
    continues = 1.0 - terminateds

    # Accumulate targets backwards through time.
    Rs = []
    last = flat_values[-1]
    for t in reversed(range(intermediates.shape[0])):
        last = intermediates[t] + continues[t] * gamma * lambda_ * last
        Rs.append(last)
        # At a truncation boundary, restart the accumulation from the
        # (unzeroed) value prediction at that timestep.
        if truncateds[t]:
            last = orig_values[t]

    # Reverse back to correct (time) direction.
    value_targets = np.stack(list(reversed(Rs)), axis=0)

    return value_targets.astype(np.float32)


def extract_bootstrapped_values(vf_preds, episode_lengths, T):
    """Returns a bootstrapped value batch given value predictions.

    Note that the incoming value predictions must have happened over
    (artificially) elongated episodes (by 1 timestep at the end). This way, we
    can either extract the `vf_preds` at these extra timesteps (as "bootstrap
    values") or skip over them entirely if they lie in the middle of the
    T-slices.

    For example, given an episodes structure like this:
    01234a 0123456b 01c 012- 0123e 012-
    where each episode is separated by a space and goes from 0 to n and ends in
    an artificially elongated timestep (denoted by 'a', 'b', 'c', '-', or 'e'),
    where '-' means that the episode was terminated and the bootstrap value at
    the end should be zero and 'a', 'b', 'c', etc.. represent truncated episode
    ends with computed vf estimates.
    The output for the above sequence (and T=4) should then be:
    4 3 b 2 3 -

    Args:
        vf_preds: The computed value function predictions over the artificially
            elongated episodes (by one timestep at the end).
        episode_lengths: The original (correct) episode lengths, NOT counting
            the artificially added timestep at the end.
        T: The size of the time dimension by which to slice the data. Note that
            the sum of all episode lengths (`sum(episode_lengths)`) must be
            divisible by T.

    Returns:
        The batch of bootstrapped values.
    """
    bootstrapped_values = []
    if sum(episode_lengths) % T != 0:
        raise ValueError(
            "Can only extract bootstrapped values if the sum of episode "
            f"lengths ({sum(episode_lengths)}) is divisible by the given T "
            f"({T})!"
        )

    # Loop over all episode lengths and collect bootstrap values.
    # Do not alter incoming `episode_lengths` list.
    episode_lengths = episode_lengths[:]
    i = -1
    while i < len(episode_lengths) - 1:
        i += 1
        eps_len = episode_lengths[i]
        # We can make another T-stride inside this episode ->
        # - Use a vf prediction within the episode as bootstrapped value.
        # - "Fix" the episode_lengths array and continue within the same
        #   episode.
        if T < eps_len:
            bootstrapped_values.append(vf_preds[T])
            vf_preds = vf_preds[T:]
            episode_lengths[i] -= T
            i -= 1
        # We can make another T-stride inside this episode, but will then be at
        # the end of it ->
        # - Use the value function prediction at the artificially added
        #   timestep as bootstrapped value.
        # - Skip the additional timestep at the end and move on with the next
        #   episode.
        elif T == eps_len:
            bootstrapped_values.append(vf_preds[T])
            vf_preds = vf_preds[T + 1 :]
        # The episode fits entirely into the T-stride ->
        # - Move on to the next episode ("fix" its length by making it
        #   seemingly longer).
        else:
            # Skip bootstrap value of current episode (not needed).
            vf_preds = vf_preds[1:]
            # Make next episode seem longer.
            episode_lengths[i + 1] += eps_len

    return np.array(bootstrapped_values)


# ============================================================================
# ray/rllib/utils/postprocessing/zero_padding.py
# ============================================================================
from collections import deque
from typing import List, Tuple, Union

import numpy as np
import tree  # pip install dm_tree

from ray.rllib.utils.spaces.space_utils import batch, BatchedNdArray
from ray.util.annotations import DeveloperAPI


@DeveloperAPI
def create_mask_and_seq_lens(episode_len: int, T: int) -> Tuple[List, List]:
    """Creates loss mask and a seq_lens array, given an episode length and T.

    Args:
        episode_len: The episode length to infer the loss mask and seq_lens
            array from.
        T: The maximum number of timesteps in each "row", also known as the
            maximum sequence length (max_seq_len). Episodes are split into
            chunks that are at most `T` long and remaining timesteps will be
            zero-padded (and masked out).

    Returns:
        Tuple consisting of a) list of the loss masks to use (masking out
        areas that are past the end of an episode (or rollout), but had to be
        zero-added due to the added extra time rank (of length T) and b) the
        list of sequence lengths resulting from splitting the given episode
        into chunks of at most `T` timesteps.
    """
    mask = []
    seq_lens = []

    # First chunk: at most T timesteps (possibly fewer, then padded).
    len_ = min(episode_len, T)
    seq_lens.append(len_)
    row = np.array([1] * len_ + [0] * (T - len_), np.bool_)
    mask.append(row)

    # Handle sequence lengths greater than T.
    overflow = episode_len - T
    while overflow > 0:
        len_ = min(overflow, T)
        seq_lens.append(len_)
        extra_row = np.array([1] * len_ + [0] * (T - len_), np.bool_)
        mask.append(extra_row)
        overflow -= T

    return mask, seq_lens


@DeveloperAPI
def split_and_zero_pad(
    item_list: List[Union[BatchedNdArray, np.ndarray, float]],
    max_seq_len: int,
) -> List[np.ndarray]:
    """Splits the contents of `item_list` into a new list of ndarrays.

    In the returned list, each item is one ndarray of len (axis=0)
    `max_seq_len`. The last item in the returned list may be (right)
    zero-padded, if necessary, to reach `max_seq_len`.

    If `item_list` contains one or more `BatchedNdArray` (instead of
    individual items), these will be split accordingly along their axis=0 to
    yield the returned structure described above.

    .. testcode::

        from ray.rllib.utils.postprocessing.zero_padding import (
            BatchedNdArray,
            split_and_zero_pad,
        )
        from ray.rllib.utils.test_utils import check

        # Simple case: `item_list` contains individual floats.
        check(
            split_and_zero_pad([0, 1, 2, 3, 4, 5, 6, 7], 5),
            [[0, 1, 2, 3, 4], [5, 6, 7, 0, 0]],
        )

        # `item_list` contains BatchedNdArray (ndarrays that explicitly
        # declare they have a batch axis=0).
        check(
            split_and_zero_pad([
                BatchedNdArray([0, 1]),
                BatchedNdArray([2, 3, 4, 5]),
                BatchedNdArray([6, 7, 8]),
            ], 5),
            [[0, 1, 2, 3, 4], [5, 6, 7, 8, 0]],
        )

    Args:
        item_list: A list of individual items or BatchedNdArrays to be split
            into `max_seq_len` long pieces (the last of which may be
            zero-padded).
        max_seq_len: The maximum length of each item in the returned list.

    Returns:
        A list of np.ndarrays (all of length `max_seq_len`), which contains
        the same data as `item_list`, but split into sub-chunks of size
        `max_seq_len`. The last item in the returned list may be zero-padded,
        if necessary.
    """
    # The all-zeros element used for right-padding, shaped like one item.
    zero_element = tree.map_structure(
        lambda s: np.zeros_like([s[0]] if isinstance(s, BatchedNdArray) else s),
        item_list[0],
    )

    # The replacement list (to be returned) for `items_list`.
    # Items list contains n individual items.
    # -> ret will contain m batched rows, where m == n // T and the last row
    # may be zero padded (until T).
    ret = []

    # List of the T-axis item, collected to form the next row.
    current_time_row = []
    current_t = 0

    item_list = deque(item_list)
    while len(item_list) > 0:
        item = item_list.popleft()
        # `item` is already a batched np.array: Split if necessary.
        if isinstance(item, BatchedNdArray):
            t = max_seq_len - current_t
            current_time_row.append(item[:t])
            if len(item) <= t:
                current_t += len(item)
            else:
                # Push the unconsumed remainder back for the next row.
                current_t += t
                item_list.appendleft(item[t:])
        # `item` is a single item (no batch axis): Append and continue with
        # next item.
        else:
            current_time_row.append(item)
            current_t += 1

        # `current_time_row` is "full" (max_seq_len): Append as ndarray (with
        # batch axis) to `ret`.
        if current_t == max_seq_len:
            ret.append(
                batch(
                    current_time_row,
                    individual_items_already_have_batch_dim="auto",
                )
            )
            current_time_row = []
            current_t = 0

    # `current_time_row` is unfinished: Pad, if necessary and append to `ret`.
    if current_t > 0 and current_t < max_seq_len:
        current_time_row.extend([zero_element] * (max_seq_len - current_t))
        ret.append(
            batch(current_time_row, individual_items_already_have_batch_dim="auto")
        )

    return ret


@DeveloperAPI
def split_and_zero_pad_n_episodes(nd_array, episode_lens, max_seq_len):
    """Splits and zero-pads the data of several concatenated episodes.

    Args:
        nd_array: The data of all episodes, concatenated along axis 0.
        episode_lens: The lengths of the individual episodes in `nd_array`.
        max_seq_len: The maximum sequence length to split into / pad to.

    Returns:
        A list of np.ndarrays (each of length `max_seq_len`); per episode, the
        last chunk may be zero-padded.
    """
    ret = []
    cursor = 0
    for episode_len in episode_lens:
        # Wrap each episode's slice so `split_and_zero_pad` treats it as
        # batched data (split along axis 0) rather than a single item.
        items = BatchedNdArray(nd_array[cursor : cursor + episode_len])
        ret.extend(split_and_zero_pad([items], max_seq_len))
        cursor += episode_len

    return ret


@DeveloperAPI
def unpad_data_if_necessary(episode_lens, data):
    """Removes right-side zero-padding from data based on `episode_lens`.

    ..testcode::

        from ray.rllib.utils.postprocessing.zero_padding import (
            unpad_data_if_necessary
        )
        import numpy as np

        unpadded = unpad_data_if_necessary(
            episode_lens=[4, 2],
            data=np.array([
                [2, 4, 5, 3, 0, 0, 0, 0],
                [-1, 3, 0, 0, 0, 0, 0, 0],
            ]),
        )
        assert (unpadded == [2, 4, 5, 3, -1, 3]).all()

        unpadded = unpad_data_if_necessary(
            episode_lens=[1, 5],
            data=np.array([
                [2, 0, 0, 0, 0],
                [-1, -2, -3, -4, -5],
            ]),
        )
        assert (unpadded == [2, -1, -2, -3, -4, -5]).all()

    Args:
        episode_lens: A list of actual episode lengths.
        data: A 2D np.ndarray with right-side zero-padded rows.

    Returns:
        A 1D np.ndarray resulting from concatenation of the un-padded
        input data along the 0-axis.
    """
    # If data does NOT have a time dimension, return right away.
    if len(data.shape) == 1:
        return data

    # Assert we only have B and T dimensions (meaning this function only
    # operates on single-float data, such as value function predictions,
    # advantages, or rewards).
    assert len(data.shape) == 2

    new_data = []
    row_idx = 0

    T = data.shape[1]
    for len_ in episode_lens:
        # Calculate how many full rows this array occupies and how many
        # elements are in the last, potentially partial row.
        num_rows, col_idx = divmod(len_, T)

        # If the array spans multiple full rows, fully include these rows.
        for i in range(num_rows):
            new_data.append(data[row_idx])
            row_idx += 1

        # If there are elements in the last, potentially partial row, add this
        # partial row as well.
        if col_idx > 0:
            new_data.append(data[row_idx, :col_idx])

            # Move to the next row for the next array (skip the zero-padding
            # zone).
            row_idx += 1

    return np.concatenate(new_data)
# ============================================================================
# ray/rllib/utils/pre_checks/env.py
# ============================================================================
"""Common pre-checks for all RLlib experiments."""
import logging
from typing import TYPE_CHECKING, Set

import gymnasium as gym
import numpy as np
import tree  # pip install dm_tree

from ray.rllib.utils.annotations import DeveloperAPI
from ray.rllib.utils.error import ERR_MSG_OLD_GYM_API, UnsupportedSpaceException
from ray.rllib.utils.spaces.space_utils import get_base_struct_from_space
from ray.util import log_once

if TYPE_CHECKING:
    from ray.rllib.env import MultiAgentEnv

logger = logging.getLogger(__name__)


@DeveloperAPI
def check_multiagent_environments(env: "MultiAgentEnv") -> None:
    """Checking for common errors in RLlib MultiAgentEnvs.

    Performs a reset() and a single step() with sampled actions and validates
    the returned observations, rewards, done/truncated flags, and infos.

    Args:
        env: The env to be checked.

    Raises:
        ValueError: If the env is not a MultiAgentEnv or any of its returned
            values do not conform to the MultiAgentEnv API.
    """
    from ray.rllib.env import MultiAgentEnv

    if not isinstance(env, MultiAgentEnv):
        raise ValueError("The passed env is not a MultiAgentEnv.")
    elif not (
        hasattr(env, "observation_space")
        and hasattr(env, "action_space")
        and hasattr(env, "_agent_ids")
    ):
        if log_once("ma_env_super_ctor_called"):
            logger.warning(
                f"Your MultiAgentEnv {env} does not have some or all of the needed "
                "base-class attributes! Make sure you call `super().__init__()` from "
                "within your MutiAgentEnv's constructor. "
                "This will raise an error in the future."
            )
        return

    try:
        obs_and_infos = env.reset(seed=42, options={})
    except Exception as e:
        raise ValueError(
            ERR_MSG_OLD_GYM_API.format(
                env, "In particular, the `reset()` method seems to be faulty."
            )
        ) from e
    reset_obs, reset_infos = obs_and_infos

    _check_if_element_multi_agent_dict(env, reset_obs, "reset()")

    # Sample one action per agent that stepped in the reset obs.
    sampled_action = {
        aid: env.get_action_space(aid).sample() for aid in reset_obs.keys()
    }
    _check_if_element_multi_agent_dict(
        env, sampled_action, "get_action_space(agent_id=..).sample()"
    )

    try:
        results = env.step(sampled_action)
    except Exception as e:
        raise ValueError(
            ERR_MSG_OLD_GYM_API.format(
                env, "In particular, the `step()` method seems to be faulty."
            )
        ) from e
    next_obs, reward, done, truncated, info = results

    _check_if_element_multi_agent_dict(env, next_obs, "step, next_obs")
    _check_if_element_multi_agent_dict(env, reward, "step, reward")
    _check_if_element_multi_agent_dict(env, done, "step, done")
    _check_if_element_multi_agent_dict(env, truncated, "step, truncated")
    _check_if_element_multi_agent_dict(env, info, "step, info", allow_common=True)
    _check_reward({"dummy_env_id": reward}, base_env=True, agent_ids=env.agents)
    _check_done_and_truncated(
        {"dummy_env_id": done},
        {"dummy_env_id": truncated},
        base_env=True,
        agent_ids=env.agents,
    )
    _check_info({"dummy_env_id": info}, base_env=True, agent_ids=env.agents)


def _check_reward(reward, base_env=False, agent_ids=None):
    # Validate that rewards are scalar ints/floats (not bools) and that, for
    # base envs, all agent ids are known to the env.
    if base_env:
        for _, multi_agent_dict in reward.items():
            for agent_id, rew in multi_agent_dict.items():
                if not (
                    np.isreal(rew)
                    and not isinstance(rew, bool)
                    and (
                        np.isscalar(rew)
                        or (isinstance(rew, np.ndarray) and rew.shape == ())
                    )
                ):
                    error = (
                        "Your step function must return rewards that are"
                        f" integer or float. reward: {rew}. Instead it was a "
                        f"{type(rew)}"
                    )
                    raise ValueError(error)
                if not (agent_id in agent_ids or agent_id == "__all__"):
                    error = (
                        f"Your reward dictionary must have agent ids that belong to "
                        f"the environment. AgentIDs received from "
                        f"env.agents are: {agent_ids}"
                    )
                    raise ValueError(error)
    elif not (
        np.isreal(reward)
        and not isinstance(reward, bool)
        and (
            np.isscalar(reward)
            or (isinstance(reward, np.ndarray) and reward.shape == ())
        )
    ):
        error = (
            "Your step function must return a reward that is integer or float. "
            "Instead it was a {}".format(type(reward))
        )
        raise ValueError(error)


def _check_done_and_truncated(done, truncated, base_env=False, agent_ids=None):
    # Validate that done/truncated flags are booleans and that, for base envs,
    # all agent ids are known to the env.
    for what in ["done", "truncated"]:
        data = done if what == "done" else truncated
        if base_env:
            for _, multi_agent_dict in data.items():
                for agent_id, done_ in multi_agent_dict.items():
                    if not isinstance(done_, (bool, np.bool_)):
                        # Fix: report the offending value's type (`done_`),
                        # not the type of the whole dict (`data`).
                        raise ValueError(
                            f"Your step function must return `{what}s` that are "
                            f"boolean. But instead was a {type(done_)}"
                        )
                    if not (agent_id in agent_ids or agent_id == "__all__"):
                        error = (
                            f"Your `{what}s` dictionary must have agent ids that "
                            f"belong to the environment. AgentIDs received from "
                            f"env.agents are: {agent_ids}"
                        )
                        raise ValueError(error)
        elif not isinstance(data, (bool, np.bool_)):
            error = (
                f"Your step function must return a `{what}` that is a boolean. But "
                f"instead was a {type(data)}"
            )
            raise ValueError(error)


def _check_info(info, base_env=False, agent_ids=None):
    # Validate that infos are dicts and that, for base envs, all agent ids are
    # known to the env (the special "__common__" key is also allowed).
    if base_env:
        for _, multi_agent_dict in info.items():
            for agent_id, inf in multi_agent_dict.items():
                if not isinstance(inf, dict):
                    raise ValueError(
                        "Your step function must return infos that are a dict. "
                        f"instead was a {type(inf)}: element: {inf}"
                    )
                if not (
                    agent_id in agent_ids
                    or agent_id == "__all__"
                    or agent_id == "__common__"
                ):
                    # Fix: this message wrongly said "dones dictionary"
                    # (copy-paste from the done check).
                    error = (
                        f"Your infos dictionary must have agent ids that belong to "
                        f"the environment. AgentIDs received from "
                        f"env.agents are: {agent_ids}"
                    )
                    raise ValueError(error)
    elif not isinstance(info, dict):
        error = (
            "Your step function must return a info that "
            f"is a dict. element type: {type(info)}. element: {info}"
        )
        raise ValueError(error)


def _not_contained_error(func_name, _type):
    # Fix: the original concatenated f-string fragments without separating
    # spaces ("typemismatch", "ofnp.float64", "wasout of bounds").
    _error = (
        f"The {_type} collected from {func_name} was not contained within"
        f" your env's {_type} space. It's possible that there was a type "
        f"mismatch (for example {_type}s of np.float32 and a space of "
        f"np.float64 {_type}s), or that one of the sub-{_type}s was "
        f"out of bounds"
    )
    return _error


def _check_if_element_multi_agent_dict(
    env,
    element,
    function_string,
    base_env=False,
    allow_common=False,
):
    # Validate that `element` is a MultiAgentDict whose keys are agent ids of
    # `env` (plus "__all__" and, if `allow_common`, "__common__").
    if not isinstance(element, dict):
        if base_env:
            error = (
                f"The element returned by {function_string} contains values "
                f"that are not MultiAgentDicts. Instead, they are of "
                f"type: {type(element)}"
            )
        else:
            error = (
                f"The element returned by {function_string} is not a "
                f"MultiAgentDict. Instead, it is of type: "
                f" {type(element)}"
            )
        raise ValueError(error)
    agent_ids: Set = set(env.agents)
    agent_ids.add("__all__")
    if allow_common:
        agent_ids.add("__common__")

    if not all(k in agent_ids for k in element):
        if base_env:
            error = (
                f"The element returned by {function_string} has agent_ids"
                f" that are not the names of the agents in the env."
                f"agent_ids in this\nMultiEnvDict:"
                f" {list(element.keys())}\nAgentIDs in this env: "
                f"{env.agents}"
            )
        else:
            error = (
                f"The element returned by {function_string} has agent_ids"
                f" that are not the names of the agents in the env. "
                f"\nAgentIDs in this MultiAgentDict: "
                f"{list(element.keys())}\nAgentIDs in this env: "
                f"{env.agents}. You likely need to add the attribute `agents` to your "
                f"env, which is a list containing the IDs of agents currently in your "
                f"env/episode, as well as, `possible_agents`, which is a list of all "
                f"possible agents that could ever show up in your env."
            )
        raise ValueError(error)
You likely need to add the attribute `agents` to your " + f"env, which is a list containing the IDs of agents currently in your " + f"env/episode, as well as, `possible_agents`, which is a list of all " + f"possible agents that could ever show up in your env." + ) + raise ValueError(error) + + +def _find_offending_sub_space(space, value): + """Returns error, value, and space when offending `space.contains(value)` fails. + + Returns only the offending sub-value/sub-space in case `space` is a complex Tuple + or Dict space. + + Args: + space: The gym.Space to check. + value: The actual (numpy) value to check for matching `space`. + + Returns: + Tuple consisting of 1) key-sequence of the offending sub-space or the empty + string if `space` is not complex (Tuple or Dict), 2) the offending sub-space, + 3) the offending sub-space's dtype, 4) the offending sub-value, 5) the offending + sub-value's dtype. + + .. testcode:: + :skipif: True + + path, space, space_dtype, value, value_dtype = _find_offending_sub_space( + gym.spaces.Dict({ + -2.0, 1.5, (2, ), np.int8), np.array([-1.5, 3.0]) + ) + + """ + if not isinstance(space, (gym.spaces.Dict, gym.spaces.Tuple)): + return None, space, space.dtype, value, _get_type(value) + + structured_space = get_base_struct_from_space(space) + + def map_fn(p, s, v): + if not s.contains(v): + raise UnsupportedSpaceException((p, s, v)) + + try: + tree.map_structure_with_path(map_fn, structured_space, value) + except UnsupportedSpaceException as e: + space, value = e.args[0][1], e.args[0][2] + return "->".join(e.args[0][0]), space, space.dtype, value, _get_type(value) + + # This is actually an error. 
+ return None, None, None, None, None + + +def _get_type(var): + return var.dtype if hasattr(var, "dtype") else type(var) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/utils/serialization.py b/.venv/lib/python3.11/site-packages/ray/rllib/utils/serialization.py new file mode 100644 index 0000000000000000000000000000000000000000..30eb1aacc5d466f623d8efbeaaac64dec5787f9a --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/utils/serialization.py @@ -0,0 +1,418 @@ +import base64 +from collections import OrderedDict +import importlib +import io +import zlib +from typing import Any, Dict, Optional, Sequence, Type, Union + +import gymnasium as gym +import numpy as np + +import ray +from ray.rllib.utils.annotations import DeveloperAPI +from ray.rllib.utils.error import NotSerializable +from ray.rllib.utils.spaces.flexdict import FlexDict +from ray.rllib.utils.spaces.repeated import Repeated +from ray.rllib.utils.spaces.simplex import Simplex + +NOT_SERIALIZABLE = "__not_serializable__" + + +@DeveloperAPI +def convert_numpy_to_python_primitives(obj: Any): + """Convert an object that is a numpy type to a python type. + + If the object is not a numpy type, it is returned unchanged. + + Args: + obj: The object to convert. + """ + if isinstance(obj, np.integer): + return int(obj) + elif isinstance(obj, np.floating): + return float(obj) + elif isinstance(obj, np.bool_): + return bool(obj) + elif isinstance(obj, np.str_): + return str(obj) + elif isinstance(obj, np.ndarray): + ret = obj.tolist() + for i, v in enumerate(ret): + ret[i] = convert_numpy_to_python_primitives(v) + return ret + else: + return obj + + +def _serialize_ndarray(array: np.ndarray) -> str: + """Pack numpy ndarray into Base64 encoded strings for serialization. + + This function uses numpy.save() instead of pickling to ensure + compatibility. + + Args: + array: numpy ndarray. + + Returns: + b64 escaped string. 
+ """ + buf = io.BytesIO() + np.save(buf, array) + return base64.b64encode(zlib.compress(buf.getvalue())).decode("ascii") + + +def _deserialize_ndarray(b64_string: str) -> np.ndarray: + """Unpack b64 escaped string into numpy ndarray. + + This function assumes the unescaped bytes are of npy format. + + Args: + b64_string: Base64 escaped string. + + Returns: + numpy ndarray. + """ + return np.load( + io.BytesIO(zlib.decompress(base64.b64decode(b64_string))), allow_pickle=True + ) + + +@DeveloperAPI +def gym_space_to_dict(space: gym.spaces.Space) -> Dict: + """Serialize a gym Space into a JSON-serializable dict. + + Args: + space: gym.spaces.Space + + Returns: + Serialized JSON string. + """ + if space is None: + return None + + def _box(sp: gym.spaces.Box) -> Dict: + return { + "space": "box", + "low": _serialize_ndarray(sp.low), + "high": _serialize_ndarray(sp.high), + "shape": sp._shape, # shape is a tuple. + "dtype": sp.dtype.str, + } + + def _discrete(sp: gym.spaces.Discrete) -> Dict: + d = { + "space": "discrete", + "n": int(sp.n), + } + # Offset is a relatively new Discrete space feature. + if hasattr(sp, "start"): + d["start"] = int(sp.start) + return d + + def _multi_binary(sp: gym.spaces.MultiBinary) -> Dict: + return { + "space": "multi-binary", + "n": sp.n, + } + + def _multi_discrete(sp: gym.spaces.MultiDiscrete) -> Dict: + return { + "space": "multi-discrete", + "nvec": _serialize_ndarray(sp.nvec), + "dtype": sp.dtype.str, + } + + def _tuple(sp: gym.spaces.Tuple) -> Dict: + return { + "space": "tuple", + "spaces": [gym_space_to_dict(sp) for sp in sp.spaces], + } + + def _dict(sp: gym.spaces.Dict) -> Dict: + return { + "space": "dict", + "spaces": {k: gym_space_to_dict(sp) for k, sp in sp.spaces.items()}, + } + + def _simplex(sp: Simplex) -> Dict: + return { + "space": "simplex", + "shape": sp._shape, # shape is a tuple. 
+ "concentration": sp.concentration, + "dtype": sp.dtype.str, + } + + def _repeated(sp: Repeated) -> Dict: + return { + "space": "repeated", + "child_space": gym_space_to_dict(sp.child_space), + "max_len": sp.max_len, + } + + def _flex_dict(sp: FlexDict) -> Dict: + d = { + "space": "flex_dict", + } + for k, s in sp.spaces: + d[k] = gym_space_to_dict(s) + return d + + def _text(sp: "gym.spaces.Text") -> Dict: + # Note (Kourosh): This only works in gym >= 0.25.0 + charset = getattr(sp, "character_set", None) + if charset is None: + charset = getattr(sp, "charset", None) + if charset is None: + raise ValueError( + "Text space must have a character_set or charset attribute" + ) + return { + "space": "text", + "min_length": sp.min_length, + "max_length": sp.max_length, + "charset": charset, + } + + if isinstance(space, gym.spaces.Box): + return _box(space) + elif isinstance(space, gym.spaces.Discrete): + return _discrete(space) + elif isinstance(space, gym.spaces.MultiBinary): + return _multi_binary(space) + elif isinstance(space, gym.spaces.MultiDiscrete): + return _multi_discrete(space) + elif isinstance(space, gym.spaces.Tuple): + return _tuple(space) + elif isinstance(space, gym.spaces.Dict): + return _dict(space) + elif isinstance(space, gym.spaces.Text): + return _text(space) + elif isinstance(space, Simplex): + return _simplex(space) + elif isinstance(space, Repeated): + return _repeated(space) + elif isinstance(space, FlexDict): + return _flex_dict(space) + else: + raise ValueError("Unknown space type for serialization, ", type(space)) + + +@DeveloperAPI +def space_to_dict(space: gym.spaces.Space) -> Dict: + d = {"space": gym_space_to_dict(space)} + if "original_space" in space.__dict__: + d["original_space"] = space_to_dict(space.original_space) + return d + + +@DeveloperAPI +def gym_space_from_dict(d: Dict) -> gym.spaces.Space: + """De-serialize a dict into gym Space. + + Args: + str: serialized JSON str. + + Returns: + De-serialized gym space. 
+ """ + if d is None: + return None + + def __common(d: Dict): + """Common updates to the dict before we use it to construct spaces""" + ret = d.copy() + del ret["space"] + if "dtype" in ret: + ret["dtype"] = np.dtype(ret["dtype"]) + return ret + + def _box(d: Dict) -> gym.spaces.Box: + ret = d.copy() + ret.update( + { + "low": _deserialize_ndarray(d["low"]), + "high": _deserialize_ndarray(d["high"]), + } + ) + return gym.spaces.Box(**__common(ret)) + + def _discrete(d: Dict) -> gym.spaces.Discrete: + return gym.spaces.Discrete(**__common(d)) + + def _multi_binary(d: Dict) -> gym.spaces.MultiBinary: + return gym.spaces.MultiBinary(**__common(d)) + + def _multi_discrete(d: Dict) -> gym.spaces.MultiDiscrete: + ret = d.copy() + ret.update( + { + "nvec": _deserialize_ndarray(ret["nvec"]), + } + ) + return gym.spaces.MultiDiscrete(**__common(ret)) + + def _tuple(d: Dict) -> gym.spaces.Discrete: + spaces = [gym_space_from_dict(sp) for sp in d["spaces"]] + return gym.spaces.Tuple(spaces=spaces) + + def _dict(d: Dict) -> gym.spaces.Discrete: + # We need to always use an OrderedDict here to cover the following two ways, by + # which a user might construct a Dict space originally. We need to restore this + # original Dict space with the exact order of keys the user intended to. + # - User provides an OrderedDict inside the gym.spaces.Dict constructor -> + # gymnasium should NOT further sort the keys. The same (user-provided) order + # must be restored. + # - User provides a simple dict inside the gym.spaces.Dict constructor -> + # By its API definition, gymnasium automatically sorts all keys alphabetically. + # The same (alphabetical) order must thus be restored. 
+ spaces = OrderedDict( + {k: gym_space_from_dict(sp) for k, sp in d["spaces"].items()} + ) + return gym.spaces.Dict(spaces=spaces) + + def _simplex(d: Dict) -> Simplex: + return Simplex(**__common(d)) + + def _repeated(d: Dict) -> Repeated: + child_space = gym_space_from_dict(d["child_space"]) + return Repeated(child_space=child_space, max_len=d["max_len"]) + + def _flex_dict(d: Dict) -> FlexDict: + spaces = {k: gym_space_from_dict(s) for k, s in d.items() if k != "space"} + return FlexDict(spaces=spaces) + + def _text(d: Dict) -> "gym.spaces.Text": + return gym.spaces.Text(**__common(d)) + + space_map = { + "box": _box, + "discrete": _discrete, + "multi-binary": _multi_binary, + "multi-discrete": _multi_discrete, + "tuple": _tuple, + "dict": _dict, + "simplex": _simplex, + "repeated": _repeated, + "flex_dict": _flex_dict, + "text": _text, + } + + space_type = d["space"] + if space_type not in space_map: + raise ValueError("Unknown space type for de-serialization, ", space_type) + + return space_map[space_type](d) + + +@DeveloperAPI +def space_from_dict(d: Dict) -> gym.spaces.Space: + space = gym_space_from_dict(d["space"]) + if "original_space" in d: + assert "space" in d["original_space"] + if isinstance(d["original_space"]["space"], str): + # For backward compatibility reasons, if d["original_space"]["space"] + # is a string, this original space was serialized by gym_space_to_dict. + space.original_space = gym_space_from_dict(d["original_space"]) + else: + # Otherwise, this original space was serialized by space_to_dict. + space.original_space = space_from_dict(d["original_space"]) + return space + + +@DeveloperAPI +def check_if_args_kwargs_serializable(args: Sequence[Any], kwargs: Dict[str, Any]): + """Check if parameters to a function are serializable by ray. + + Args: + args: arguments to be checked. + kwargs: keyword arguments to be checked. + + Raises: + NoteSerializable if either args are kwargs are not serializable + by ray. 
+ """ + for arg in args: + try: + # if the object is truly serializable we should be able to + # ray.put and ray.get it. + ray.get(ray.put(arg)) + except TypeError as e: + raise NotSerializable( + "RLModule constructor arguments must be serializable. " + f"Found non-serializable argument: {arg}.\n" + f"Original serialization error: {e}" + ) + for k, v in kwargs.items(): + try: + # if the object is truly serializable we should be able to + # ray.put and ray.get it. + ray.get(ray.put(v)) + except TypeError as e: + raise NotSerializable( + "RLModule constructor arguments must be serializable. " + f"Found non-serializable keyword argument: {k} = {v}.\n" + f"Original serialization error: {e}" + ) + + +@DeveloperAPI +def serialize_type(type_: Union[Type, str]) -> str: + """Converts a type into its full classpath ([module file] + "." + [class name]). + + Args: + type_: The type to convert. + + Returns: + The full classpath of the given type, e.g. "ray.rllib.algorithms.ppo.PPOConfig". + """ + # TODO (avnishn): find a way to incorporate the tune registry here. + # Already serialized. + if isinstance(type_, str): + return type_ + + return type_.__module__ + "." + type_.__qualname__ + + +@DeveloperAPI +def deserialize_type( + module: Union[str, Type], error: bool = False +) -> Optional[Union[str, Type]]: + """Resolves a class path to a class. + If the given module is already a class, it is returned as is. + If the given module is a string, it is imported and the class is returned. + + Args: + module: The classpath (str) or type to resolve. + error: Whether to throw a ValueError if `module` could not be resolved into + a class. If False and `module` is not resolvable, returns None. + + Returns: + The resolved class or `module` (if `error` is False and no resolution possible). + + Raises: + ValueError: If `error` is True and `module` cannot be resolved. + """ + # Already a class, return as-is. + if isinstance(module, type): + return module + # A string. 
+ elif isinstance(module, str): + # Try interpreting (as classpath) and importing the given module. + try: + module_path, class_name = module.rsplit(".", 1) + module = importlib.import_module(module_path) + return getattr(module, class_name) + # Module not found OR not a module (but a registered string?). + except (ModuleNotFoundError, ImportError, AttributeError, ValueError) as e: + # Ignore if error=False. + if error: + raise ValueError( + f"Could not deserialize the given classpath `module={module}` into " + "a valid python class! Make sure you have all necessary pip " + "packages installed and all custom modules are in your " + "`PYTHONPATH` env variable." + ) from e + else: + raise ValueError(f"`module` ({module} must be type or string (classpath)!") + + return module diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/utils/tensor_dtype.py b/.venv/lib/python3.11/site-packages/ray/rllib/utils/tensor_dtype.py new file mode 100644 index 0000000000000000000000000000000000000000..83677d80a46a8048011863f65cf8b851d12dbd9e --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/utils/tensor_dtype.py @@ -0,0 +1,65 @@ +import numpy as np + +from ray.rllib.utils.typing import TensorType +from ray.rllib.utils.framework import try_import_torch, try_import_tf +from ray.util.annotations import DeveloperAPI + +torch, _ = try_import_torch() +_, tf, _ = try_import_tf() + + +# Dict of NumPy dtype -> torch dtype +if torch: + numpy_to_torch_dtype_dict = { + np.bool_: torch.bool, + np.uint8: torch.uint8, + np.int8: torch.int8, + np.int16: torch.int16, + np.int32: torch.int32, + np.int64: torch.int64, + np.float16: torch.float16, + np.float32: torch.float32, + np.float64: torch.float64, + np.complex64: torch.complex64, + np.complex128: torch.complex128, + } +else: + numpy_to_torch_dtype_dict = {} + +# Dict of NumPy dtype -> tf dtype +if tf: + numpy_to_tf_dtype_dict = { + np.bool_: tf.bool, + np.uint8: tf.uint8, + np.int8: tf.int8, + np.int16: tf.int16, + np.int32: 
tf.int32, + np.int64: tf.int64, + np.float16: tf.float16, + np.float32: tf.float32, + np.float64: tf.float64, + np.complex64: tf.complex64, + np.complex128: tf.complex128, + } +else: + numpy_to_tf_dtype_dict = {} + +# Dict of torch dtype -> NumPy dtype +torch_to_numpy_dtype_dict = { + value: key for (key, value) in numpy_to_torch_dtype_dict.items() +} +# Dict of tf dtype -> NumPy dtype +tf_to_numpy_dtype_dict = {value: key for (key, value) in numpy_to_tf_dtype_dict.items()} + + +@DeveloperAPI +def get_np_dtype(x: TensorType) -> np.dtype: + """Returns the NumPy dtype of the given tensor or array.""" + if torch and isinstance(x, torch.Tensor): + return torch_to_numpy_dtype_dict[x.dtype] + if tf and isinstance(x, tf.Tensor): + return tf_to_numpy_dtype_dict[x.dtype] + elif isinstance(x, np.ndarray): + return x.dtype + else: + raise TypeError("Unsupported type: {}".format(type(x))) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/utils/test_utils.py b/.venv/lib/python3.11/site-packages/ray/rllib/utils/test_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..9f739ee9aa1c86b9d138582a19e025297fef3efa --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/utils/test_utils.py @@ -0,0 +1,1847 @@ +import argparse +import json +import logging +import os +import pprint +import random +import re +import time +from typing import ( + TYPE_CHECKING, + Any, + Dict, + List, + Optional, + Tuple, + Type, + Union, +) + +import gymnasium as gym +from gymnasium.spaces import Box, Discrete, MultiDiscrete, MultiBinary +from gymnasium.spaces import Dict as GymDict +from gymnasium.spaces import Tuple as GymTuple +import numpy as np +import tree # pip install dm_tree + +import ray +from ray import train, tune +from ray.air.constants import TRAINING_ITERATION +from ray.air.integrations.wandb import WandbLoggerCallback, WANDB_ENV_VAR +from ray.rllib.core import DEFAULT_MODULE_ID, Columns +from ray.rllib.env.wrappers.atari_wrappers import is_atari, 
wrap_deepmind +from ray.rllib.utils.annotations import OldAPIStack +from ray.rllib.utils.framework import try_import_jax, try_import_tf, try_import_torch +from ray.rllib.utils.metrics import ( + DIFF_NUM_GRAD_UPDATES_VS_SAMPLER_POLICY, + ENV_RUNNER_RESULTS, + EPISODE_RETURN_MEAN, + EVALUATION_RESULTS, + NUM_ENV_STEPS_TRAINED, + NUM_ENV_STEPS_SAMPLED_LIFETIME, +) +from ray.rllib.utils.typing import ResultDict +from ray.rllib.utils.error import UnsupportedSpaceException + + +from ray.tune import CLIReporter + + +if TYPE_CHECKING: + from ray.rllib.algorithms import Algorithm, AlgorithmConfig + from ray.rllib.offline.dataset_reader import DatasetReader + +jax, _ = try_import_jax() +tf1, tf, tfv = try_import_tf() +torch, _ = try_import_torch() + +logger = logging.getLogger(__name__) + + +def add_rllib_example_script_args( + parser: Optional[argparse.ArgumentParser] = None, + default_reward: float = 100.0, + default_iters: int = 200, + default_timesteps: int = 100000, +) -> argparse.ArgumentParser: + """Adds RLlib-typical (and common) examples scripts command line args to a parser. + + TODO (sven): This function should be used by most of our examples scripts, which + already mostly have this logic in them (but written out). + + Args: + parser: The parser to add the arguments to. If None, create a new one. + default_reward: The default value for the --stop-reward option. + default_iters: The default value for the --stop-iters option. + default_timesteps: The default value for the --stop-timesteps option. + + Returns: + The altered (or newly created) parser object. + """ + if parser is None: + parser = argparse.ArgumentParser() + + # Algo and Algo config options. + parser.add_argument( + "--algo", type=str, default="PPO", help="The RLlib-registered algorithm to use." 
+ ) + parser.add_argument( + "--enable-new-api-stack", + action="store_true", + help="Whether to use the `enable_rl_module_and_learner` config setting.", + ) + parser.add_argument( + "--framework", + choices=["tf", "tf2", "torch"], + default="torch", + help="The DL framework specifier.", + ) + parser.add_argument( + "--env", + type=str, + default=None, + help="The gym.Env identifier to run the experiment with.", + ) + parser.add_argument( + "--num-env-runners", + type=int, + default=None, + help="The number of (remote) EnvRunners to use for the experiment.", + ) + parser.add_argument( + "--num-envs-per-env-runner", + type=int, + default=None, + help="The number of (vectorized) environments per EnvRunner. Note that " + "this is identical to the batch size for (inference) action computations.", + ) + parser.add_argument( + "--num-agents", + type=int, + default=0, + help="If 0 (default), will run as single-agent. If > 0, will run as " + "multi-agent with the environment simply cloned n times and each agent acting " + "independently at every single timestep. The overall reward for this " + "experiment is then the sum over all individual agents' rewards.", + ) + + # Evaluation options. + parser.add_argument( + "--evaluation-num-env-runners", + type=int, + default=0, + help="The number of evaluation (remote) EnvRunners to use for the experiment.", + ) + parser.add_argument( + "--evaluation-interval", + type=int, + default=0, + help="Every how many iterations to run one round of evaluation. " + "Use 0 (default) to disable evaluation.", + ) + parser.add_argument( + "--evaluation-duration", + type=lambda v: v if v == "auto" else int(v), + default=10, + help="The number of evaluation units to run each evaluation round. " + "Use `--evaluation-duration-unit` to count either in 'episodes' " + "or 'timesteps'. 
If 'auto', will run as many as possible during train pass (" + "`--evaluation-parallel-to-training` must be set then).", + ) + parser.add_argument( + "--evaluation-duration-unit", + type=str, + default="episodes", + choices=["episodes", "timesteps"], + help="The evaluation duration unit to count by. One of 'episodes' or " + "'timesteps'. This unit will be run `--evaluation-duration` times in each " + "evaluation round. If `--evaluation-duration=auto`, this setting does not " + "matter.", + ) + parser.add_argument( + "--evaluation-parallel-to-training", + action="store_true", + help="Whether to run evaluation parallel to training. This might help speed up " + "your overall iteration time. Be aware that when using this option, your " + "reported evaluation results are referring to one iteration before the current " + "one.", + ) + + # RLlib logging options. + parser.add_argument( + "--output", + type=str, + default=None, + help="The output directory to write trajectories to, which are collected by " + "the algo's EnvRunners.", + ) + parser.add_argument( + "--log-level", + type=str, + default=None, # None -> use default + choices=["INFO", "DEBUG", "WARN", "ERROR"], + help="The log-level to be used by the RLlib logger.", + ) + + # tune.Tuner options. 
+ parser.add_argument( + "--no-tune", + action="store_true", + help="Whether to NOT use tune.Tuner(), but rather a simple for-loop calling " + "`algo.train()` repeatedly until one of the stop criteria is met.", + ) + parser.add_argument( + "--num-samples", + type=int, + default=1, + help="How many (tune.Tuner.fit()) experiments to execute - if possible in " + "parallel.", + ) + parser.add_argument( + "--max-concurrent-trials", + type=int, + default=None, + help="How many (tune.Tuner) trials to run concurrently.", + ) + parser.add_argument( + "--verbose", + type=int, + default=2, + help="The verbosity level for the `tune.Tuner()` running the experiment.", + ) + parser.add_argument( + "--checkpoint-freq", + type=int, + default=0, + help=( + "The frequency (in training iterations) with which to create checkpoints. " + "Note that if --wandb-key is provided, all checkpoints will " + "automatically be uploaded to WandB." + ), + ) + parser.add_argument( + "--checkpoint-at-end", + action="store_true", + help=( + "Whether to create a checkpoint at the very end of the experiment. " + "Note that if --wandb-key is provided, all checkpoints will " + "automatically be uploaded to WandB." + ), + ) + + # WandB logging options. + parser.add_argument( + "--wandb-key", + type=str, + default=None, + help="The WandB API key to use for uploading results.", + ) + parser.add_argument( + "--wandb-project", + type=str, + default=None, + help="The WandB project name to use.", + ) + parser.add_argument( + "--wandb-run-name", + type=str, + default=None, + help="The WandB run name to use.", + ) + + # Experiment stopping and testing criteria. 
+ parser.add_argument( + "--stop-reward", + type=float, + default=default_reward, + help="Reward at which the script should stop training.", + ) + parser.add_argument( + "--stop-iters", + type=int, + default=default_iters, + help="The number of iterations to train.", + ) + parser.add_argument( + "--stop-timesteps", + type=int, + default=default_timesteps, + help="The number of (environment sampling) timesteps to train.", + ) + parser.add_argument( + "--as-test", + action="store_true", + help="Whether this script should be run as a test. If set, --stop-reward must " + "be achieved within --stop-timesteps AND --stop-iters, otherwise this " + "script will throw an exception at the end.", + ) + parser.add_argument( + "--as-release-test", + action="store_true", + help="Whether this script should be run as a release test. If set, " + "all that applies to the --as-test option is true, plus, a short JSON summary " + "will be written into a results file whose location is given by the ENV " + "variable `TEST_OUTPUT_JSON`.", + ) + + # Learner scaling options. + parser.add_argument( + "--num-learners", + type=int, + default=None, + help="The number of Learners to use. If `None`, use the algorithm's default " + "value.", + ) + parser.add_argument( + "--num-gpus-per-learner", + type=float, + default=None, + help="The number of GPUs per Learner to use. If `None` and there are enough " + "GPUs for all required Learners (--num-learners), use a value of 1, " + "otherwise 0.", + ) + parser.add_argument( + "--num-aggregator-actors-per-learner", + type=int, + default=None, + help="The number of Aggregator actors to use per Learner. If `None`, use the " + "algorithm's default value.", + ) + + # Ray init options. + parser.add_argument("--num-cpus", type=int, default=0) + parser.add_argument( + "--local-mode", + action="store_true", + help="Init Ray in local mode for easier debugging.", + ) + + # Old API stack: config.num_gpus. 
+ parser.add_argument( + "--num-gpus", + type=int, + default=None, + help="The number of GPUs to use (only on the old API stack).", + ) + + return parser + + +def check(x, y, decimals=5, atol=None, rtol=None, false=False): + """ + Checks two structures (dict, tuple, list, + np.array, float, int, etc..) for (almost) numeric identity. + All numbers in the two structures have to match up to `decimal` digits + after the floating point. Uses assertions. + + Args: + x: The value to be compared (to the expectation: `y`). This + may be a Tensor. + y: The expected value to be compared to `x`. This must not + be a tf-Tensor, but may be a tf/torch-Tensor. + decimals: The number of digits after the floating point up to + which all numeric values have to match. + atol: Absolute tolerance of the difference between x and y + (overrides `decimals` if given). + rtol: Relative tolerance of the difference between x and y + (overrides `decimals` if given). + false: Whether to check that x and y are NOT the same. + """ + # A dict type. + if isinstance(x, dict): + assert isinstance(y, dict), "ERROR: If x is dict, y needs to be a dict as well!" + y_keys = set(x.keys()) + for key, value in x.items(): + assert key in y, f"ERROR: y does not have x's key='{key}'! y={y}" + check(value, y[key], decimals=decimals, atol=atol, rtol=rtol, false=false) + y_keys.remove(key) + assert not y_keys, "ERROR: y contains keys ({}) that are not in x! y={}".format( + list(y_keys), y + ) + # A tuple type. + elif isinstance(x, (tuple, list)): + assert isinstance( + y, (tuple, list) + ), "ERROR: If x is tuple/list, y needs to be a tuple/list as well!" + assert len(y) == len( + x + ), "ERROR: y does not have the same length as x ({} vs {})!".format( + len(y), len(x) + ) + for i, value in enumerate(x): + check(value, y[i], decimals=decimals, atol=atol, rtol=rtol, false=false) + # Boolean comparison. 
+ elif isinstance(x, (np.bool_, bool)): + if false is True: + assert bool(x) is not bool(y), f"ERROR: x ({x}) is y ({y})!" + else: + assert bool(x) is bool(y), f"ERROR: x ({x}) is not y ({y})!" + # Nones or primitives (excluding int vs float, which should be compared with + # tolerance/decimals as well). + elif ( + x is None + or y is None + or isinstance(x, str) + or (isinstance(x, int) and isinstance(y, int)) + ): + if false is True: + assert x != y, f"ERROR: x ({x}) is the same as y ({y})!" + else: + assert x == y, f"ERROR: x ({x}) is not the same as y ({y})!" + # String/byte comparisons. + elif ( + hasattr(x, "dtype") and (x.dtype == object or str(x.dtype).startswith(" raise error (not expected to be equal). + if false is True: + assert False, f"ERROR: x ({x}) is the same as y ({y})!" + + # Using atol/rtol. + else: + # Provide defaults for either one of atol/rtol. + if atol is None: + atol = 0 + if rtol is None: + rtol = 1e-7 + try: + np.testing.assert_allclose(x, y, atol=atol, rtol=rtol) + except AssertionError as e: + if false is False: + raise e + else: + if false is True: + assert False, f"ERROR: x ({x}) is the same as y ({y})!" + + +def check_compute_single_action( + algorithm, include_state=False, include_prev_action_reward=False +): + """Tests different combinations of args for algorithm.compute_single_action. + + Args: + algorithm: The Algorithm object to test. + include_state: Whether to include the initial state of the Policy's + Model in the `compute_single_action` call. + include_prev_action_reward: Whether to include the prev-action and + -reward in the `compute_single_action` call. + + Raises: + ValueError: If anything unexpected happens. + """ + # Have to import this here to avoid circular dependency. + from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID, SampleBatch + + # Some Algorithms may not abide to the standard API. 
+ pid = DEFAULT_POLICY_ID + try: + # Multi-agent: Pick any learnable policy (or DEFAULT_POLICY if it's the only + # one). + pid = next(iter(algorithm.env_runner.get_policies_to_train())) + pol = algorithm.get_policy(pid) + except AttributeError: + pol = algorithm.policy + # Get the policy's model. + model = pol.model + + action_space = pol.action_space + + def _test( + what, method_to_test, obs_space, full_fetch, explore, timestep, unsquash, clip + ): + call_kwargs = {} + if what is algorithm: + call_kwargs["full_fetch"] = full_fetch + call_kwargs["policy_id"] = pid + + obs = obs_space.sample() + if isinstance(obs_space, Box): + obs = np.clip(obs, -1.0, 1.0) + state_in = None + if include_state: + state_in = model.get_initial_state() + if not state_in: + state_in = [] + i = 0 + while f"state_in_{i}" in model.view_requirements: + state_in.append( + model.view_requirements[f"state_in_{i}"].space.sample() + ) + i += 1 + action_in = action_space.sample() if include_prev_action_reward else None + reward_in = 1.0 if include_prev_action_reward else None + + if method_to_test == "input_dict": + assert what is pol + + input_dict = {SampleBatch.OBS: obs} + if include_prev_action_reward: + input_dict[SampleBatch.PREV_ACTIONS] = action_in + input_dict[SampleBatch.PREV_REWARDS] = reward_in + if state_in: + if what.config.get("enable_rl_module_and_learner", False): + input_dict["state_in"] = state_in + else: + for i, s in enumerate(state_in): + input_dict[f"state_in_{i}"] = s + input_dict_batched = SampleBatch( + tree.map_structure(lambda s: np.expand_dims(s, 0), input_dict) + ) + action = pol.compute_actions_from_input_dict( + input_dict=input_dict_batched, + explore=explore, + timestep=timestep, + **call_kwargs, + ) + # Unbatch everything to be able to compare against single + # action below. + # ARS and ES return action batches as lists. 
+ if isinstance(action[0], list): + action = (np.array(action[0]), action[1], action[2]) + action = tree.map_structure(lambda s: s[0], action) + + try: + action2 = pol.compute_single_action( + input_dict=input_dict, + explore=explore, + timestep=timestep, + **call_kwargs, + ) + # Make sure these are the same, unless we have exploration + # switched on (or noisy layers). + if not explore and not pol.config.get("noisy"): + check(action, action2) + except TypeError: + pass + else: + action = what.compute_single_action( + obs, + state_in, + prev_action=action_in, + prev_reward=reward_in, + explore=explore, + timestep=timestep, + unsquash_action=unsquash, + clip_action=clip, + **call_kwargs, + ) + + state_out = None + if state_in or full_fetch or what is pol: + action, state_out, _ = action + if state_out: + for si, so in zip(tree.flatten(state_in), tree.flatten(state_out)): + if tf.is_tensor(si): + # If si is a tensor of Dimensions, we need to convert it + # We expect this to be the case for TF RLModules who's initial + # states are Tf Tensors. + si_shape = si.shape.as_list() + else: + si_shape = list(si.shape) + check(si_shape, so.shape) + + if unsquash is None: + unsquash = what.config["normalize_actions"] + if clip is None: + clip = what.config["clip_actions"] + + # Test whether unsquash/clipping works on the Algorithm's + # compute_single_action method: Both flags should force the action + # to be within the space's bounds. + if method_to_test == "single" and what == algorithm: + if not action_space.contains(action) and ( + clip or unsquash or not isinstance(action_space, Box) + ): + raise ValueError( + f"Returned action ({action}) of algorithm/policy {what} " + f"not in Env's action_space {action_space}" + ) + # We are operating in normalized space: Expect only smaller action + # values. 
+ if ( + isinstance(action_space, Box) + and not unsquash + and what.config.get("normalize_actions") + and np.any(np.abs(action) > 15.0) + ): + raise ValueError( + f"Returned action ({action}) of algorithm/policy {what} " + "should be in normalized space, but seems too large/small " + "for that!" + ) + + # Loop through: Policy vs Algorithm; Different API methods to calculate + # actions; unsquash option; clip option; full fetch or not. + for what in [pol, algorithm]: + if what is algorithm: + # Get the obs-space from Workers.env (not Policy) due to possible + # pre-processor up front. + worker_set = getattr(algorithm, "env_runner_group", None) + assert worker_set + if not worker_set.local_env_runner: + obs_space = algorithm.get_policy(pid).observation_space + else: + obs_space = worker_set.local_env_runner.for_policy( + lambda p: p.observation_space, policy_id=pid + ) + obs_space = getattr(obs_space, "original_space", obs_space) + else: + obs_space = pol.observation_space + + for method_to_test in ["single"] + (["input_dict"] if what is pol else []): + for explore in [True, False]: + for full_fetch in [False, True] if what is algorithm else [False]: + timestep = random.randint(0, 100000) + for unsquash in [True, False, None]: + for clip in [False] if unsquash else [True, False, None]: + print("-" * 80) + print(f"what={what}") + print(f"method_to_test={method_to_test}") + print(f"explore={explore}") + print(f"full_fetch={full_fetch}") + print(f"unsquash={unsquash}") + print(f"clip={clip}") + _test( + what, + method_to_test, + obs_space, + full_fetch, + explore, + timestep, + unsquash, + clip, + ) + + +def check_inference_w_connectors(policy, env_name, max_steps: int = 100): + """Checks whether the given policy can infer actions from an env with connectors. + + Args: + policy: The policy to check. + env_name: Name of the environment to check + max_steps: The maximum number of steps to run the environment for. 
+ + Raises: + ValueError: If the policy cannot infer actions from the environment. + """ + # Avoids circular import + from ray.rllib.utils.policy import local_policy_inference + + env = gym.make(env_name) + + # Potentially wrap the env like we do in RolloutWorker + if is_atari(env): + env = wrap_deepmind( + env, + dim=policy.config["model"]["dim"], + framestack=policy.config["model"].get("framestack"), + ) + + obs, info = env.reset() + reward, terminated, truncated = 0.0, False, False + ts = 0 + while not terminated and not truncated and ts < max_steps: + action_out = local_policy_inference( + policy, + env_id=0, + agent_id=0, + obs=obs, + reward=reward, + terminated=terminated, + truncated=truncated, + info=info, + ) + obs, reward, terminated, truncated, info = env.step(action_out[0][0]) + + ts += 1 + + +def check_learning_achieved( + tune_results: "tune.ResultGrid", + min_value: float, + evaluation: Optional[bool] = None, + metric: str = f"{ENV_RUNNER_RESULTS}/episode_return_mean", +): + """Throws an error if `min_reward` is not reached within tune_results. + + Checks the last iteration found in tune_results for its + "episode_return_mean" value and compares it to `min_reward`. + + Args: + tune_results: The tune.Tuner().fit() returned results object. + min_reward: The min reward that must be reached. + evaluation: If True, use `evaluation/env_runners/[metric]`, if False, use + `env_runners/[metric]`, if None, use evaluation sampler results if + available otherwise, use train sampler results. + + Raises: + ValueError: If `min_reward` not reached. + """ + # Get maximum value of `metrics` over all trials + # (check if at least one trial achieved some learning, not just the final one). 
+ recorded_values = [] + for _, row in tune_results.get_dataframe().iterrows(): + if evaluation or ( + evaluation is None and f"{EVALUATION_RESULTS}/{metric}" in row + ): + recorded_values.append(row[f"{EVALUATION_RESULTS}/{metric}"]) + else: + recorded_values.append(row[metric]) + best_value = max(recorded_values) + if best_value < min_value: + raise ValueError(f"`{metric}` of {min_value} not reached!") + print(f"`{metric}` of {min_value} reached! ok") + + +def check_off_policyness( + results: ResultDict, + upper_limit: float, + lower_limit: float = 0.0, +) -> Optional[float]: + """Verifies that the off-policy'ness of some update is within some range. + + Off-policy'ness is defined as the average (across n workers) diff + between the number of gradient updates performed on the policy used + for sampling vs the number of gradient updates that have been performed + on the trained policy (usually the one on the local worker). + + Uses the published DIFF_NUM_GRAD_UPDATES_VS_SAMPLER_POLICY metric inside + a training results dict and compares to the given bounds. + + Note: Only works with single-agent results thus far. + + Args: + results: The training results dict. + upper_limit: The upper limit to for the off_policy_ness value. + lower_limit: The lower limit to for the off_policy_ness value. + + Returns: + The off-policy'ness value (described above). + + Raises: + AssertionError: If the value is out of bounds. + """ + + # Have to import this here to avoid circular dependency. + from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID + from ray.rllib.utils.metrics.learner_info import LEARNER_INFO + + # Assert that the off-policy'ness is within the given bounds. + learner_info = results["info"][LEARNER_INFO] + if DEFAULT_POLICY_ID not in learner_info: + return None + off_policy_ness = learner_info[DEFAULT_POLICY_ID][ + DIFF_NUM_GRAD_UPDATES_VS_SAMPLER_POLICY + ] + # Roughly: Reaches up to 0.4 for 2 rollout workers and up to 0.2 for + # 1 rollout worker. 
+ if not (lower_limit <= off_policy_ness <= upper_limit): + raise AssertionError( + f"`off_policy_ness` ({off_policy_ness}) is outside the given bounds " + f"({lower_limit} - {upper_limit})!" + ) + + return off_policy_ness + + +def check_train_results_new_api_stack(train_results: ResultDict) -> None: + """Checks proper structure of a Algorithm.train() returned dict. + + Args: + train_results: The train results dict to check. + + Raises: + AssertionError: If `train_results` doesn't have the proper structure or + data in it. + """ + # Import these here to avoid circular dependencies. + from ray.rllib.utils.metrics import ( + ENV_RUNNER_RESULTS, + FAULT_TOLERANCE_STATS, + LEARNER_RESULTS, + TIMERS, + ) + + # Assert that some keys are where we would expect them. + for key in [ + ENV_RUNNER_RESULTS, + FAULT_TOLERANCE_STATS, + LEARNER_RESULTS, + TIMERS, + TRAINING_ITERATION, + "config", + ]: + assert ( + key in train_results + ), f"'{key}' not found in `train_results` ({train_results})!" + + # Make sure, `config` is an actual dict, not an AlgorithmConfig object. + assert isinstance( + train_results["config"], dict + ), "`config` in results not a python dict!" + + from ray.rllib.algorithms.algorithm_config import AlgorithmConfig + + is_multi_agent = ( + AlgorithmConfig() + .update_from_dict({"policies": train_results["config"]["policies"]}) + .is_multi_agent + ) + + # Check in particular the "info" dict. + learner_results = train_results[LEARNER_RESULTS] + + # Make sure we have a `DEFAULT_MODULE_ID key if we are not in a + # multi-agent setup. + if not is_multi_agent: + assert len(learner_results) == 0 or DEFAULT_MODULE_ID in learner_results, ( + f"'{DEFAULT_MODULE_ID}' not found in " + f"train_results['{LEARNER_RESULTS}']!" + ) + + for module_id, module_metrics in learner_results.items(): + # The ModuleID can be __all_modules__ in multi-agent case when the new learner + # stack is enabled. 
+ if module_id == "__all_modules__": + continue + + # On the new API stack, policy has no LEARNER_STATS_KEY under it anymore. + for key, value in module_metrics.items(): + # Min- and max-stats should be single values. + if key.endswith("_min") or key.endswith("_max"): + assert np.isscalar(value), f"'key' value not a scalar ({value})!" + + return train_results + + +@OldAPIStack +def check_train_results(train_results: ResultDict): + """Checks proper structure of a Algorithm.train() returned dict. + + Args: + train_results: The train results dict to check. + + Raises: + AssertionError: If `train_results` doesn't have the proper structure or + data in it. + """ + # Import these here to avoid circular dependencies. + from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID + from ray.rllib.utils.metrics.learner_info import LEARNER_INFO, LEARNER_STATS_KEY + + # Assert that some keys are where we would expect them. + for key in [ + "config", + "custom_metrics", + ENV_RUNNER_RESULTS, + "info", + "iterations_since_restore", + "num_healthy_workers", + "perf", + "time_since_restore", + "time_this_iter_s", + "timers", + "time_total_s", + TRAINING_ITERATION, + ]: + assert ( + key in train_results + ), f"'{key}' not found in `train_results` ({train_results})!" + + for key in [ + "episode_len_mean", + "episode_reward_max", + "episode_reward_mean", + "episode_reward_min", + "hist_stats", + "policy_reward_max", + "policy_reward_mean", + "policy_reward_min", + "sampler_perf", + ]: + assert key in train_results[ENV_RUNNER_RESULTS], ( + f"'{key}' not found in `train_results[ENV_RUNNER_RESULTS]` " + f"({train_results[ENV_RUNNER_RESULTS]})!" + ) + + # Make sure, `config` is an actual dict, not an AlgorithmConfig object. + assert isinstance( + train_results["config"], dict + ), "`config` in results not a python dict!" 
+ + from ray.rllib.algorithms.algorithm_config import AlgorithmConfig + + is_multi_agent = ( + AlgorithmConfig() + .update_from_dict({"policies": train_results["config"]["policies"]}) + .is_multi_agent + ) + + # Check in particular the "info" dict. + info = train_results["info"] + assert LEARNER_INFO in info, f"'learner' not in train_results['infos'] ({info})!" + assert ( + "num_steps_trained" in info or NUM_ENV_STEPS_TRAINED in info + ), f"'num_(env_)?steps_trained' not in train_results['infos'] ({info})!" + + learner_info = info[LEARNER_INFO] + + # Make sure we have a default_policy key if we are not in a + # multi-agent setup. + if not is_multi_agent: + # APEX algos sometimes have an empty learner info dict (no metrics + # collected yet). + assert len(learner_info) == 0 or DEFAULT_POLICY_ID in learner_info, ( + f"'{DEFAULT_POLICY_ID}' not found in " + f"train_results['infos']['learner'] ({learner_info})!" + ) + + for pid, policy_stats in learner_info.items(): + if pid == "batch_count": + continue + + # the pid can be __all__ in multi-agent case when the new learner stack is + # enabled. + if pid == "__all__": + continue + + # On the new API stack, policy has no LEARNER_STATS_KEY under it anymore. + if LEARNER_STATS_KEY in policy_stats: + learner_stats = policy_stats[LEARNER_STATS_KEY] + else: + learner_stats = policy_stats + for key, value in learner_stats.items(): + # Min- and max-stats should be single values. + if key.startswith("min_") or key.startswith("max_"): + assert np.isscalar(value), f"'key' value not a scalar ({value})!" 
+ + return train_results + + +# TODO (sven): Make this the de-facto, well documented, and unified utility for most of +# our tests: +# - CI (label: "learning_tests") +# - release tests (benchmarks) +# - example scripts +def run_rllib_example_script_experiment( + base_config: "AlgorithmConfig", + args: Optional[argparse.Namespace] = None, + *, + stop: Optional[Dict] = None, + success_metric: Optional[Dict] = None, + trainable: Optional[Type] = None, + tune_callbacks: Optional[List] = None, + keep_config: bool = False, + keep_ray_up: bool = False, + scheduler=None, + progress_reporter=None, +) -> Union[ResultDict, tune.result_grid.ResultGrid]: + """Given an algorithm config and some command line args, runs an experiment. + + There are some constraints on what properties must be defined in `args`. + It should ideally be generated via calling + `args = add_rllib_example_script_args()`, which can be found in this very module + here. + + The function sets up an Algorithm object from the given config (altered by the + contents of `args`), then runs the Algorithm via Tune (or manually, if + `args.no_tune` is set to True) using the stopping criteria in `stop`. + + At the end of the experiment, if `args.as_test` is True, checks, whether the + Algorithm reached the `success_metric` (if None, use `env_runners/ + episode_return_mean` with a minimum value of `args.stop_reward`). + + See https://github.com/ray-project/ray/tree/master/rllib/examples for an overview + of all supported command line options. + + Args: + base_config: The AlgorithmConfig object to use for this experiment. This base + config will be automatically "extended" based on some of the provided + `args`. For example, `args.num_env_runners` is used to set + `config.num_env_runners`, etc.. + args: A argparse.Namespace object, ideally returned by calling + `args = add_rllib_example_script_args()`. 
It must have the following + properties defined: `stop_iters`, `stop_reward`, `stop_timesteps`, + `no_tune`, `verbose`, `checkpoint_freq`, `as_test`. Optionally, for WandB + logging: `wandb_key`, `wandb_project`, `wandb_run_name`. + stop: An optional dict mapping ResultDict key strings (using "/" in case of + nesting, e.g. "env_runners/episode_return_mean" for referring to + `result_dict['env_runners']['episode_return_mean']` to minimum + values, reaching of which will stop the experiment). Default is: + { + "env_runners/episode_return_mean": args.stop_reward, + "training_iteration": args.stop_iters, + "num_env_steps_sampled_lifetime": args.stop_timesteps, + } + success_metric: Only relevant if `args.as_test` is True. + A dict mapping a single(!) ResultDict key string (using "/" in + case of nesting, e.g. "env_runners/episode_return_mean" for referring + to `result_dict['env_runners']['episode_return_mean']` to a single(!) + minimum value to be reached in order for the experiment to count as + successful. If `args.as_test` is True AND this `success_metric` is not + reached with the bounds defined by `stop`, will raise an Exception. + trainable: The Trainable sub-class to run in the tune.Tuner. If None (default), + use the registered RLlib Algorithm class specified by args.algo. + tune_callbacks: A list of Tune callbacks to configure with the tune.Tuner. + In case `args.wandb_key` is provided, appends a WandB logger to this + list. + keep_config: Set this to True, if you don't want this utility to change the + given `base_config` in any way and leave it as-is. This is helpful + for those example scripts which demonstrate how to set config settings + that are otherwise taken care of automatically in this function (e.g. + `num_env_runners`). + + Returns: + The last ResultDict from a --no-tune run OR the tune.Tuner.fit() + results. 
+ """ + if args is None: + parser = add_rllib_example_script_args() + args = parser.parse_args() + + # If run --as-release-test, --as-test must also be set. + if args.as_release_test: + args.as_test = True + + # Initialize Ray. + ray.init( + num_cpus=args.num_cpus or None, + local_mode=args.local_mode, + ignore_reinit_error=True, + ) + + # Define one or more stopping criteria. + if stop is None: + stop = { + f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": args.stop_reward, + f"{ENV_RUNNER_RESULTS}/{NUM_ENV_STEPS_SAMPLED_LIFETIME}": ( + args.stop_timesteps + ), + TRAINING_ITERATION: args.stop_iters, + } + + config = base_config + + # Enhance the `base_config`, based on provided `args`. + if not keep_config: + # Set the framework. + config.framework(args.framework) + + # Add an env specifier (only if not already set in config)? + if args.env is not None and config.env is None: + config.environment(args.env) + + # Disable the new API stack? + if not args.enable_new_api_stack: + config.api_stack( + enable_rl_module_and_learner=False, + enable_env_runner_and_connector_v2=False, + ) + + # Define EnvRunner scaling and behavior. + if args.num_env_runners is not None: + config.env_runners(num_env_runners=args.num_env_runners) + if args.num_envs_per_env_runner is not None: + config.env_runners(num_envs_per_env_runner=args.num_envs_per_env_runner) + + # Define compute resources used automatically (only using the --num-learners + # and --num-gpus-per-learner args). + # New stack. + if config.enable_rl_module_and_learner: + if args.num_gpus is not None and args.num_gpus > 0: + raise ValueError( + "--num-gpus is not supported on the new API stack! To train on " + "GPUs, use the command line options `--num-gpus-per-learner=1` and " + "`--num-learners=[your number of available GPUs]`, instead." + ) + + # Do we have GPUs available in the cluster? 
+ num_gpus_available = ray.cluster_resources().get("GPU", 0) + # Number of actual Learner instances (including the local Learner if + # `num_learners=0`). + num_actual_learners = ( + args.num_learners + if args.num_learners is not None + else config.num_learners + ) or 1 # 1: There is always a local Learner, if num_learners=0. + # How many were hard-requested by the user + # (through explicit `--num-gpus-per-learner >= 1`). + num_gpus_requested = (args.num_gpus_per_learner or 0) * num_actual_learners + # Number of GPUs needed, if `num_gpus_per_learner=None` (auto). + num_gpus_needed_if_available = ( + args.num_gpus_per_learner + if args.num_gpus_per_learner is not None + else 1 + ) * num_actual_learners + # Define compute resources used. + config.resources(num_gpus=0) # old API stack setting + if args.num_learners is not None: + config.learners(num_learners=args.num_learners) + + # User wants to use aggregator actors per Learner. + if args.num_aggregator_actors_per_learner is not None: + config.learners( + num_aggregator_actors_per_learner=( + args.num_aggregator_actors_per_learner + ) + ) + + # User wants to use GPUs if available, but doesn't hard-require them. + if args.num_gpus_per_learner is None: + if num_gpus_available >= num_gpus_needed_if_available: + config.learners(num_gpus_per_learner=1) + else: + config.learners(num_gpus_per_learner=0, num_cpus_per_learner=1) + + # User hard-requires n GPUs, but they are not available -> Error. + elif num_gpus_available < num_gpus_requested: + raise ValueError( + "You are running your script with --num-learners=" + f"{args.num_learners} and --num-gpus-per-learner=" + f"{args.num_gpus_per_learner}, but your cluster only has " + f"{num_gpus_available} GPUs! Will run " + f"with {num_gpus_available} CPU Learners instead." + ) + + # All required GPUs are available -> Use them. + else: + config.learners(num_gpus_per_learner=args.num_gpus_per_learner) + + # Old stack (override only if arg was provided by user). 
+ elif args.num_gpus is not None: + config.resources(num_gpus=args.num_gpus) + + # Evaluation setup. + if args.evaluation_interval > 0: + config.evaluation( + evaluation_num_env_runners=args.evaluation_num_env_runners, + evaluation_interval=args.evaluation_interval, + evaluation_duration=args.evaluation_duration, + evaluation_duration_unit=args.evaluation_duration_unit, + evaluation_parallel_to_training=args.evaluation_parallel_to_training, + ) + + # Set the log-level (if applicable). + if args.log_level is not None: + config.debugging(log_level=args.log_level) + + # Set the output dir (if applicable). + if args.output is not None: + config.offline_data(output=args.output) + + # Run the experiment w/o Tune (directly operate on the RLlib Algorithm object). + if args.no_tune: + assert not args.as_test and not args.as_release_test + algo = config.build() + for i in range(stop.get(TRAINING_ITERATION, args.stop_iters)): + results = algo.train() + if ENV_RUNNER_RESULTS in results: + mean_return = results[ENV_RUNNER_RESULTS].get( + EPISODE_RETURN_MEAN, np.nan + ) + print(f"iter={i} R={mean_return}", end="") + if EVALUATION_RESULTS in results: + Reval = results[EVALUATION_RESULTS][ENV_RUNNER_RESULTS][ + EPISODE_RETURN_MEAN + ] + print(f" R(eval)={Reval}", end="") + print() + for key, threshold in stop.items(): + val = results + for k in key.split("/"): + try: + val = val[k] + except KeyError: + val = None + break + if val is not None and not np.isnan(val) and val >= threshold: + print(f"Stop criterium ({key}={threshold}) fulfilled!") + if not keep_ray_up: + ray.shutdown() + return results + + if not keep_ray_up: + ray.shutdown() + return results + + # Run the experiment using Ray Tune. + + # Log results using WandB. 
+ tune_callbacks = tune_callbacks or [] + if hasattr(args, "wandb_key") and ( + args.wandb_key is not None or WANDB_ENV_VAR in os.environ + ): + wandb_key = args.wandb_key or os.environ[WANDB_ENV_VAR] + project = args.wandb_project or ( + args.algo.lower() + "-" + re.sub("\\W+", "-", str(config.env).lower()) + ) + tune_callbacks.append( + WandbLoggerCallback( + api_key=wandb_key, + project=project, + upload_checkpoints=True, + **({"name": args.wandb_run_name} if args.wandb_run_name else {}), + ) + ) + # Auto-configure a CLIReporter (to log the results to the console). + # Use better ProgressReporter for multi-agent cases: List individual policy rewards. + if progress_reporter is None and args.num_agents > 0: + progress_reporter = CLIReporter( + metric_columns={ + **{ + TRAINING_ITERATION: "iter", + "time_total_s": "total time (s)", + NUM_ENV_STEPS_SAMPLED_LIFETIME: "ts", + f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": "combined return", + }, + **{ + ( + f"{ENV_RUNNER_RESULTS}/module_episode_returns_mean/" f"{pid}" + ): f"return {pid}" + for pid in config.policies + }, + }, + ) + + # Force Tuner to use old progress output as the new one silently ignores our custom + # `CLIReporter`. + os.environ["RAY_AIR_NEW_OUTPUT"] = "0" + + # Run the actual experiment (using Tune). + start_time = time.time() + results = tune.Tuner( + trainable or config.algo_class, + param_space=config, + run_config=train.RunConfig( + stop=stop, + verbose=args.verbose, + callbacks=tune_callbacks, + checkpoint_config=train.CheckpointConfig( + checkpoint_frequency=args.checkpoint_freq, + checkpoint_at_end=args.checkpoint_at_end, + ), + progress_reporter=progress_reporter, + ), + tune_config=tune.TuneConfig( + num_samples=args.num_samples, + max_concurrent_trials=args.max_concurrent_trials, + scheduler=scheduler, + ), + ).fit() + time_taken = time.time() - start_time + + if not keep_ray_up: + ray.shutdown() + + # Error out, if Tuner.fit() failed to run. 
Otherwise, erroneous examples might pass + # the CI tests w/o us knowing that they are broken (b/c some examples do not have + # a --as-test flag and/or any passing criteris). + if results.errors: + raise RuntimeError( + "Running the example script resulted in one or more errors! " + f"{[e.args[0].args[2] for e in results.errors]}" + ) + + # If run as a test, check whether we reached the specified success criteria. + test_passed = False + if args.as_test: + # Success metric not provided, try extracting it from `stop`. + if success_metric is None: + for try_it in [ + f"{EVALUATION_RESULTS}/{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}", + f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}", + ]: + if try_it in stop: + success_metric = {try_it: stop[try_it]} + break + if success_metric is None: + success_metric = { + f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": args.stop_reward, + } + # TODO (sven): Make this work for more than one metric (AND-logic?). + # Get maximum value of `metric` over all trials + # (check if at least one trial achieved some learning, not just the final one). + success_metric_key, success_metric_value = next(iter(success_metric.items())) + best_value = max( + row[success_metric_key] for _, row in results.get_dataframe().iterrows() + ) + if best_value >= success_metric_value: + test_passed = True + print(f"`{success_metric_key}` of {success_metric_value} reached! ok") + + if args.as_release_test: + trial = results._experiment_analysis.trials[0] + stats = trial.last_result + stats.pop("config", None) + json_summary = { + "time_taken": float(time_taken), + "trial_states": [trial.status], + "last_update": float(time.time()), + "stats": stats, + "passed": [test_passed], + "not_passed": [not test_passed], + "failures": {str(trial): 1} if not test_passed else {}, + } + with open( + os.environ.get("TEST_OUTPUT_JSON", "/tmp/learning_test.json"), + "wt", + ) as f: + try: + json.dump(json_summary, f) + # Something went wrong writing json. 
Try again w/ simplified stats. + except Exception: + from ray.rllib.algorithms.algorithm import Algorithm + + simplified_stats = { + k: stats[k] for k in Algorithm._progress_metrics if k in stats + } + json_summary["stats"] = simplified_stats + json.dump(json_summary, f) + + if not test_passed: + raise ValueError( + f"`{success_metric_key}` of {success_metric_value} not reached!" + ) + + return results + + +def check_same_batch(batch1, batch2) -> None: + """Check if both batches are (almost) identical. + + For MultiAgentBatches, the step count and individual policy's + SampleBatches are checked for identity. For SampleBatches, identity is + checked as the almost numerical key-value-pair identity between batches + with ray.rllib.utils.test_utils.check(). unroll_id is compared only if + both batches have an unroll_id. + + Args: + batch1: Batch to compare against batch2 + batch2: Batch to compare against batch1 + """ + # Avoids circular import + from ray.rllib.policy.sample_batch import MultiAgentBatch, SampleBatch + + assert type(batch1) is type( + batch2 + ), "Input batches are of different types {} and {}".format( + str(type(batch1)), str(type(batch2)) + ) + + def check_sample_batches(_batch1, _batch2, _policy_id=None): + unroll_id_1 = _batch1.get("unroll_id", None) + unroll_id_2 = _batch2.get("unroll_id", None) + # unroll IDs only have to fit if both batches have them + if unroll_id_1 is not None and unroll_id_2 is not None: + assert unroll_id_1 == unroll_id_2 + + batch1_keys = set() + for k, v in _batch1.items(): + # unroll_id is compared above already + if k == "unroll_id": + continue + check(v, _batch2[k]) + batch1_keys.add(k) + + batch2_keys = set(_batch2.keys()) + # unroll_id is compared above already + batch2_keys.discard("unroll_id") + _difference = batch1_keys.symmetric_difference(batch2_keys) + + # Cases where one batch has info and the other has not + if _policy_id: + assert not _difference, ( + "SampleBatches for policy with ID {} " + "don't share 
information on the " + "following information: \n{}" + "".format(_policy_id, _difference) + ) + else: + assert not _difference, ( + "SampleBatches don't share information " + "on the following information: \n{}" + "".format(_difference) + ) + + if type(batch1) is SampleBatch: + check_sample_batches(batch1, batch2) + elif type(batch1) is MultiAgentBatch: + assert batch1.count == batch2.count + batch1_ids = set() + for policy_id, policy_batch in batch1.policy_batches.items(): + check_sample_batches( + policy_batch, batch2.policy_batches[policy_id], policy_id + ) + batch1_ids.add(policy_id) + + # Case where one ma batch has info on a policy the other has not + batch2_ids = set(batch2.policy_batches.keys()) + difference = batch1_ids.symmetric_difference(batch2_ids) + assert ( + not difference + ), f"MultiAgentBatches don't share the following information: \n{difference}." + else: + raise ValueError("Unsupported batch type " + str(type(batch1))) + + +def check_reproducibilty( + algo_class: Type["Algorithm"], + algo_config: "AlgorithmConfig", + *, + fw_kwargs: Dict[str, Any], + training_iteration: int = 1, +) -> None: + # TODO @kourosh: we can get rid of examples/deterministic_training.py once + # this is added to all algorithms + """Check if the algorithm is reproducible across different testing conditions: + + frameworks: all input frameworks + num_gpus: int(os.environ.get("RLLIB_NUM_GPUS", "0")) + num_workers: 0 (only local workers) or + 4 ((1) local workers + (4) remote workers) + num_envs_per_env_runner: 2 + + Args: + algo_class: Algorithm class to test. + algo_config: Base config to use for the algorithm. + fw_kwargs: Framework iterator keyword arguments. + training_iteration: Number of training iterations to run. + + Returns: + None + + Raises: + It raises an AssertionError if the algorithm is not reproducible. 
+ """ + from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID + from ray.rllib.utils.metrics.learner_info import LEARNER_INFO + + stop_dict = {TRAINING_ITERATION: training_iteration} + # use 0 and 2 workers (for more that 4 workers we have to make sure the instance + # type in ci build has enough resources) + for num_workers in [0, 2]: + algo_config = ( + algo_config.debugging(seed=42).env_runners( + num_env_runners=num_workers, num_envs_per_env_runner=2 + ) + # new API + .learners( + num_gpus_per_learner=int(os.environ.get("RLLIB_NUM_GPUS", "0")), + ) + # old API + .resources( + num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0")), + ) + ) + + print( + f"Testing reproducibility of {algo_class.__name__}" + f" with {num_workers} workers" + ) + print("/// config") + pprint.pprint(algo_config.to_dict()) + # test tune.Tuner().fit() reproducibility + results1 = tune.Tuner( + algo_class, + param_space=algo_config.to_dict(), + run_config=train.RunConfig(stop=stop_dict, verbose=1), + ).fit() + results1 = results1.get_best_result().metrics + + results2 = tune.Tuner( + algo_class, + param_space=algo_config.to_dict(), + run_config=train.RunConfig(stop=stop_dict, verbose=1), + ).fit() + results2 = results2.get_best_result().metrics + + # Test rollout behavior. + check( + results1[ENV_RUNNER_RESULTS]["hist_stats"], + results2[ENV_RUNNER_RESULTS]["hist_stats"], + ) + # As well as training behavior (minibatch sequence during SGD + # iterations). + # As well as training behavior (minibatch sequence during SGD + # iterations). 
+ if algo_config.enable_rl_module_and_learner: + check( + results1["info"][LEARNER_INFO][DEFAULT_POLICY_ID], + results2["info"][LEARNER_INFO][DEFAULT_POLICY_ID], + ) + else: + check( + results1["info"][LEARNER_INFO][DEFAULT_POLICY_ID]["learner_stats"], + results2["info"][LEARNER_INFO][DEFAULT_POLICY_ID]["learner_stats"], + ) + + +def get_cartpole_dataset_reader(batch_size: int = 1) -> "DatasetReader": + """Returns a DatasetReader for the cartpole dataset. + Args: + batch_size: The batch size to use for the reader. + Returns: + A rllib DatasetReader for the cartpole dataset. + """ + from ray.rllib.algorithms import AlgorithmConfig + from ray.rllib.offline import IOContext + from ray.rllib.offline.dataset_reader import ( + DatasetReader, + get_dataset_and_shards, + ) + + path = "tests/data/cartpole/large.json" + input_config = {"format": "json", "paths": path} + dataset, _ = get_dataset_and_shards( + AlgorithmConfig().offline_data(input_="dataset", input_config=input_config) + ) + ioctx = IOContext( + config=( + AlgorithmConfig() + .training(train_batch_size=batch_size) + .offline_data(actions_in_input_normalized=True) + ), + worker_index=0, + ) + reader = DatasetReader(dataset, ioctx) + return reader + + +class ModelChecker: + """Helper class to compare architecturally identical Models across frameworks. + + Holds a ModelConfig, such that individual models can be added simply via their + framework string (by building them with config.build(framework=...). + A call to `check()` forces all added models to be compared in terms of their + number of trainable and non-trainable parameters, as well as, their + computation results given a common weights structure and values and identical + inputs to the models. + """ + + def __init__(self, config): + self.config = config + + # To compare number of params between frameworks. + self.param_counts = {} + # To compare computed outputs from fixed-weights-nets between frameworks. 
+ self.output_values = {} + + # We will pass an observation filled with this one random value through + # all DL networks (after they have been set to fixed-weights) to compare + # the computed outputs. + self.random_fill_input_value = np.random.uniform(-0.01, 0.01) + + # Dict of models to check against each other. + self.models = {} + + def add(self, framework: str = "torch", obs=True, state=False) -> Any: + """Builds a new Model for the given framework.""" + model = self.models[framework] = self.config.build(framework=framework) + + # Pass a B=1 observation through the model. + inputs = np.full( + [1] + ([1] if state else []) + list(self.config.input_dims), + self.random_fill_input_value, + ) + if obs: + inputs = {Columns.OBS: inputs} + if state: + inputs[Columns.STATE_IN] = tree.map_structure( + lambda s: np.zeros(shape=[1] + list(s)), state + ) + if framework == "torch": + from ray.rllib.utils.torch_utils import convert_to_torch_tensor + + inputs = convert_to_torch_tensor(inputs) + # w/ old specs: inputs = model.input_specs.fill(self.random_fill_input_value) + + outputs = model(inputs) + + # Bring model into a reproducible, comparable state (so we can compare + # computations across frameworks). Use only a value-sequence of len=1 here + # as it could possibly be that the layers are stored in different order + # across the different frameworks. + model._set_to_dummy_weights(value_sequence=(self.random_fill_input_value,)) + + # Perform another forward pass. + comparable_outputs = model(inputs) + + # Store the number of parameters for this framework's net. + self.param_counts[framework] = model.get_num_parameters() + # Store the fixed-weights-net outputs for this framework's net. 
+ if framework == "torch": + self.output_values[framework] = tree.map_structure( + lambda s: s.detach().numpy() if s is not None else None, + comparable_outputs, + ) + else: + self.output_values[framework] = tree.map_structure( + lambda s: s.numpy() if s is not None else None, comparable_outputs + ) + return outputs + + def check(self): + """Compares all added Models with each other and possibly raises errors.""" + + main_key = next(iter(self.models.keys())) + # Compare number of trainable and non-trainable params between all + # frameworks. + for c in self.param_counts.values(): + check(c, self.param_counts[main_key]) + + # Compare dummy outputs by exact values given that all nets received the + # same input and all nets have the same (dummy) weight values. + for v in self.output_values.values(): + check(v, self.output_values[main_key], atol=0.0005) + + +def _get_mean_action_from_algorithm(alg: "Algorithm", obs: np.ndarray) -> np.ndarray: + """Returns the mean action computed by the given algorithm. + + Note: This makes calls to `Algorithm.compute_single_action` + + Args: + alg: The constructed algorithm to run inference on. + obs: The observation to compute the action for. + + Returns: + The mean action computed by the algorithm over 5000 samples. + + """ + out = [] + for _ in range(5000): + out.append(float(alg.compute_single_action(obs))) + return np.mean(out) + + +def check_supported_spaces( + alg: str, + config: "AlgorithmConfig", + train: bool = True, + check_bounds: bool = False, + frameworks: Optional[Tuple[str]] = None, + use_gpu: bool = False, +): + """Checks whether the given algorithm supports different action and obs spaces. + + Performs the checks by constructing an rllib algorithm from the config and + checking to see that the model inside the policy is the correct one given + the action and obs spaces. 
For example if the action space is discrete and + the obs space is an image, then the model should be a vision network with + a categorical action distribution. + + Args: + alg: The name of the algorithm to test. + config: The config to use for the algorithm. + train: Whether to train the algorithm for a few iterations. + check_bounds: Whether to check the bounds of the action space. + frameworks: The frameworks to test the algorithm with. + use_gpu: Whether to check support for training on a gpu. + + + """ + # Do these imports here because otherwise we have circular imports. + from ray.rllib.examples.envs.classes.random_env import RandomEnv + from ray.rllib.models.torch.complex_input_net import ( + ComplexInputNetwork as TorchComplexNet, + ) + from ray.rllib.models.torch.fcnet import FullyConnectedNetwork as TorchFCNet + from ray.rllib.models.torch.visionnet import VisionNetwork as TorchVisionNet + + action_spaces_to_test = { + # Test discrete twice here until we support multi_binary action spaces + "discrete": Discrete(5), + "continuous": Box(-1.0, 1.0, (5,), dtype=np.float32), + "int_actions": Box(0, 3, (2, 3), dtype=np.int32), + "multidiscrete": MultiDiscrete([1, 2, 3, 4]), + "tuple": GymTuple( + [Discrete(2), Discrete(3), Box(-1.0, 1.0, (5,), dtype=np.float32)] + ), + "dict": GymDict( + { + "action_choice": Discrete(3), + "parameters": Box(-1.0, 1.0, (1,), dtype=np.float32), + "yet_another_nested_dict": GymDict( + {"a": GymTuple([Discrete(2), Discrete(3)])} + ), + } + ), + } + + observation_spaces_to_test = { + "multi_binary": MultiBinary([3, 10, 10]), + "discrete": Discrete(5), + "continuous": Box(-1.0, 1.0, (5,), dtype=np.float32), + "vector2d": Box(-1.0, 1.0, (5, 5), dtype=np.float32), + "image": Box(-1.0, 1.0, (84, 84, 1), dtype=np.float32), + "tuple": GymTuple([Discrete(10), Box(-1.0, 1.0, (5,), dtype=np.float32)]), + "dict": GymDict( + { + "task": Discrete(10), + "position": Box(-1.0, 1.0, (5,), dtype=np.float32), + } + ), + } + + # The observation 
spaces that we test RLModules with + rlmodule_supported_observation_spaces = [ + "multi_binary", + "discrete", + "continuous", + "image", + "tuple", + "dict", + ] + + # The action spaces that we test RLModules with + rlmodule_supported_action_spaces = ["discrete", "continuous"] + + default_observation_space = default_action_space = "discrete" + + config["log_level"] = "ERROR" + config["env"] = RandomEnv + + def _do_check(alg, config, a_name, o_name): + # We need to copy here so that this validation does not affect the actual + # validation method call further down the line. + config_copy = config.copy() + config_copy.validate() + # If RLModules are enabled, we need to skip a few tests for now: + if config_copy.enable_rl_module_and_learner: + # Skip PPO cases in which RLModules don't support the given spaces yet. + if o_name not in rlmodule_supported_observation_spaces: + logger.warning( + "Skipping PPO test with RLModules for obs space {}".format(o_name) + ) + return + if a_name not in rlmodule_supported_action_spaces: + logger.warning( + "Skipping PPO test with RLModules for action space {}".format( + a_name + ) + ) + return + + fw = config["framework"] + action_space = action_spaces_to_test[a_name] + obs_space = observation_spaces_to_test[o_name] + print( + "=== Testing {} (fw={}) action_space={} obs_space={} ===".format( + alg, fw, action_space, obs_space + ) + ) + t0 = time.time() + config.update_from_dict( + dict( + env_config=dict( + action_space=action_space, + observation_space=obs_space, + reward_space=Box(1.0, 1.0, shape=(), dtype=np.float32), + p_terminated=1.0, + check_action_bounds=check_bounds, + ) + ) + ) + stat = "ok" + + try: + algo = config.build() + except ray.exceptions.RayActorError as e: + if len(e.args) >= 2 and isinstance(e.args[2], UnsupportedSpaceException): + stat = "unsupported" + elif isinstance(e.args[0].args[2], UnsupportedSpaceException): + stat = "unsupported" + else: + raise + except UnsupportedSpaceException: + stat = 
"unsupported" + else: + if alg not in ["SAC", "PPO"]: + # 2D (image) input: Expect VisionNet. + if o_name in ["atari", "image"]: + assert isinstance(algo.get_policy().model, TorchVisionNet) + # 1D input: Expect FCNet. + elif o_name == "continuous": + assert isinstance(algo.get_policy().model, TorchFCNet) + # Could be either one: ComplexNet (if disabled Preprocessor) + # or FCNet (w/ Preprocessor). + elif o_name == "vector2d": + assert isinstance( + algo.get_policy().model, (TorchComplexNet, TorchFCNet) + ) + if train: + algo.train() + algo.stop() + print("Test: {}, ran in {}s".format(stat, time.time() - t0)) + + if not frameworks: + frameworks = ("tf2", "tf", "torch") + + _do_check_remote = ray.remote(_do_check) + _do_check_remote = _do_check_remote.options(num_gpus=1 if use_gpu else 0) + # Test all action spaces first. + for a_name in action_spaces_to_test.keys(): + o_name = default_observation_space + ray.get(_do_check_remote.remote(alg, config, a_name, o_name)) + + # Now test all observation spaces. 
+ for o_name in observation_spaces_to_test.keys(): + a_name = default_action_space + ray.get(_do_check_remote.remote(alg, config, a_name, o_name)) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/utils/tf_utils.py b/.venv/lib/python3.11/site-packages/ray/rllib/utils/tf_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..c32ad32d268658ff40b8846ab018c876e1f986c1 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/utils/tf_utils.py @@ -0,0 +1,812 @@ +import logging +from typing import Any, Callable, List, Optional, Type, TYPE_CHECKING, Union + +import gymnasium as gym +import numpy as np +import tree # pip install dm_tree +from gymnasium.spaces import Discrete, MultiDiscrete + +from ray.rllib.utils.annotations import PublicAPI, DeveloperAPI +from ray.rllib.utils.framework import try_import_tf +from ray.rllib.utils.numpy import SMALL_NUMBER +from ray.rllib.utils.spaces.space_utils import get_base_struct_from_space +from ray.rllib.utils.typing import ( + LocalOptimizer, + ModelGradients, + NetworkType, + PartialAlgorithmConfigDict, + SpaceStruct, + TensorStructType, + TensorType, +) + +if TYPE_CHECKING: + from ray.rllib.algorithms.algorithm_config import AlgorithmConfig + from ray.rllib.core.learner.learner import ParamDict + from ray.rllib.policy.eager_tf_policy import EagerTFPolicy + from ray.rllib.policy.eager_tf_policy_v2 import EagerTFPolicyV2 + from ray.rllib.policy.tf_policy import TFPolicy + +logger = logging.getLogger(__name__) +tf1, tf, tfv = try_import_tf() + + +@PublicAPI +def clip_gradients( + gradients_dict: "ParamDict", + *, + grad_clip: Optional[float] = None, + grad_clip_by: str, +) -> Optional[float]: + """Performs gradient clipping on a grad-dict based on a clip value and clip mode. + + Changes the provided gradient dict in place. + + Args: + gradients_dict: The gradients dict, mapping str to gradient tensors. + grad_clip: The value to clip with. 
The way gradients are clipped is defined + by the `grad_clip_by` arg (see below). + grad_clip_by: One of 'value', 'norm', or 'global_norm'. + + Returns: + If `grad_clip_by`="global_norm" and `grad_clip` is not None, returns the global + norm of all tensors, otherwise returns None. + """ + # No clipping, return. + if grad_clip is None: + return + + # Clip by value (each gradient individually). + if grad_clip_by == "value": + for k, v in gradients_dict.copy().items(): + gradients_dict[k] = tf.clip_by_value(v, -grad_clip, grad_clip) + + # Clip by L2-norm (per gradient tensor). + elif grad_clip_by == "norm": + for k, v in gradients_dict.copy().items(): + gradients_dict[k] = tf.clip_by_norm(v, grad_clip) + + # Clip by global L2-norm (across all gradient tensors). + else: + assert grad_clip_by == "global_norm" + + clipped_grads, global_norm = tf.clip_by_global_norm( + list(gradients_dict.values()), grad_clip + ) + for k, v in zip(gradients_dict.copy().keys(), clipped_grads): + gradients_dict[k] = v + + # Return the computed global norm scalar. + return global_norm + + +@PublicAPI +def explained_variance(y: TensorType, pred: TensorType) -> TensorType: + """Computes the explained variance for a pair of labels and predictions. + + The formula used is: + max(-1.0, 1.0 - (std(y - pred)^2 / std(y)^2)) + + Args: + y: The labels. + pred: The predictions. + + Returns: + The explained variance given a pair of labels and predictions. + """ + _, y_var = tf.nn.moments(y, axes=[0]) + _, diff_var = tf.nn.moments(y - pred, axes=[0]) + return tf.maximum(-1.0, 1 - (diff_var / (y_var + SMALL_NUMBER))) + + +@PublicAPI +def flatten_inputs_to_1d_tensor( + inputs: TensorStructType, + spaces_struct: Optional[SpaceStruct] = None, + time_axis: bool = False, +) -> TensorType: + """Flattens arbitrary input structs according to the given spaces struct. + + Returns a single 1D tensor resulting from the different input + components' values. 
+ + Thereby: + - Boxes (any shape) get flattened to (B, [T]?, -1). Note that image boxes + are not treated differently from other types of Boxes and get + flattened as well. + - Discrete (int) values are one-hot'd, e.g. a batch of [1, 0, 3] (B=3 with + Discrete(4) space) results in [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1]]. + - MultiDiscrete values are multi-one-hot'd, e.g. a batch of + [[0, 2], [1, 4]] (B=2 with MultiDiscrete([2, 5]) space) results in + [[1, 0, 0, 0, 1, 0, 0], [0, 1, 0, 0, 0, 0, 1]]. + + Args: + inputs: The inputs to be flattened. + spaces_struct: The structure of the spaces that behind the input + time_axis: Whether all inputs have a time-axis (after the batch axis). + If True, will keep not only the batch axis (0th), but the time axis + (1st) as-is and flatten everything from the 2nd axis up. + + Returns: + A single 1D tensor resulting from concatenating all + flattened/one-hot'd input components. Depending on the time_axis flag, + the shape is (B, n) or (B, T, n). + + .. testcode:: + :skipif: True + + # B=2 + from ray.rllib.utils.tf_utils import flatten_inputs_to_1d_tensor + from gymnasium.spaces import Discrete, Box + out = flatten_inputs_to_1d_tensor( + {"a": [1, 0], "b": [[[0.0], [0.1]], [1.0], [1.1]]}, + spaces_struct=dict(a=Discrete(2), b=Box(shape=(2, 1))) + ) + print(out) + + # B=2; T=2 + out = flatten_inputs_to_1d_tensor( + ([[1, 0], [0, 1]], + [[[0.0, 0.1], [1.0, 1.1]], [[2.0, 2.1], [3.0, 3.1]]]), + spaces_struct=tuple([Discrete(2), Box(shape=(2, ))]), + time_axis=True + ) + print(out) + + .. 
testoutput:: + + [[0.0, 1.0, 0.0, 0.1], [1.0, 0.0, 1.0, 1.1]] # B=2 n=4 + [[[0.0, 1.0, 0.0, 0.1], [1.0, 0.0, 1.0, 1.1]], + [[1.0, 0.0, 2.0, 2.1], [0.0, 1.0, 3.0, 3.1]]] # B=2 T=2 n=4 + """ + + flat_inputs = tree.flatten(inputs) + flat_spaces = ( + tree.flatten(spaces_struct) + if spaces_struct is not None + else [None] * len(flat_inputs) + ) + + B = None + T = None + out = [] + for input_, space in zip(flat_inputs, flat_spaces): + input_ = tf.convert_to_tensor(input_) + shape = tf.shape(input_) + # Store batch and (if applicable) time dimension. + if B is None: + B = shape[0] + if time_axis: + T = shape[1] + + # One-hot encoding. + if isinstance(space, Discrete): + if time_axis: + input_ = tf.reshape(input_, [B * T]) + out.append(tf.cast(one_hot(input_, space), tf.float32)) + elif isinstance(space, MultiDiscrete): + if time_axis: + input_ = tf.reshape(input_, [B * T, -1]) + out.append(tf.cast(one_hot(input_, space), tf.float32)) + # Flatten. + else: + if time_axis: + input_ = tf.reshape(input_, [B * T, -1]) + else: + input_ = tf.reshape(input_, [B, -1]) + out.append(tf.cast(input_, tf.float32)) + + merged = tf.concat(out, axis=-1) + # Restore the time-dimension, if applicable. + if time_axis: + merged = tf.reshape(merged, [B, T, -1]) + + return merged + + +@PublicAPI +def get_gpu_devices() -> List[str]: + """Returns a list of GPU device names, e.g. ["/gpu:0", "/gpu:1"]. + + Supports both tf1.x and tf2.x. + + Returns: + List of GPU device names (str). + """ + if tfv == 1: + from tensorflow.python.client import device_lib + + devices = device_lib.list_local_devices() + else: + try: + devices = tf.config.list_physical_devices() + except Exception: + devices = tf.config.experimental.list_physical_devices() + + # Expect "GPU", but also stuff like: "XLA_GPU". 
+ return [d.name for d in devices if "GPU" in d.device_type] + + +@PublicAPI +def get_placeholder( + *, + space: Optional[gym.Space] = None, + value: Optional[Any] = None, + name: Optional[str] = None, + time_axis: bool = False, + flatten: bool = True, +) -> "tf1.placeholder": + """Returns a tf1.placeholder object given optional hints, such as a space. + + Note that the returned placeholder will always have a leading batch + dimension (None). + + Args: + space: An optional gym.Space to hint the shape and dtype of the + placeholder. + value: An optional value to hint the shape and dtype of the + placeholder. + name: An optional name for the placeholder. + time_axis: Whether the placeholder should also receive a time + dimension (None). + flatten: Whether to flatten the given space into a plain Box space + and then create the placeholder from the resulting space. + + Returns: + The tf1 placeholder. + """ + from ray.rllib.models.catalog import ModelCatalog + + if space is not None: + if isinstance(space, (gym.spaces.Dict, gym.spaces.Tuple)): + if flatten: + return ModelCatalog.get_action_placeholder(space, None) + else: + return tree.map_structure_with_path( + lambda path, component: get_placeholder( + space=component, + name=name + "." 
+ ".".join([str(p) for p in path]), + ), + get_base_struct_from_space(space), + ) + return tf1.placeholder( + shape=(None,) + ((None,) if time_axis else ()) + space.shape, + dtype=tf.float32 if space.dtype == np.float64 else space.dtype, + name=name, + ) + else: + assert value is not None + shape = value.shape[1:] + return tf1.placeholder( + shape=(None,) + + ((None,) if time_axis else ()) + + (shape if isinstance(shape, tuple) else tuple(shape.as_list())), + dtype=tf.float32 if value.dtype == np.float64 else value.dtype, + name=name, + ) + + +@PublicAPI +def get_tf_eager_cls_if_necessary( + orig_cls: Type["TFPolicy"], + config: Union["AlgorithmConfig", PartialAlgorithmConfigDict], +) -> Type[Union["TFPolicy", "EagerTFPolicy", "EagerTFPolicyV2"]]: + """Returns the corresponding tf-eager class for a given TFPolicy class. + + Args: + orig_cls: The original TFPolicy class to get the corresponding tf-eager + class for. + config: The Algorithm config dict or AlgorithmConfig object. + + Returns: + The tf eager policy class corresponding to the given TFPolicy class. + """ + cls = orig_cls + framework = config.get("framework", "tf") + + if framework in ["tf2", "tf"] and not tf1: + raise ImportError("Could not import tensorflow!") + + if framework == "tf2": + if not tf1.executing_eagerly(): + tf1.enable_eager_execution() + assert tf1.executing_eagerly() + + from ray.rllib.policy.tf_policy import TFPolicy + from ray.rllib.policy.eager_tf_policy import EagerTFPolicy + from ray.rllib.policy.eager_tf_policy_v2 import EagerTFPolicyV2 + + # Create eager-class (if not already one). + if hasattr(orig_cls, "as_eager") and not issubclass(orig_cls, EagerTFPolicy): + cls = orig_cls.as_eager() + # Could be some other type of policy or already + # eager-ized. + elif not issubclass(orig_cls, TFPolicy): + pass + else: + raise ValueError( + "This policy does not support eager execution: {}".format(orig_cls) + ) + + # Now that we know, policy is an eager one, add tracing, if necessary. 
+ if config.get("eager_tracing") and issubclass( + cls, (EagerTFPolicy, EagerTFPolicyV2) + ): + cls = cls.with_tracing() + return cls + + +@PublicAPI +def huber_loss(x: TensorType, delta: float = 1.0) -> TensorType: + """Computes the huber loss for a given term and delta parameter. + + Reference: https://en.wikipedia.org/wiki/Huber_loss + Note that the factor of 0.5 is implicitly included in the calculation. + + Formula: + L = 0.5 * x^2 for small abs x (delta threshold) + L = delta * (abs(x) - 0.5*delta) for larger abs x (delta threshold) + + Args: + x: The input term, e.g. a TD error. + delta: The delta parmameter in the above formula. + + Returns: + The Huber loss resulting from `x` and `delta`. + """ + return tf.where( + tf.abs(x) < delta, # for small x -> apply the Huber correction + tf.math.square(x) * 0.5, + delta * (tf.abs(x) - 0.5 * delta), + ) + + +@PublicAPI +def l2_loss(x: TensorType) -> TensorType: + """Computes half the L2 norm over a tensor's values without the sqrt. + + output = 0.5 * sum(x ** 2) + + Args: + x: The input tensor. + + Returns: + 0.5 times the L2 norm over the given tensor's values (w/o sqrt). + """ + return 0.5 * tf.reduce_sum(tf.pow(x, 2.0)) + + +@PublicAPI +def make_tf_callable( + session_or_none: Optional["tf1.Session"], dynamic_shape: bool = False +) -> Callable: + """Returns a function that can be executed in either graph or eager mode. + + The function must take only positional args. + + If eager is enabled, this will act as just a function. Otherwise, it + will build a function that executes a session run with placeholders + internally. + + Args: + session_or_none: tf.Session if in graph mode, else None. + dynamic_shape: True if the placeholders should have a dynamic + batch dimension. Otherwise they will be fixed shape. + + Returns: + A function that can be called in either eager or static-graph mode. 
+ """ + + if tf.executing_eagerly(): + assert session_or_none is None + else: + assert session_or_none is not None + + def make_wrapper(fn): + # Static-graph mode: Create placeholders and make a session call each + # time the wrapped function is called. Returns the output of this + # session call. + if session_or_none is not None: + args_placeholders = [] + kwargs_placeholders = {} + + symbolic_out = [None] + + def call(*args, **kwargs): + args_flat = [] + for a in args: + if type(a) is list: + args_flat.extend(a) + else: + args_flat.append(a) + args = args_flat + + # We have not built any placeholders yet: Do this once here, + # then reuse the same placeholders each time we call this + # function again. + if symbolic_out[0] is None: + with session_or_none.graph.as_default(): + + def _create_placeholders(path, value): + if dynamic_shape: + if len(value.shape) > 0: + shape = (None,) + value.shape[1:] + else: + shape = () + else: + shape = value.shape + return tf1.placeholder( + dtype=value.dtype, + shape=shape, + name=".".join([str(p) for p in path]), + ) + + placeholders = tree.map_structure_with_path( + _create_placeholders, args + ) + for ph in tree.flatten(placeholders): + args_placeholders.append(ph) + + placeholders = tree.map_structure_with_path( + _create_placeholders, kwargs + ) + for k, ph in placeholders.items(): + kwargs_placeholders[k] = ph + + symbolic_out[0] = fn(*args_placeholders, **kwargs_placeholders) + feed_dict = dict(zip(args_placeholders, tree.flatten(args))) + tree.map_structure( + lambda ph, v: feed_dict.__setitem__(ph, v), + kwargs_placeholders, + kwargs, + ) + ret = session_or_none.run(symbolic_out[0], feed_dict) + return ret + + return call + # Eager mode (call function as is). + else: + return fn + + return make_wrapper + + +# TODO (sven): Deprecate this function once we have moved completely to the Learner API. +# Replaced with `clip_gradients()`. 
+@PublicAPI +def minimize_and_clip( + optimizer: LocalOptimizer, + objective: TensorType, + var_list: List["tf.Variable"], + clip_val: float = 10.0, +) -> ModelGradients: + """Computes, then clips gradients using objective, optimizer and var list. + + Ensures the norm of the gradients for each variable is clipped to + `clip_val`. + + Args: + optimizer: Either a shim optimizer (tf eager) containing a + tf.GradientTape under `self.tape` or a tf1 local optimizer + object. + objective: The loss tensor to calculate gradients on. + var_list: The list of tf.Variables to compute gradients over. + clip_val: The global norm clip value. Will clip around -clip_val and + +clip_val. + + Returns: + The resulting model gradients (list or tuples of grads + vars) + corresponding to the input `var_list`. + """ + # Accidentally passing values < 0.0 will break all gradients. + assert clip_val is None or clip_val > 0.0, clip_val + + if tf.executing_eagerly(): + tape = optimizer.tape + grads_and_vars = list(zip(list(tape.gradient(objective, var_list)), var_list)) + else: + grads_and_vars = optimizer.compute_gradients(objective, var_list=var_list) + + return [ + (tf.clip_by_norm(g, clip_val) if clip_val is not None else g, v) + for (g, v) in grads_and_vars + if g is not None + ] + + +@PublicAPI +def one_hot(x: TensorType, space: gym.Space) -> TensorType: + """Returns a one-hot tensor, given and int tensor and a space. + + Handles the MultiDiscrete case as well. + + Args: + x: The input tensor. + space: The space to use for generating the one-hot tensor. + + Returns: + The resulting one-hot tensor. + + Raises: + ValueError: If the given space is not a discrete one. + + .. testcode:: + :skipif: True + + import gymnasium as gym + import tensorflow as tf + from ray.rllib.utils.tf_utils import one_hot + x = tf.Variable([0, 3], dtype=tf.int32) # batch-dim=2 + # Discrete space with 4 (one-hot) slots per batch item. + s = gym.spaces.Discrete(4) + one_hot(x, s) + + .. testoutput:: + + + + .. 
testcode:: + :skipif: True + + x = tf.Variable([[0, 1, 2, 3]], dtype=tf.int32) # batch-dim=1 + # MultiDiscrete space with 5 + 4 + 4 + 7 = 20 (one-hot) slots + # per batch item. + s = gym.spaces.MultiDiscrete([5, 4, 4, 7]) + one_hot(x, s) + + .. testoutput:: + + + """ + if isinstance(space, Discrete): + return tf.one_hot(x, space.n, dtype=tf.float32) + elif isinstance(space, MultiDiscrete): + if isinstance(space.nvec[0], np.ndarray): + nvec = np.ravel(space.nvec) + x = tf.reshape(x, (x.shape[0], -1)) + else: + nvec = space.nvec + return tf.concat( + [tf.one_hot(x[:, i], n, dtype=tf.float32) for i, n in enumerate(nvec)], + axis=-1, + ) + else: + raise ValueError("Unsupported space for `one_hot`: {}".format(space)) + + +@PublicAPI +def reduce_mean_ignore_inf(x: TensorType, axis: Optional[int] = None) -> TensorType: + """Same as tf.reduce_mean() but ignores -inf values. + + Args: + x: The input tensor to reduce mean over. + axis: The axis over which to reduce. None for all axes. + + Returns: + The mean reduced inputs, ignoring inf values. + """ + mask = tf.not_equal(x, tf.float32.min) + x_zeroed = tf.where(mask, x, tf.zeros_like(x)) + return tf.math.reduce_sum(x_zeroed, axis) / tf.math.reduce_sum( + tf.cast(mask, tf.float32), axis + ) + + +@PublicAPI +def scope_vars( + scope: Union[str, "tf1.VariableScope"], trainable_only: bool = False +) -> List["tf.Variable"]: + """Get variables inside a given scope. + + Args: + scope: Scope in which the variables reside. + trainable_only: Whether or not to return only the variables that were + marked as trainable. + + Returns: + The list of variables in the given `scope`. + """ + return tf1.get_collection( + tf1.GraphKeys.TRAINABLE_VARIABLES + if trainable_only + else tf1.GraphKeys.VARIABLES, + scope=scope if isinstance(scope, str) else scope.name, + ) + + +@PublicAPI +def symlog(x: "tf.Tensor") -> "tf.Tensor": + """The symlog function as described in [1]: + + [1] Mastering Diverse Domains through World Models - 2023 + D. 
Hafner, J. Pasukonis, J. Ba, T. Lillicrap + https://arxiv.org/pdf/2301.04104v1.pdf + """ + return tf.math.sign(x) * tf.math.log(tf.math.abs(x) + 1) + + +@PublicAPI +def inverse_symlog(y: "tf.Tensor") -> "tf.Tensor": + """Inverse of the `symlog` function as desribed in [1]: + + [1] Mastering Diverse Domains through World Models - 2023 + D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap + https://arxiv.org/pdf/2301.04104v1.pdf + """ + # To get to symlog inverse, we solve the symlog equation for x: + # y = sign(x) * log(|x| + 1) + # <=> y / sign(x) = log(|x| + 1) + # <=> y = log( x + 1) V x >= 0 + # -y = log(-x + 1) V x < 0 + # <=> exp(y) = x + 1 V x >= 0 + # exp(-y) = -x + 1 V x < 0 + # <=> exp(y) - 1 = x V x >= 0 + # exp(-y) - 1 = -x V x < 0 + # <=> exp(y) - 1 = x V x >= 0 (if x >= 0, then y must also be >= 0) + # -exp(-y) - 1 = x V x < 0 (if x < 0, then y must also be < 0) + # <=> sign(y) * (exp(|y|) - 1) = x + return tf.math.sign(y) * (tf.math.exp(tf.math.abs(y)) - 1) + + +@PublicAPI +def two_hot( + value: "tf.Tensor", + num_buckets: int = 255, + lower_bound: float = -20.0, + upper_bound: float = 20.0, + dtype=None, +): + """Returns a two-hot vector of dim=num_buckets with two entries that are non-zero. + + See [1] for more details: + [1] Mastering Diverse Domains through World Models - 2023 + D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap + https://arxiv.org/pdf/2301.04104v1.pdf + + Entries in the vector represent equally sized buckets within some fixed range + (`lower_bound` to `upper_bound`). + Those entries not 0.0 at positions k and k+1 encode the actual `value` and sum + up to 1.0. They are the weights multiplied by the buckets values at k and k+1 for + retrieving `value`. 
+ + Example: + num_buckets=11 + lower_bound=-5 + upper_bound=5 + value=2.5 + -> [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0] + -> [-5 -4 -3 -2 -1 0 1 2 3 4 5] (0.5*2 + 0.5*3=2.5) + + Example: + num_buckets=5 + lower_bound=-1 + upper_bound=1 + value=0.1 + -> [0.0, 0.0, 0.8, 0.2, 0.0] + -> [-1 -0.5 0 0.5 1] (0.2*0.5 + 0.8*0=0.1) + + Args: + value: The input tensor of shape (B,) to be two-hot encoded. + num_buckets: The number of buckets to two-hot encode into. + lower_bound: The lower bound value used for the encoding. If input values are + lower than this boundary, they will be encoded as `lower_bound`. + upper_bound: The upper bound value used for the encoding. If input values are + higher than this boundary, they will be encoded as `upper_bound`. + + Returns: + The two-hot encoded tensor of shape (B, num_buckets). + """ + # First make sure, values are clipped. + value = tf.clip_by_value(value, lower_bound, upper_bound) + # Tensor of batch indices: [0, B=batch size). + batch_indices = tf.cast( + tf.range(0, tf.shape(value)[0]), + dtype=dtype or tf.float32, + ) + # Calculate the step deltas (how much space between each bucket's central value?). + bucket_delta = (upper_bound - lower_bound) / (num_buckets - 1) + # Compute the float indices (might be non-int numbers: sitting between two buckets). + idx = (-lower_bound + value) / bucket_delta + # k + k = tf.math.floor(idx) + # k+1 + kp1 = tf.math.ceil(idx) + # In case k == kp1 (idx is exactly on the bucket boundary), move kp1 up by 1.0. + # Otherwise, this would result in a NaN in the returned two-hot tensor. + kp1 = tf.where(tf.equal(k, kp1), kp1 + 1.0, kp1) + # Iff `kp1` is one beyond our last index (because incoming value is larger than + # `upper_bound`), move it to one before k (kp1's weight is going to be 0.0 anyways, + # so it doesn't matter where it points to; we are just avoiding an index error + # with this). 
+ kp1 = tf.where(tf.equal(kp1, num_buckets), kp1 - 2.0, kp1) + # The actual values found at k and k+1 inside the set of buckets. + values_k = lower_bound + k * bucket_delta + values_kp1 = lower_bound + kp1 * bucket_delta + # Compute the two-hot weights (adding up to 1.0) to use at index k and k+1. + weights_k = (value - values_kp1) / (values_k - values_kp1) + weights_kp1 = 1.0 - weights_k + # Compile a tensor of full paths (indices from batch index to feature index) to + # use for the scatter_nd op. + indices_k = tf.stack([batch_indices, k], -1) + indices_kp1 = tf.stack([batch_indices, kp1], -1) + indices = tf.concat([indices_k, indices_kp1], 0) + # The actual values (weights adding up to 1.0) to place at the computed indices. + updates = tf.concat([weights_k, weights_kp1], 0) + # Call the actual scatter update op, returning a zero-filled tensor, only changed + # at the given indices. + return tf.scatter_nd( + tf.cast(indices, tf.int32), + updates, + shape=(tf.shape(value)[0], num_buckets), + ) + + +@PublicAPI +def update_target_network( + main_net: NetworkType, + target_net: NetworkType, + tau: float, +) -> None: + """Updates a keras.Model target network using Polyak averaging. + + new_target_net_weight = ( + tau * main_net_weight + (1.0 - tau) * current_target_net_weight + ) + + Args: + main_net: The keras.Model to update from. + target_net: The target network to update. + tau: The tau value to use in the Polyak averaging formula. + """ + for old_var, current_var in zip(target_net.variables, main_net.variables): + updated_var = tau * current_var + (1.0 - tau) * old_var + old_var.assign(updated_var) + + +@PublicAPI +def zero_logps_from_actions(actions: TensorStructType) -> TensorType: + """Helper function useful for returning dummy logp's (0) for some actions. + + Args: + actions: The input actions. This can be any struct + of complex action components or a simple tensor of different + dimensions, e.g. [B], [B, 2], or {"a": [B, 4, 5], "b": [B]}. 
+ + Returns: + A 1D tensor of 0.0 (dummy logp's) matching the batch + dim of `actions` (shape=[B]). + """ + # Need to flatten `actions` in case we have a complex action space. + # Take the 0th component to extract the batch dim. + action_component = tree.flatten(actions)[0] + logp_ = tf.zeros_like(action_component, dtype=tf.float32) + # Logp's should be single values (but with the same batch dim as + # `deterministic_actions` or `stochastic_actions`). In case + # actions are just [B], zeros_like works just fine here, but if + # actions are [B, ...], we have to reduce logp back to just [B]. + while len(logp_.shape) > 1: + logp_ = logp_[:, 0] + return logp_ + + +@DeveloperAPI +def warn_if_infinite_kl_divergence( + policy: Type["TFPolicy"], mean_kl: TensorType +) -> None: + def print_warning(): + logger.warning( + "KL divergence is non-finite, this will likely destabilize your model and" + " the training process. Action(s) in a specific state have near-zero" + " probability. This can happen naturally in deterministic environments" + " where the optimal policy has zero mass for a specific action. To fix this" + " issue, consider setting the coefficient for the KL loss term to zero or" + " increasing policy entropy." + ) + return tf.constant(0.0) + + if policy.loss_initialized(): + tf.cond( + tf.math.is_inf(mean_kl), + false_fn=lambda: tf.constant(0.0), + true_fn=lambda: print_warning(), + ) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/utils/threading.py b/.venv/lib/python3.11/site-packages/ray/rllib/utils/threading.py new file mode 100644 index 0000000000000000000000000000000000000000..a9a4461dadbf69afa24ed8e9007fe326640dc7de --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/utils/threading.py @@ -0,0 +1,34 @@ +from typing import Callable + +from ray.rllib.utils.annotations import OldAPIStack + + +@OldAPIStack +def with_lock(func: Callable) -> Callable: + """Use as decorator (@withlock) around object methods that need locking. 
+ + Note: The object must have a self._lock = threading.Lock() property. + Locking thus works on the object level (no two locked methods of the same + object can be called asynchronously). + + Args: + func: The function to decorate/wrap. + + Returns: + The wrapped (object-level locked) function. + """ + + def wrapper(self, *a, **k): + try: + with self._lock: + return func(self, *a, **k) + except AttributeError as e: + if "has no attribute '_lock'" in e.args[0]: + raise AttributeError( + "Object {} must have a `self._lock` property (assigned " + "to a threading.RLock() object in its " + "constructor)!".format(self) + ) + raise e + + return wrapper diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/utils/torch_utils.py b/.venv/lib/python3.11/site-packages/ray/rllib/utils/torch_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..e3783d0583c5f9a2453dfc517e462c5d3eb503cd --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/utils/torch_utils.py @@ -0,0 +1,726 @@ +import logging +import os +import warnings +from typing import Dict, List, Optional, TYPE_CHECKING, Union + +import gymnasium as gym +from gymnasium.spaces import Discrete, MultiDiscrete +import numpy as np +from packaging import version +import tree # pip install dm_tree + +from ray.rllib.models.repeated_values import RepeatedValues +from ray.rllib.utils.annotations import PublicAPI, DeveloperAPI, OldAPIStack +from ray.rllib.utils.framework import try_import_torch +from ray.rllib.utils.numpy import SMALL_NUMBER +from ray.rllib.utils.typing import ( + LocalOptimizer, + NetworkType, + SpaceStruct, + TensorStructType, + TensorType, +) + +if TYPE_CHECKING: + from ray.rllib.core.learner.learner import ParamDict, ParamList + from ray.rllib.policy.torch_policy import TorchPolicy + from ray.rllib.policy.torch_policy_v2 import TorchPolicyV2 + +logger = logging.getLogger(__name__) +torch, nn = try_import_torch() + +# Limit values suitable for use as close to a -inf logit. 
These are useful +# since -inf / inf cause NaNs during backprop. +FLOAT_MIN = -3.4e38 +FLOAT_MAX = 3.4e38 + +if torch: + TORCH_COMPILE_REQUIRED_VERSION = version.parse("2.0.0") +else: + TORCH_COMPILE_REQUIRED_VERSION = ValueError( + "torch is not installed. " "TORCH_COMPILE_REQUIRED_VERSION is " "not defined." + ) + + +@OldAPIStack +def apply_grad_clipping( + policy: "TorchPolicy", optimizer: LocalOptimizer, loss: TensorType +) -> Dict[str, TensorType]: + """Applies gradient clipping to already computed grads inside `optimizer`. + + Note: This function does NOT perform an analogous operation as + tf.clip_by_global_norm. It merely clips by norm (per gradient tensor) and + then computes the global norm across all given tensors (but without clipping + by that global norm). + + Args: + policy: The TorchPolicy, which calculated `loss`. + optimizer: A local torch optimizer object. + loss: The torch loss tensor. + + Returns: + An info dict containing the "grad_norm" key and the resulting clipped + gradients. + """ + grad_gnorm = 0 + if policy.config["grad_clip"] is not None: + clip_value = policy.config["grad_clip"] + else: + clip_value = np.inf + + num_none_grads = 0 + for param_group in optimizer.param_groups: + # Make sure we only pass params with grad != None into torch + # clip_grad_norm_. Would fail otherwise. + params = list(filter(lambda p: p.grad is not None, param_group["params"])) + if params: + # PyTorch clips gradients inplace and returns the norm before clipping + # We therefore need to compute grad_gnorm further down (fixes #4965) + global_norm = nn.utils.clip_grad_norm_(params, clip_value) + + if isinstance(global_norm, torch.Tensor): + global_norm = global_norm.cpu().numpy() + + grad_gnorm += min(global_norm, clip_value) + else: + num_none_grads += 1 + + # Note (Kourosh): grads could indeed be zero. This method should still return + # grad_gnorm in that case. 
+ if num_none_grads == len(optimizer.param_groups): + # No grads available + return {} + return {"grad_gnorm": grad_gnorm} + + +@PublicAPI +def clip_gradients( + gradients_dict: "ParamDict", + *, + grad_clip: Optional[float] = None, + grad_clip_by: str = "value", +) -> TensorType: + """Performs gradient clipping on a grad-dict based on a clip value and clip mode. + + Changes the provided gradient dict in place. + + Args: + gradients_dict: The gradients dict, mapping str to gradient tensors. + grad_clip: The value to clip with. The way gradients are clipped is defined + by the `grad_clip_by` arg (see below). + grad_clip_by: One of 'value', 'norm', or 'global_norm'. + + Returns: + If `grad_clip_by`="global_norm" and `grad_clip` is not None, returns the global + norm of all tensors, otherwise returns None. + """ + # No clipping, return. + if grad_clip is None: + return + + # Clip by value (each gradient individually). + if grad_clip_by == "value": + for k, v in gradients_dict.copy().items(): + gradients_dict[k] = ( + None if v is None else torch.clip(v, -grad_clip, grad_clip) + ) + + # Clip by L2-norm (per gradient tensor). + elif grad_clip_by == "norm": + for k, v in gradients_dict.copy().items(): + if v is not None: + # Compute the L2-norm of the gradient tensor. + norm = v.norm(2).nan_to_num(neginf=-10e8, posinf=10e8) + # Clip all the gradients. + if norm > grad_clip: + v.mul_(grad_clip / norm) + + # Clip by global L2-norm (across all gradient tensors). + else: + assert ( + grad_clip_by == "global_norm" + ), f"`grad_clip_by` ({grad_clip_by}) must be one of [value|norm|global_norm]!" + gradients_list = list(gradients_dict.values()) + total_norm = compute_global_norm(gradients_list) + if len(gradients_list) == 0: + return total_norm + # We do want the coefficient to be in between 0.0 and 1.0, therefore + # if the global_norm is smaller than the clip value, we use the clip value + # as normalization constant. 
+ device = gradients_list[0].device + clip_coef = grad_clip / torch.maximum( + torch.tensor(grad_clip).to(device), total_norm + 1e-6 + ) + # Note: multiplying by the clamped coef is redundant when the coef is clamped to + # 1, but doing so avoids a `if clip_coef < 1:` conditional which can require a + # CPU <=> device synchronization when the gradients do not reside in CPU memory. + clip_coef_clamped = torch.clamp(clip_coef, max=1.0) + for g in gradients_list: + if g is not None: + g.detach().mul_(clip_coef_clamped.to(g.device)) + return total_norm + + +@PublicAPI +def compute_global_norm(gradients_list: "ParamList") -> TensorType: + """Computes the global norm for a gradients dict. + + Args: + gradients_list: The gradients list containing parameters. + + Returns: + Returns the global norm of all tensors in `gradients_list`. + """ + # Define the norm type to be L2. + norm_type = 2.0 + # If we have no grads, return zero. + if len(gradients_list) == 0: + return torch.tensor(0.0) + device = gradients_list[0].device + + # Compute the global norm. + total_norm = torch.norm( + torch.stack( + [ + torch.norm(g.detach(), norm_type) + # Note, we want to avoid overflow in the norm computation, this does + # not affect the gradients themselves as we clamp by multiplying and + # not by overriding tensor values. + .nan_to_num(neginf=-10e8, posinf=10e8).to(device) + for g in gradients_list + if g is not None + ] + ), + norm_type, + ).nan_to_num(neginf=-10e8, posinf=10e8) + if torch.logical_or(total_norm.isnan(), total_norm.isinf()): + raise RuntimeError( + f"The total norm of order {norm_type} for gradients from " + "`parameters` is non-finite, so it cannot be clipped. " + ) + # Return the global norm. + return total_norm + + +@OldAPIStack +def concat_multi_gpu_td_errors( + policy: Union["TorchPolicy", "TorchPolicyV2"] +) -> Dict[str, TensorType]: + """Concatenates multi-GPU (per-tower) TD error tensors given TorchPolicy. 
+ + TD-errors are extracted from the TorchPolicy via its tower_stats property. + + Args: + policy: The TorchPolicy to extract the TD-error values from. + + Returns: + A dict mapping strings "td_error" and "mean_td_error" to the + corresponding concatenated and mean-reduced values. + """ + td_error = torch.cat( + [ + t.tower_stats.get("td_error", torch.tensor([0.0])).to(policy.device) + for t in policy.model_gpu_towers + ], + dim=0, + ) + policy.td_error = td_error + return { + "td_error": td_error, + "mean_td_error": torch.mean(td_error), + } + + +@PublicAPI +def convert_to_torch_tensor( + x: TensorStructType, + device: Optional[str] = None, + pin_memory: bool = False, +): + """Converts any struct to torch.Tensors. + + Args: + x: Any (possibly nested) struct, the values in which will be + converted and returned as a new struct with all leaves converted + to torch tensors. + device: The device to create the tensor on. + pin_memory: If True, will call the `pin_memory()` method on the created tensors. + + Returns: + Any: A new struct with the same structure as `x`, but with all + values converted to torch Tensor types. This does not convert possibly + nested elements that are None because torch has no representation for that. + """ + + def mapping(item): + if item is None: + # Torch has no representation for `None`, so we return None + return item + + # Special handling of "Repeated" values. + if isinstance(item, RepeatedValues): + return RepeatedValues( + tree.map_structure(mapping, item.values), item.lengths, item.max_len + ) + + # Already torch tensor -> make sure it's on right device. + if torch.is_tensor(item): + tensor = item + # Numpy arrays. + elif isinstance(item, np.ndarray): + # Object type (e.g. info dicts in train batch): leave as-is. + # str type (e.g. agent_id in train batch): leave as-is. + if item.dtype == object or item.dtype.type is np.str_: + return item + # Non-writable numpy-arrays will cause PyTorch warning. 
+ elif item.flags.writeable is False: + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + tensor = torch.from_numpy(item) + # Already numpy: Wrap as torch tensor. + else: + tensor = torch.from_numpy(item) + # Everything else: Convert to numpy, then wrap as torch tensor. + else: + tensor = torch.from_numpy(np.asarray(item)) + + # Floatify all float64 tensors (but leave float16 as-is). + if tensor.is_floating_point() and str(tensor.dtype) != "torch.float16": + tensor = tensor.float() + + # Pin the tensor's memory (for faster transfer to GPU later). + if pin_memory and torch.cuda.is_available(): + tensor.pin_memory() + + return tensor if device is None else tensor.to(device) + + return tree.map_structure(mapping, x) + + +@PublicAPI +def copy_torch_tensors(x: TensorStructType, device: Optional[str] = None): + """Creates a copy of `x` and makes deep copies torch.Tensors in x. + + Also moves the copied tensors to the specified device (if not None). + + Note if an object in x is not a torch.Tensor, it will be shallow-copied. + + Args: + x : Any (possibly nested) struct possibly containing torch.Tensors. + device : The device to move the tensors to. + + Returns: + Any: A new struct with the same structure as `x`, but with all + torch.Tensors deep-copied and moved to the specified device. + + """ + + def mapping(item): + if isinstance(item, torch.Tensor): + return ( + torch.clone(item.detach()) + if device is None + else item.detach().to(device) + ) + else: + return item + + return tree.map_structure(mapping, x) + + +@PublicAPI +def explained_variance(y: TensorType, pred: TensorType) -> TensorType: + """Computes the explained variance for a pair of labels and predictions. + + The formula used is: + max(-1.0, 1.0 - (std(y - pred)^2 / std(y)^2)) + + Args: + y: The labels. + pred: The predictions. + + Returns: + The explained variance given a pair of labels and predictions. 
+ """ + y_var = torch.var(y, dim=[0]) + diff_var = torch.var(y - pred, dim=[0]) + min_ = torch.tensor([-1.0]).to(pred.device) + return torch.max(min_, 1 - (diff_var / (y_var + SMALL_NUMBER)))[0] + + +@PublicAPI +def flatten_inputs_to_1d_tensor( + inputs: TensorStructType, + spaces_struct: Optional[SpaceStruct] = None, + time_axis: bool = False, +) -> TensorType: + """Flattens arbitrary input structs according to the given spaces struct. + + Returns a single 1D tensor resulting from the different input + components' values. + + Thereby: + - Boxes (any shape) get flattened to (B, [T]?, -1). Note that image boxes + are not treated differently from other types of Boxes and get + flattened as well. + - Discrete (int) values are one-hot'd, e.g. a batch of [1, 0, 3] (B=3 with + Discrete(4) space) results in [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1]]. + - MultiDiscrete values are multi-one-hot'd, e.g. a batch of + [[0, 2], [1, 4]] (B=2 with MultiDiscrete([2, 5]) space) results in + [[1, 0, 0, 0, 1, 0, 0], [0, 1, 0, 0, 0, 0, 1]]. + + Args: + inputs: The inputs to be flattened. + spaces_struct: The structure of the spaces that behind the input + time_axis: Whether all inputs have a time-axis (after the batch axis). + If True, will keep not only the batch axis (0th), but the time axis + (1st) as-is and flatten everything from the 2nd axis up. + + Returns: + A single 1D tensor resulting from concatenating all + flattened/one-hot'd input components. Depending on the time_axis flag, + the shape is (B, n) or (B, T, n). + + .. 
testcode:: + + from gymnasium.spaces import Discrete, Box + from ray.rllib.utils.torch_utils import flatten_inputs_to_1d_tensor + import torch + struct = { + "a": np.array([1, 3]), + "b": ( + np.array([[1.0, 2.0], [4.0, 5.0]]), + np.array( + [[[8.0], [7.0]], [[5.0], [4.0]]] + ), + ), + "c": { + "cb": np.array([1.0, 2.0]), + }, + } + struct_torch = tree.map_structure(lambda s: torch.from_numpy(s), struct) + spaces = dict( + { + "a": gym.spaces.Discrete(4), + "b": (gym.spaces.Box(-1.0, 10.0, (2,)), gym.spaces.Box(-1.0, 1.0, (2, + 1))), + "c": dict( + { + "cb": gym.spaces.Box(-1.0, 1.0, ()), + } + ), + } + ) + print(flatten_inputs_to_1d_tensor(struct_torch, spaces_struct=spaces)) + + .. testoutput:: + + tensor([[0., 1., 0., 0., 1., 2., 8., 7., 1.], + [0., 0., 0., 1., 4., 5., 5., 4., 2.]]) + + """ + + flat_inputs = tree.flatten(inputs) + flat_spaces = ( + tree.flatten(spaces_struct) + if spaces_struct is not None + else [None] * len(flat_inputs) + ) + + B = None + T = None + out = [] + for input_, space in zip(flat_inputs, flat_spaces): + # Store batch and (if applicable) time dimension. + if B is None: + B = input_.shape[0] + if time_axis: + T = input_.shape[1] + + # One-hot encoding. + if isinstance(space, Discrete): + if time_axis: + input_ = torch.reshape(input_, [B * T]) + out.append(one_hot(input_, space).float()) + # Multi one-hot encoding. + elif isinstance(space, MultiDiscrete): + if time_axis: + input_ = torch.reshape(input_, [B * T, -1]) + out.append(one_hot(input_, space).float()) + # Box: Flatten. + else: + if time_axis: + input_ = torch.reshape(input_, [B * T, -1]) + else: + input_ = torch.reshape(input_, [B, -1]) + out.append(input_.float()) + + merged = torch.cat(out, dim=-1) + # Restore the time-dimension, if applicable. + if time_axis: + merged = torch.reshape(merged, [B, T, -1]) + + return merged + + +@PublicAPI +def global_norm(tensors: List[TensorType]) -> TensorType: + """Returns the global L2 norm over a list of tensors. 
+ + output = sqrt(SUM(t ** 2 for t in tensors)), + where SUM reduces over all tensors and over all elements in tensors. + + Args: + tensors: The list of tensors to calculate the global norm over. + + Returns: + The global L2 norm over the given tensor list. + """ + # List of single tensors' L2 norms: SQRT(SUM(xi^2)) over all xi in tensor. + single_l2s = [torch.pow(torch.sum(torch.pow(t, 2.0)), 0.5) for t in tensors] + # Compute global norm from all single tensors' L2 norms. + return torch.pow(sum(torch.pow(l2, 2.0) for l2 in single_l2s), 0.5) + + +@OldAPIStack +def huber_loss(x: TensorType, delta: float = 1.0) -> TensorType: + """Computes the huber loss for a given term and delta parameter. + + Reference: https://en.wikipedia.org/wiki/Huber_loss + Note that the factor of 0.5 is implicitly included in the calculation. + + Formula: + L = 0.5 * x^2 for small abs x (delta threshold) + L = delta * (abs(x) - 0.5*delta) for larger abs x (delta threshold) + + Args: + x: The input term, e.g. a TD error. + delta: The delta parmameter in the above formula. + + Returns: + The Huber loss resulting from `x` and `delta`. + """ + return torch.where( + torch.abs(x) < delta, + torch.pow(x, 2.0) * 0.5, + delta * (torch.abs(x) - 0.5 * delta), + ) + + +@OldAPIStack +def l2_loss(x: TensorType) -> TensorType: + """Computes half the L2 norm over a tensor's values without the sqrt. + + output = 0.5 * sum(x ** 2) + + Args: + x: The input tensor. + + Returns: + 0.5 times the L2 norm over the given tensor's values (w/o sqrt). + """ + return 0.5 * torch.sum(torch.pow(x, 2.0)) + + +@PublicAPI +def one_hot(x: TensorType, space: gym.Space) -> TensorType: + """Returns a one-hot tensor, given and int tensor and a space. + + Handles the MultiDiscrete case as well. + + Args: + x: The input tensor. + space: The space to use for generating the one-hot tensor. + + Returns: + The resulting one-hot tensor. + + Raises: + ValueError: If the given space is not a discrete one. + + .. 
testcode:: + + import torch + import gymnasium as gym + from ray.rllib.utils.torch_utils import one_hot + x = torch.IntTensor([0, 3]) # batch-dim=2 + # Discrete space with 4 (one-hot) slots per batch item. + s = gym.spaces.Discrete(4) + print(one_hot(x, s)) + x = torch.IntTensor([[0, 1, 2, 3]]) # batch-dim=1 + # MultiDiscrete space with 5 + 4 + 4 + 7 = 20 (one-hot) slots + # per batch item. + s = gym.spaces.MultiDiscrete([5, 4, 4, 7]) + print(one_hot(x, s)) + + .. testoutput:: + + tensor([[1, 0, 0, 0], + [0, 0, 0, 1]]) + tensor([[1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0]]) + """ + if isinstance(space, Discrete): + return nn.functional.one_hot(x.long(), space.n) + elif isinstance(space, MultiDiscrete): + if isinstance(space.nvec[0], np.ndarray): + nvec = np.ravel(space.nvec) + x = x.reshape(x.shape[0], -1) + else: + nvec = space.nvec + return torch.cat( + [nn.functional.one_hot(x[:, i].long(), n) for i, n in enumerate(nvec)], + dim=-1, + ) + else: + raise ValueError("Unsupported space for `one_hot`: {}".format(space)) + + +@PublicAPI +def reduce_mean_ignore_inf(x: TensorType, axis: Optional[int] = None) -> TensorType: + """Same as torch.mean() but ignores -inf values. + + Args: + x: The input tensor to reduce mean over. + axis: The axis over which to reduce. None for all axes. + + Returns: + The mean reduced inputs, ignoring inf values. + """ + mask = torch.ne(x, float("-inf")) + x_zeroed = torch.where(mask, x, torch.zeros_like(x)) + return torch.sum(x_zeroed, axis) / torch.sum(mask.float(), axis) + + +@PublicAPI +def sequence_mask( + lengths: TensorType, + maxlen: Optional[int] = None, + dtype=None, + time_major: bool = False, +) -> TensorType: + """Offers same behavior as tf.sequence_mask for torch. + + Thanks to Dimitris Papatheodorou + (https://discuss.pytorch.org/t/pytorch-equivalent-for-tf-sequence-mask/ + 39036). + + Args: + lengths: The tensor of individual lengths to mask by. + maxlen: The maximum length to use for the time axis. 
If None, use + the max of `lengths`. + dtype: The torch dtype to use for the resulting mask. + time_major: Whether to return the mask as [B, T] (False; default) or + as [T, B] (True). + + Returns: + The sequence mask resulting from the given input and parameters. + """ + # If maxlen not given, use the longest lengths in the `lengths` tensor. + if maxlen is None: + maxlen = lengths.max() + + mask = torch.ones(tuple(lengths.shape) + (int(maxlen),)) + + mask = ~(mask.to(lengths.device).cumsum(dim=1).t() > lengths) + # Time major transformation. + if not time_major: + mask = mask.t() + + # By default, set the mask to be boolean. + mask.type(dtype or torch.bool) + + return mask + + +@PublicAPI +def update_target_network( + main_net: NetworkType, + target_net: NetworkType, + tau: float, +) -> None: + """Updates a torch.nn.Module target network using Polyak averaging. + + .. code-block:: text + + new_target_net_weight = ( + tau * main_net_weight + (1.0 - tau) * current_target_net_weight + ) + + Args: + main_net: The nn.Module to update from. + target_net: The target network to update. + tau: The tau value to use in the Polyak averaging formula. + """ + # Get the current parameters from the Q network. + state_dict = main_net.state_dict() + # Use here Polyak averaging. + new_state_dict = { + k: tau * state_dict[k] + (1 - tau) * v + for k, v in target_net.state_dict().items() + } + # Apply the new parameters to the target Q network. + target_net.load_state_dict(new_state_dict) + + +@DeveloperAPI +def warn_if_infinite_kl_divergence( + policy: "TorchPolicy", + kl_divergence: TensorType, +) -> None: + if policy.loss_initialized() and kl_divergence.isinf(): + logger.warning( + "KL divergence is non-finite, this will likely destabilize your model and" + " the training process. Action(s) in a specific state have near-zero" + " probability. This can happen naturally in deterministic environments" + " where the optimal policy has zero mass for a specific action. 
To fix this" + " issue, consider setting the coefficient for the KL loss term to zero or" + " increasing policy entropy." + ) + + +@PublicAPI +def set_torch_seed(seed: Optional[int] = None) -> None: + """Sets the torch random seed to the given value. + + Args: + seed: The seed to use or None for no seeding. + """ + if seed is not None and torch: + torch.manual_seed(seed) + # See https://github.com/pytorch/pytorch/issues/47672. + cuda_version = torch.version.cuda + if cuda_version is not None and float(torch.version.cuda) >= 10.2: + os.environ["CUBLAS_WORKSPACE_CONFIG"] = "4096:8" + else: + # Not all Operations support this. + torch.use_deterministic_algorithms(True) + # This is only for Convolution no problem. + torch.backends.cudnn.deterministic = True + + +@PublicAPI +def softmax_cross_entropy_with_logits( + logits: TensorType, + labels: TensorType, +) -> TensorType: + """Same behavior as tf.nn.softmax_cross_entropy_with_logits. + + Args: + x: The input predictions. + labels: The labels corresponding to `x`. + + Returns: + The resulting softmax cross-entropy given predictions and labels. 
+ """ + return torch.sum(-labels * nn.functional.log_softmax(logits, -1), -1) + + +def _dynamo_is_available(): + # This only works if torch._dynamo is available + try: + # TODO(Artur): Remove this once torch._dynamo is available on CI + import torch._dynamo as dynamo # noqa: F401 + + return True + except ImportError: + return False diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/utils/typing.py b/.venv/lib/python3.11/site-packages/ray/rllib/utils/typing.py new file mode 100644 index 0000000000000000000000000000000000000000..1b4cd2f41f0078e49bdee33594f4632b99f654da --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/utils/typing.py @@ -0,0 +1,310 @@ +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Dict, + Hashable, + List, + Optional, + Sequence, + Tuple, + Type, + TypeVar, + Union, +) + +import numpy as np +import gymnasium as gym + +from ray.rllib.utils.annotations import OldAPIStack + +if TYPE_CHECKING: + from ray.rllib.core.rl_module.rl_module import RLModuleSpec + from ray.rllib.core.rl_module.multi_rl_module import MultiRLModuleSpec + from ray.rllib.env.env_context import EnvContext + from ray.rllib.env.multi_agent_episode import MultiAgentEpisode + from ray.rllib.env.single_agent_episode import SingleAgentEpisode + from ray.rllib.policy.dynamic_tf_policy_v2 import DynamicTFPolicyV2 + from ray.rllib.policy.eager_tf_policy_v2 import EagerTFPolicyV2 + from ray.rllib.policy.policy import PolicySpec + from ray.rllib.policy.sample_batch import MultiAgentBatch, SampleBatch + from ray.rllib.policy.view_requirement import ViewRequirement + from ray.rllib.utils import try_import_jax, try_import_tf, try_import_torch + + _, tf, _ = try_import_tf() + torch, _ = try_import_torch() + jax, _ = try_import_jax() + jnp = None + if jax is not None: + jnp = jax.numpy + +# Represents a generic tensor type. +# This could be an np.ndarray, tf.Tensor, or a torch.Tensor. 
+TensorType = Union[np.array, "jnp.ndarray", "tf.Tensor", "torch.Tensor"] + +# Either a plain tensor, or a dict or tuple of tensors (or StructTensors). +TensorStructType = Union[TensorType, dict, tuple] + +# A shape of a tensor. +TensorShape = Union[Tuple[int], List[int]] + +# A neural network +NetworkType = Union["torch.nn.Module", "tf.keras.Model"] + +# An RLModule spec (single-agent or multi-agent). +RLModuleSpecType = Union["RLModuleSpec", "MultiRLModuleSpec"] + +# A state dict of an RLlib component (e.g. EnvRunner, Learner, RLModule). +StateDict = Dict[str, Any] + +# Represents a fully filled out config of a Algorithm class. +# Note: Policy config dicts are usually the same as AlgorithmConfigDict, but +# parts of it may sometimes be altered in e.g. a multi-agent setup, +# where we have >1 Policies in the same Algorithm. +AlgorithmConfigDict = dict # @OldAPIStack + +# An algorithm config dict that only has overrides. It needs to be combined with +# the default algorithm config to be used. +PartialAlgorithmConfigDict = dict # @OldAPIStack + +# Represents the model config sub-dict of the algo config that is passed to +# the model catalog. +ModelConfigDict = dict # @OldAPIStack + +# Conv2D configuration format. +# Each entry in the outer list represents one Conv2D layer. +# Each inner list has the format: [num_output_filters, kernel, stride], where kernel +# and stride may be single ints (width and height are the same) or 2-tuples (int, int) +# for width and height (different values). +ConvFilterSpec = List[ + Tuple[int, Union[int, Tuple[int, int]], Union[int, Tuple[int, int]]] +] + +# Objects that can be created through the `from_config()` util method +# need a config dict with a "type" key, a class path (str), or a type directly. +FromConfigSpec = Union[Dict[str, Any], type, str] + +# Represents the env_config sub-dict of the algo config that is passed to +# the env constructor. +EnvConfigDict = dict + +# Represents an environment id. 
These could be: +# - An int index for a sub-env within a vectorized env. +# - An external env ID (str), which changes(!) each episode. +EnvID = Union[int, str] + +# Represents a BaseEnv, MultiAgentEnv, ExternalEnv, ExternalMultiAgentEnv, +# VectorEnv, gym.Env, or ActorHandle. +# TODO (sven): Specify this type more strictly (it should just be gym.Env). +EnvType = Union[Any, gym.Env] + +# A callable, taking a EnvContext object +# (config dict + properties: `worker_index`, `vector_index`, `num_workers`, +# and `remote`) and returning an env object (or None if no env is used). +EnvCreator = Callable[["EnvContext"], Optional[EnvType]] + +# Represents a generic identifier for an agent (e.g., "agent1"). +AgentID = Any + +# Represents a generic identifier for a policy (e.g., "pol1"). +PolicyID = str # @OldAPIStack +# Represents a generic identifier for a (single-agent) RLModule. +ModuleID = str + +# Type of the config.policies dict for multi-agent training. +MultiAgentPolicyConfigDict = Dict[PolicyID, "PolicySpec"] # @OldAPIStack + +# A new stack Episode type: Either single-agent or multi-agent. +EpisodeType = Union["SingleAgentEpisode", "MultiAgentEpisode"] + +# Is Policy to train callable. +# @OldAPIStack +IsPolicyToTrain = Callable[[PolicyID, Optional["MultiAgentBatch"]], bool] + +# Agent to module mapping and should-module-be-updated. +AgentToModuleMappingFn = Callable[[AgentID, EpisodeType], ModuleID] +ShouldModuleBeUpdatedFn = Union[ + Sequence[ModuleID], + Callable[[ModuleID, Optional["MultiAgentBatch"]], bool], +] + +# State dict of a Policy, mapping strings (e.g. "weights") to some state +# data (TensorStructType). +PolicyState = Dict[str, TensorStructType] # @OldAPIStack + +# Any tf Policy type (static-graph or eager Policy). +TFPolicyV2Type = Type[Union["DynamicTFPolicyV2", "EagerTFPolicyV2"]] # @OldAPIStack + +# Represents an episode id (old and new API stack). 
+EpisodeID = Union[int, str] + +# Represents an "unroll" (maybe across different sub-envs in a vector env). +UnrollID = int # @OldAPIStack + +# A dict keyed by agent ids, e.g. {"agent-1": value}. +MultiAgentDict = Dict[AgentID, Any] + +# A dict keyed by env ids that contain further nested dictionaries keyed by +# agent ids. e.g., {"env-1": {"agent-1": value}}. +MultiEnvDict = Dict[EnvID, MultiAgentDict] + +# Represents an observation returned from the env. +EnvObsType = Any + +# Represents an action passed to the env. +EnvActionType = Any + +# Info dictionary returned by calling `reset()` or `step()` on `gymnasium.Env` +# instances. Might be an empty dict. +EnvInfoDict = dict + +# Represents a File object +FileType = Any + +# Represents a ViewRequirements dict mapping column names (str) to +# ViewRequirement objects. +ViewRequirementsDict = Dict[str, "ViewRequirement"] # @OldAPIStack + +# Represents the result dict returned by Algorithm.train() and algorithm components, +# such as EnvRunners, LearnerGroup, etc.. Also, the MetricsLogger used by all these +# components returns this upon its `reduce()` method call, so a ResultDict can further +# be accumulated (and reduced again) by downstream components. +ResultDict = Dict + +# A tf or torch local optimizer object. +LocalOptimizer = Union["torch.optim.Optimizer", "tf.keras.optimizers.Optimizer"] +Optimizer = LocalOptimizer +Param = Union["torch.Tensor", "tf.Variable"] +ParamRef = Hashable +ParamDict = Dict[ParamRef, Param] +ParamList = List[Param] + +# A single learning rate or a learning rate schedule (list of sub-lists, each of +# the format: [ts (int), lr_to_reach_by_ts (float)]). +LearningRateOrSchedule = Union[ + float, + List[List[Union[int, float]]], + List[Tuple[int, Union[int, float]]], +] + +# Dict of tensors returned by compute gradients on the policy, e.g., +# {"td_error": [...], "learner_stats": {"vf_loss": ..., ...}}, for multi-agent, +# {"policy1": {"learner_stats": ..., }, "policy2": ...}. 
+GradInfoDict = dict + +# Dict of learner stats returned by compute gradients on the policy, e.g., +# {"vf_loss": ..., ...}. This will always be nested under the "learner_stats" +# key(s) of a GradInfoDict. In the multi-agent case, this will be keyed by +# policy id. +LearnerStatsDict = dict + +# List of grads+var tuples (tf) or list of gradient tensors (torch) +# representing model gradients and returned by compute_gradients(). +ModelGradients = Union[List[Tuple[TensorType, TensorType]], List[TensorType]] + +# Type of dict returned by get_weights() representing model weights. +ModelWeights = dict + +# An input dict used for direct ModelV2 calls. +ModelInputDict = Dict[str, TensorType] + +# Some kind of sample batch. +SampleBatchType = Union["SampleBatch", "MultiAgentBatch", Dict[str, Any]] + +# A (possibly nested) space struct: Either a gym.spaces.Space or a +# (possibly nested) dict|tuple of gym.space.Spaces. +SpaceStruct = Union[gym.spaces.Space, dict, tuple] + +# A list of batches of RNN states. +# Each item in this list has dimension [B, S] (S=state vector size) +StateBatches = List[List[Any]] # @OldAPIStack + +# Format of data output from policy forward pass. +# __sphinx_doc_begin_policy_output_type__ +PolicyOutputType = Tuple[TensorStructType, StateBatches, Dict] # @OldAPIStack +# __sphinx_doc_end_policy_output_type__ + + +# __sphinx_doc_begin_agent_connector_data_type__ +@OldAPIStack +class AgentConnectorDataType: + """Data type that is fed into and yielded from agent connectors. + + Args: + env_id: ID of the environment. + agent_id: ID to help identify the agent from which the data is received. + data: A payload (``data``). With RLlib's default sampler, the payload + is a dictionary of arbitrary data columns (obs, rewards, terminateds, + truncateds, etc). 
+ """ + + def __init__(self, env_id: str, agent_id: str, data: Any): + self.env_id = env_id + self.agent_id = agent_id + self.data = data + + +# __sphinx_doc_end_agent_connector_data_type__ + + +# __sphinx_doc_begin_action_connector_output__ +@OldAPIStack +class ActionConnectorDataType: + """Data type that is fed into and yielded from agent connectors. + + Args: + env_id: ID of the environment. + agent_id: ID to help identify the agent from which the data is received. + input_dict: Input data that was passed into the policy. + Sometimes output must be adapted based on the input, for example + action masking. So the entire input data structure is provided here. + output: An object of PolicyOutputType. It is is composed of the + action output, the internal state output, and additional data fetches. + + """ + + def __init__( + self, + env_id: str, + agent_id: str, + input_dict: TensorStructType, + output: PolicyOutputType, + ): + self.env_id = env_id + self.agent_id = agent_id + self.input_dict = input_dict + self.output = output + + +# __sphinx_doc_end_action_connector_output__ + + +# __sphinx_doc_begin_agent_connector_output__ +@OldAPIStack +class AgentConnectorsOutput: + """Final output data type of agent connectors. + + Args are populated depending on the AgentConnector settings. + The branching happens in ViewRequirementAgentConnector. + + Args: + raw_dict: The raw input dictionary that sampler can use to + build episodes and training batches. + This raw dict also gets passed into ActionConnectors in case + it contains data useful for action adaptation (e.g. action masks). + sample_batch: The SampleBatch that can be immediately used for + querying the policy for next action. + """ + + def __init__( + self, raw_dict: Dict[str, TensorStructType], sample_batch: "SampleBatch" + ): + self.raw_dict = raw_dict + self.sample_batch = sample_batch + + +# __sphinx_doc_end_agent_connector_output__ + + +# Generic type var. +T = TypeVar("T")