diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..56d9db36f3328edaede6e2d16b70ffe343bb9a1b Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/__pycache__/utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/__pycache__/utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..99f853cceaf3384012edec5959d10dd3748d205b Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/__pycache__/utils.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/__pycache__/vtrace_torch.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/__pycache__/vtrace_torch.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..699fa476eacc561b5e114ef158d0dbb5280fef27 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/__pycache__/vtrace_torch.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/torch/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/torch/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/torch/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/torch/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..4c3270e6ba6fc4c3d8e77f7fbf841774a3b7e230 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/torch/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/torch/__pycache__/impala_torch_learner.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/torch/__pycache__/impala_torch_learner.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0e2a3e60f7a2ebea7f8e97803a4619697b5284cc Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/torch/__pycache__/impala_torch_learner.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/torch/__pycache__/vtrace_torch_v2.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/torch/__pycache__/vtrace_torch_v2.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5e1c120d6fb3471431e7d711b5b628cbde4ce1b7 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/torch/__pycache__/vtrace_torch_v2.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/torch/impala_torch_learner.py b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/torch/impala_torch_learner.py new file mode 100644 index 0000000000000000000000000000000000000000..256e3b48fb79f217ec66a9a9fdf7ec7a9a0ca6ae --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/torch/impala_torch_learner.py @@ -0,0 +1,164 @@ +from typing import Dict + +from ray.rllib.algorithms.impala.impala import IMPALAConfig +from ray.rllib.algorithms.impala.impala_learner import IMPALALearner +from ray.rllib.algorithms.impala.torch.vtrace_torch_v2 import ( + vtrace_torch, + make_time_major, +) +from ray.rllib.core.columns import Columns +from 
ray.rllib.core.learner.learner import ENTROPY_KEY +from ray.rllib.core.learner.torch.torch_learner import TorchLearner +from ray.rllib.utils.annotations import override +from ray.rllib.utils.framework import try_import_torch +from ray.rllib.utils.typing import ModuleID, TensorType + +torch, nn = try_import_torch() + + +class IMPALATorchLearner(IMPALALearner, TorchLearner): + """Implements the IMPALA loss function in torch.""" + + @override(TorchLearner) + def compute_loss_for_module( + self, + *, + module_id: ModuleID, + config: IMPALAConfig, + batch: Dict, + fwd_out: Dict[str, TensorType], + ) -> TensorType: + module = self.module[module_id].unwrapped() + + # TODO (sven): Now that we do the +1ts trick to be less vulnerable about + # bootstrap values at the end of rollouts in the new stack, we might make + # this a more flexible, configurable parameter for users, e.g. + # `v_trace_seq_len` (independent of `rollout_fragment_length`). Separation + # of concerns (sampling vs learning). + rollout_frag_or_episode_len = config.get_rollout_fragment_length() + recurrent_seq_len = batch.get("seq_lens") + + loss_mask = batch[Columns.LOSS_MASK].float() + loss_mask_time_major = make_time_major( + loss_mask, + trajectory_len=rollout_frag_or_episode_len, + recurrent_seq_len=recurrent_seq_len, + ) + size_loss_mask = torch.sum(loss_mask) + + # Behavior actions logp and target actions logp. + behaviour_actions_logp = batch[Columns.ACTION_LOGP] + target_policy_dist = module.get_train_action_dist_cls().from_logits( + fwd_out[Columns.ACTION_DIST_INPUTS] + ) + target_actions_logp = target_policy_dist.logp(batch[Columns.ACTIONS]) + + # Values and bootstrap values. 
+ values = module.compute_values( + batch, embeddings=fwd_out.get(Columns.EMBEDDINGS) + ) + values_time_major = make_time_major( + values, + trajectory_len=rollout_frag_or_episode_len, + recurrent_seq_len=recurrent_seq_len, + ) + assert Columns.VALUES_BOOTSTRAPPED not in batch + # Use as bootstrap values the vf-preds in the next "batch row", except + # for the very last row (which doesn't have a next row), for which the + # bootstrap value does not matter b/c it has a +1ts value at its end + # anyways. So we chose an arbitrary item (for simplicity of not having to + # move new data to the device). + bootstrap_values = torch.cat( + [ + values_time_major[0][1:], # 0th ts values from "next row" + values_time_major[0][0:1], # <- can use any arbitrary value here + ], + dim=0, + ) + + # TODO(Artur): In the old impala code, actions were unsqueezed if they were + # multi_discrete. Find out why and if we need to do the same here. + # actions = actions if is_multidiscrete else torch.unsqueeze(actions, dim=1) + target_actions_logp_time_major = make_time_major( + target_actions_logp, + trajectory_len=rollout_frag_or_episode_len, + recurrent_seq_len=recurrent_seq_len, + ) + behaviour_actions_logp_time_major = make_time_major( + behaviour_actions_logp, + trajectory_len=rollout_frag_or_episode_len, + recurrent_seq_len=recurrent_seq_len, + ) + rewards_time_major = make_time_major( + batch[Columns.REWARDS], + trajectory_len=rollout_frag_or_episode_len, + recurrent_seq_len=recurrent_seq_len, + ) + + # the discount factor that is used should be gamma except for timesteps where + # the episode is terminated. In that case, the discount factor should be 0. + discounts_time_major = ( + 1.0 + - make_time_major( + batch[Columns.TERMINATEDS], + trajectory_len=rollout_frag_or_episode_len, + recurrent_seq_len=recurrent_seq_len, + ).type(dtype=torch.float32) + ) * config.gamma + + # Note that vtrace will compute the main loop on the CPU for better performance. 
+ vtrace_adjusted_target_values, pg_advantages = vtrace_torch( + target_action_log_probs=target_actions_logp_time_major, + behaviour_action_log_probs=behaviour_actions_logp_time_major, + discounts=discounts_time_major, + rewards=rewards_time_major, + values=values_time_major, + bootstrap_values=bootstrap_values, + clip_rho_threshold=config.vtrace_clip_rho_threshold, + clip_pg_rho_threshold=config.vtrace_clip_pg_rho_threshold, + ) + + # The policy gradients loss. + pi_loss = -torch.sum( + target_actions_logp_time_major * pg_advantages * loss_mask_time_major + ) + mean_pi_loss = pi_loss / size_loss_mask + + # The baseline loss. + delta = values_time_major - vtrace_adjusted_target_values + vf_loss = 0.5 * torch.sum(torch.pow(delta, 2.0) * loss_mask_time_major) + mean_vf_loss = vf_loss / size_loss_mask + + # The entropy loss. + entropy_loss = -torch.sum(target_policy_dist.entropy() * loss_mask) + mean_entropy_loss = entropy_loss / size_loss_mask + + # The summed weighted loss. + total_loss = ( + mean_pi_loss + + mean_vf_loss * config.vf_loss_coeff + + ( + mean_entropy_loss + * self.entropy_coeff_schedulers_per_module[ + module_id + ].get_current_value() + ) + ) + + # Log important loss stats. + self.metrics.log_dict( + { + "pi_loss": pi_loss, + "mean_pi_loss": mean_pi_loss, + "vf_loss": vf_loss, + "mean_vf_loss": mean_vf_loss, + ENTROPY_KEY: -mean_entropy_loss, + }, + key=module_id, + window=1, # <- single items (should not be mean/ema-reduced over time). + ) + # Return the total loss. 
+ return total_loss + + +ImpalaTorchLearner = IMPALATorchLearner diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/torch/vtrace_torch_v2.py b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/torch/vtrace_torch_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..2bd40786b3a291e45c139ff93750506ed314ff71 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/torch/vtrace_torch_v2.py @@ -0,0 +1,168 @@ +from typing import List, Union +from ray.rllib.utils.framework import try_import_torch + +torch, nn = try_import_torch() + + +def make_time_major( + tensor: Union["torch.Tensor", List["torch.Tensor"]], + *, + trajectory_len: int = None, + recurrent_seq_len: int = None, +): + """Swaps batch and trajectory axis. + + Args: + tensor: A tensor or list of tensors to swap the axis of. + NOTE: Each tensor must have the shape [B * T] where B is the batch size and + T is the trajectory length. + trajectory_len: The length of each trajectory being transformed. + If None then `recurrent_seq_len` must be set. + recurrent_seq_len: Sequence lengths if recurrent. + If None then `trajectory_len` must be set. + + Returns: + res: A tensor with swapped axes or a list of tensors with + swapped axes. + """ + if isinstance(tensor, (list, tuple)): + return [ + make_time_major(_tensor, trajectory_len, recurrent_seq_len) + for _tensor in tensor + ] + + assert ( + trajectory_len is not None or recurrent_seq_len is not None + ), "Either trajectory_len or recurrent_seq_len must be set." + + # Figure out the sizes of the final B and T axes. + if recurrent_seq_len is not None: + assert len(tensor.shape) == 2 + # Swap B and T axes. + tensor = torch.transpose(tensor, 1, 0) + return tensor + else: + T = trajectory_len + # Zero-pad, if necessary. 
+ tensor_0 = tensor.shape[0] + B = tensor_0 // T + if B != (tensor_0 / T): + assert len(tensor.shape) == 1 + tensor = torch.cat( + [ + tensor, + torch.zeros( + trajectory_len - tensor_0 % T, + dtype=tensor.dtype, + device=tensor.device, + ), + ] + ) + B += 1 + + # Reshape tensor (break up B axis into 2 axes: B and T). + tensor = torch.reshape(tensor, [B, T] + list(tensor.shape[1:])) + + # Swap B and T axes. + tensor = torch.transpose(tensor, 1, 0) + + return tensor + + +def vtrace_torch( + *, + target_action_log_probs: "torch.Tensor", + behaviour_action_log_probs: "torch.Tensor", + discounts: "torch.Tensor", + rewards: "torch.Tensor", + values: "torch.Tensor", + bootstrap_values: "torch.Tensor", + clip_rho_threshold: Union[float, "torch.Tensor"] = 1.0, + clip_pg_rho_threshold: Union[float, "torch.Tensor"] = 1.0, +): + """V-trace for softmax policies implemented with torch. + + Calculates V-trace actor critic targets for softmax polices as described in + "IMPALA: Scalable Distributed Deep-RL with Importance Weighted Actor-Learner + Architectures" by Espeholt, Soyer, Munos et al. (https://arxiv.org/abs/1802.01561) + + The V-trace implementation used here closely resembles the one found in the + scalable-agent repository by Google DeepMind, available at + https://github.com/deepmind/scalable_agent. This version has been optimized to + minimize the number of floating-point operations required per V-Trace + calculation, achieved through the use of dynamic programming techniques. It's + important to note that the mathematical expressions used in this implementation + may appear quite different from those presented in the IMPALA paper. + + The following terminology applies: + - `target policy` refers to the policy we are interested in improving. + - `behaviour policy` refers to the policy that generated the given + rewards and actions. + - `T` refers to the time dimension. This is usually either the length of the + trajectory or the length of the sequence if recurrent. 
+ - `B` refers to the batch size. + + Args: + target_action_log_probs: Action log probs from the target policy. A float32 + tensor of shape [T, B]. + behaviour_action_log_probs: Action log probs from the behaviour policy. A + float32 tensor of shape [T, B]. + discounts: A float32 tensor of shape [T, B] with the discount encountered when + following the behaviour policy. This will be 0 for terminal timesteps + (done=True) and gamma (the discount factor) otherwise. + rewards: A float32 tensor of shape [T, B] with the rewards generated by + following the behaviour policy. + values: A float32 tensor of shape [T, B] with the value function estimates + wrt. the target policy. + bootstrap_values: A float32 of shape [B] with the value function estimate at + time T. + clip_rho_threshold: A scalar float32 tensor with the clipping threshold for + importance weights (rho) when calculating the baseline targets (vs). + rho^bar in the paper. + clip_pg_rho_threshold: A scalar float32 tensor with the clipping threshold + on rho_s in \rho_s \delta log \pi(a|x) (r + \gamma v_{s+1} - V(x_s)). + """ + log_rhos = target_action_log_probs - behaviour_action_log_probs + + rhos = torch.exp(log_rhos) + if clip_rho_threshold is not None: + clipped_rhos = torch.clamp(rhos, max=clip_rho_threshold) + else: + clipped_rhos = rhos + + cs = torch.clamp(rhos, max=1.0) + # Append bootstrapped value to get [v1, ..., v_t+1] + values_t_plus_1 = torch.cat( + [values[1:], torch.unsqueeze(bootstrap_values, 0)], axis=0 + ) + + deltas = clipped_rhos * (rewards + discounts * values_t_plus_1 - values) + + # Only move the for-loop to CPU. 
+ discounts_cpu = discounts.to("cpu") + cs_cpu = cs.to("cpu") + deltas_cpu = deltas.to("cpu") + vs_minus_v_xs_cpu = [torch.zeros_like(bootstrap_values, device="cpu")] + for i in reversed(range(len(discounts_cpu))): + discount_t, c_t, delta_t = discounts_cpu[i], cs_cpu[i], deltas_cpu[i] + vs_minus_v_xs_cpu.append(delta_t + discount_t * c_t * vs_minus_v_xs_cpu[-1]) + vs_minus_v_xs_cpu = torch.stack(vs_minus_v_xs_cpu[1:]) + # Move results back to GPU - if applicable. + vs_minus_v_xs = vs_minus_v_xs_cpu.to(deltas.device) + + # Reverse the results back to original order. + vs_minus_v_xs = torch.flip(vs_minus_v_xs, dims=[0]) + + # Add V(x_s) to get v_s. + vs = torch.add(vs_minus_v_xs, values) + + # Advantage for policy gradient. + vs_t_plus_1 = torch.cat([vs[1:], torch.unsqueeze(bootstrap_values, 0)], axis=0) + if clip_pg_rho_threshold is not None: + clipped_pg_rhos = torch.clamp(rhos, max=clip_pg_rho_threshold) + else: + clipped_pg_rhos = rhos + pg_advantages = clipped_pg_rhos * (rewards + discounts * vs_t_plus_1 - values) + + # Make sure no gradients backpropagated through the returned values. 
+ return torch.detach(vs), torch.detach(pg_advantages) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/centralized_critic.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/centralized_critic.py new file mode 100644 index 0000000000000000000000000000000000000000..14380b789908417052a85dc30df2638105f05453 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/centralized_critic.py @@ -0,0 +1,319 @@ +# @OldAPIStack + +# *********************************************************************************** +# IMPORTANT NOTE: This script uses the old API stack and will soon be replaced by +# `ray.rllib.examples.multi_agent.pettingzoo_shared_value_function.py`! +# *********************************************************************************** + +"""An example of customizing PPO to leverage a centralized critic. + +Here the model and policy are hard-coded to implement a centralized critic +for TwoStepGame, but you can adapt this for your own use cases. + +Compared to simply running `rllib/examples/two_step_game.py --run=PPO`, +this centralized critic version reaches vf_explained_variance=1.0 more stably +since it takes into account the opponent actions as well as the policy's. +Note that this is also using two independent policies instead of weight-sharing +with one. + +See also: centralized_critic_2.py for a simpler approach that instead +modifies the environment. 
+""" + +import argparse +from gymnasium.spaces import Discrete +import numpy as np +import os + +import ray +from ray import air, tune +from ray.air.constants import TRAINING_ITERATION +from ray.rllib.algorithms.ppo.ppo import PPO, PPOConfig +from ray.rllib.algorithms.ppo.ppo_tf_policy import ( + PPOTF1Policy, + PPOTF2Policy, +) +from ray.rllib.algorithms.ppo.ppo_torch_policy import PPOTorchPolicy +from ray.rllib.evaluation.postprocessing import compute_advantages, Postprocessing +from ray.rllib.examples.envs.classes.multi_agent.two_step_game import TwoStepGame +from ray.rllib.examples._old_api_stack.models.centralized_critic_models import ( + CentralizedCriticModel, + TorchCentralizedCriticModel, +) +from ray.rllib.models import ModelCatalog +from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.utils.annotations import override +from ray.rllib.utils.framework import try_import_tf, try_import_torch +from ray.rllib.utils.metrics import ( + ENV_RUNNER_RESULTS, + EPISODE_RETURN_MEAN, + NUM_ENV_STEPS_SAMPLED_LIFETIME, +) +from ray.rllib.utils.numpy import convert_to_numpy +from ray.rllib.utils.test_utils import check_learning_achieved +from ray.rllib.utils.tf_utils import explained_variance, make_tf_callable +from ray.rllib.utils.torch_utils import convert_to_torch_tensor + +tf1, tf, tfv = try_import_tf() +torch, nn = try_import_torch() + +OPPONENT_OBS = "opponent_obs" +OPPONENT_ACTION = "opponent_action" + +parser = argparse.ArgumentParser() +parser.add_argument( + "--framework", + choices=["tf", "tf2", "torch"], + default="torch", + help="The DL framework specifier.", +) +parser.add_argument( + "--as-test", + action="store_true", + help="Whether this script should be run as a test: --stop-reward must " + "be achieved within --stop-timesteps AND --stop-iters.", +) +parser.add_argument( + "--stop-iters", type=int, default=100, help="Number of iterations to train." 
+) +parser.add_argument( + "--stop-timesteps", type=int, default=100000, help="Number of timesteps to train." +) +parser.add_argument( + "--stop-reward", type=float, default=7.99, help="Reward at which we stop training." +) + + +class CentralizedValueMixin: + """Add method to evaluate the central value function from the model.""" + + def __init__(self): + if self.config["framework"] != "torch": + self.compute_central_vf = make_tf_callable(self.get_session())( + self.model.central_value_function + ) + else: + self.compute_central_vf = self.model.central_value_function + + +# Grabs the opponent obs/act and includes it in the experience train_batch, +# and computes GAE using the central vf predictions. +def centralized_critic_postprocessing( + policy, sample_batch, other_agent_batches=None, episode=None +): + pytorch = policy.config["framework"] == "torch" + if (pytorch and hasattr(policy, "compute_central_vf")) or ( + not pytorch and policy.loss_initialized() + ): + assert other_agent_batches is not None + [(_, _, opponent_batch)] = list(other_agent_batches.values()) + + # also record the opponent obs and actions in the trajectory + sample_batch[OPPONENT_OBS] = opponent_batch[SampleBatch.CUR_OBS] + sample_batch[OPPONENT_ACTION] = opponent_batch[SampleBatch.ACTIONS] + + # overwrite default VF prediction with the central VF + if args.framework == "torch": + sample_batch[SampleBatch.VF_PREDS] = ( + policy.compute_central_vf( + convert_to_torch_tensor( + sample_batch[SampleBatch.CUR_OBS], policy.device + ), + convert_to_torch_tensor(sample_batch[OPPONENT_OBS], policy.device), + convert_to_torch_tensor( + sample_batch[OPPONENT_ACTION], policy.device + ), + ) + .cpu() + .detach() + .numpy() + ) + else: + sample_batch[SampleBatch.VF_PREDS] = convert_to_numpy( + policy.compute_central_vf( + sample_batch[SampleBatch.CUR_OBS], + sample_batch[OPPONENT_OBS], + sample_batch[OPPONENT_ACTION], + ) + ) + else: + # Policy hasn't been initialized yet, use zeros. 
+ sample_batch[OPPONENT_OBS] = np.zeros_like(sample_batch[SampleBatch.CUR_OBS]) + sample_batch[OPPONENT_ACTION] = np.zeros_like(sample_batch[SampleBatch.ACTIONS]) + sample_batch[SampleBatch.VF_PREDS] = np.zeros_like( + sample_batch[SampleBatch.REWARDS], dtype=np.float32 + ) + + completed = sample_batch[SampleBatch.TERMINATEDS][-1] + if completed: + last_r = 0.0 + else: + last_r = sample_batch[SampleBatch.VF_PREDS][-1] + + train_batch = compute_advantages( + sample_batch, + last_r, + policy.config["gamma"], + policy.config["lambda"], + use_gae=policy.config["use_gae"], + ) + return train_batch + + +# Copied from PPO but optimizing the central value function. +def loss_with_central_critic(policy, base_policy, model, dist_class, train_batch): + # Save original value function. + vf_saved = model.value_function + + # Calculate loss with a custom value function. + model.value_function = lambda: policy.model.central_value_function( + train_batch[SampleBatch.CUR_OBS], + train_batch[OPPONENT_OBS], + train_batch[OPPONENT_ACTION], + ) + policy._central_value_out = model.value_function() + loss = base_policy.loss(model, dist_class, train_batch) + + # Restore original value function. + model.value_function = vf_saved + + return loss + + +def central_vf_stats(policy, train_batch): + # Report the explained variance of the central value function. + return { + "vf_explained_var": explained_variance( + train_batch[Postprocessing.VALUE_TARGETS], policy._central_value_out + ) + } + + +def get_ccppo_policy(base): + class CCPPOTFPolicy(CentralizedValueMixin, base): + def __init__(self, observation_space, action_space, config): + base.__init__(self, observation_space, action_space, config) + CentralizedValueMixin.__init__(self) + + @override(base) + def loss(self, model, dist_class, train_batch): + # Use super() to get to the base PPO policy. + # This special loss function utilizes a shared + # value function defined on self, and the loss function + # defined on PPO policies. 
+ return loss_with_central_critic( + self, super(), model, dist_class, train_batch + ) + + @override(base) + def postprocess_trajectory( + self, sample_batch, other_agent_batches=None, episode=None + ): + return centralized_critic_postprocessing( + self, sample_batch, other_agent_batches, episode + ) + + @override(base) + def stats_fn(self, train_batch: SampleBatch): + stats = super().stats_fn(train_batch) + stats.update(central_vf_stats(self, train_batch)) + return stats + + return CCPPOTFPolicy + + +CCPPOStaticGraphTFPolicy = get_ccppo_policy(PPOTF1Policy) +CCPPOEagerTFPolicy = get_ccppo_policy(PPOTF2Policy) + + +class CCPPOTorchPolicy(CentralizedValueMixin, PPOTorchPolicy): + def __init__(self, observation_space, action_space, config): + PPOTorchPolicy.__init__(self, observation_space, action_space, config) + CentralizedValueMixin.__init__(self) + + @override(PPOTorchPolicy) + def loss(self, model, dist_class, train_batch): + return loss_with_central_critic(self, super(), model, dist_class, train_batch) + + @override(PPOTorchPolicy) + def postprocess_trajectory( + self, sample_batch, other_agent_batches=None, episode=None + ): + return centralized_critic_postprocessing( + self, sample_batch, other_agent_batches, episode + ) + + +class CentralizedCritic(PPO): + @classmethod + @override(PPO) + def get_default_policy_class(cls, config): + if config["framework"] == "torch": + return CCPPOTorchPolicy + elif config["framework"] == "tf": + return CCPPOStaticGraphTFPolicy + else: + return CCPPOEagerTFPolicy + + +if __name__ == "__main__": + ray.init(local_mode=True) + args = parser.parse_args() + + ModelCatalog.register_custom_model( + "cc_model", + TorchCentralizedCriticModel + if args.framework == "torch" + else CentralizedCriticModel, + ) + + config = ( + PPOConfig() + .api_stack( + enable_env_runner_and_connector_v2=False, + enable_rl_module_and_learner=False, + ) + .environment(TwoStepGame) + .framework(args.framework) + .env_runners(batch_mode="complete_episodes", 
num_env_runners=0) + .training(model={"custom_model": "cc_model"}) + .multi_agent( + policies={ + "pol1": ( + None, + Discrete(6), + TwoStepGame.action_space, + # `framework` would also be ok here. + PPOConfig.overrides(framework_str=args.framework), + ), + "pol2": ( + None, + Discrete(6), + TwoStepGame.action_space, + # `framework` would also be ok here. + PPOConfig.overrides(framework_str=args.framework), + ), + }, + policy_mapping_fn=lambda agent_id, episode, worker, **kwargs: "pol1" + if agent_id == 0 + else "pol2", + ) + # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. + .resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0"))) + ) + + stop = { + TRAINING_ITERATION: args.stop_iters, + NUM_ENV_STEPS_SAMPLED_LIFETIME: args.stop_timesteps, + f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": args.stop_reward, + } + + tuner = tune.Tuner( + CentralizedCritic, + param_space=config.to_dict(), + run_config=air.RunConfig(stop=stop, verbose=1), + ) + results = tuner.fit() + + if args.as_test: + check_learning_achieved(results, args.stop_reward) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/compute_adapted_gae_on_postprocess_trajectory.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/compute_adapted_gae_on_postprocess_trajectory.py new file mode 100644 index 0000000000000000000000000000000000000000..44745c8722b84553f14e487c4aa7d3047632ff1b --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/compute_adapted_gae_on_postprocess_trajectory.py @@ -0,0 +1,157 @@ +# @OldAPIStack + +""" +Adapted (time-dependent) GAE for PPO algorithm that you can activate by setting +use_adapted_gae=True in the policy config. Additionally, it's required that +"callbacks" include the custom callback class in the Algorithm's config. +Furthermore, the env must return in its info dictionary a key-value pair of +the form "d_ts": ... where the value is the length (time) of recent agent step. 
+ +This adapted, time-dependent computation of advantages may be useful in cases +where agent's actions take various times and thus time steps are not +equidistant (https://docdro.id/400TvlR) +""" + +from ray.rllib.callbacks.callbacks import RLlibCallback +from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.evaluation.postprocessing import Postprocessing +from ray.rllib.utils.annotations import override +import numpy as np + + +class MyCallbacks(RLlibCallback): + @override(RLlibCallback) + def on_postprocess_trajectory( + self, + *, + worker, + episode, + agent_id, + policy_id, + policies, + postprocessed_batch, + original_batches, + **kwargs + ): + super().on_postprocess_trajectory( + worker=worker, + episode=episode, + agent_id=agent_id, + policy_id=policy_id, + policies=policies, + postprocessed_batch=postprocessed_batch, + original_batches=original_batches, + **kwargs + ) + + if policies[policy_id].config.get("use_adapted_gae", False): + policy = policies[policy_id] + assert policy.config[ + "use_gae" + ], "Can't use adapted gae without use_gae=True!" + + info_dicts = postprocessed_batch[SampleBatch.INFOS] + assert np.all( + ["d_ts" in info_dict for info_dict in info_dicts] + ), "Info dicts in sample batch must contain data 'd_ts' \ + (=ts[i+1]-ts[i] length of time steps)!" + + d_ts = np.array( + [np.float(info_dict.get("d_ts")) for info_dict in info_dicts] + ) + assert np.all( + [e.is_integer() for e in d_ts] + ), "Elements of 'd_ts' (length of time steps) must be integer!" + + # Trajectory is actually complete -> last r=0.0. + if postprocessed_batch[SampleBatch.TERMINATEDS][-1]: + last_r = 0.0 + # Trajectory has been truncated -> last r=VF estimate of last obs. + else: + # Input dict is provided to us automatically via the Model's + # requirements. It's a single-timestep (last one in trajectory) + # input_dict. + # Create an input dict according to the Model's requirements. 
+ input_dict = postprocessed_batch.get_single_step_input_dict( + policy.model.view_requirements, index="last" + ) + last_r = policy._value(**input_dict) + + gamma = policy.config["gamma"] + lambda_ = policy.config["lambda"] + + vpred_t = np.concatenate( + [postprocessed_batch[SampleBatch.VF_PREDS], np.array([last_r])] + ) + delta_t = ( + postprocessed_batch[SampleBatch.REWARDS] + + gamma**d_ts * vpred_t[1:] + - vpred_t[:-1] + ) + # This formula for the advantage is an adaption of + # "Generalized Advantage Estimation" + # (https://arxiv.org/abs/1506.02438) which accounts for time steps + # of irregular length (see proposal here ). + # NOTE: last time step delta is not required + postprocessed_batch[ + Postprocessing.ADVANTAGES + ] = generalized_discount_cumsum(delta_t, d_ts[:-1], gamma * lambda_) + postprocessed_batch[Postprocessing.VALUE_TARGETS] = ( + postprocessed_batch[Postprocessing.ADVANTAGES] + + postprocessed_batch[SampleBatch.VF_PREDS] + ).astype(np.float32) + + postprocessed_batch[Postprocessing.ADVANTAGES] = postprocessed_batch[ + Postprocessing.ADVANTAGES + ].astype(np.float32) + + +def generalized_discount_cumsum( + x: np.ndarray, deltas: np.ndarray, gamma: float +) -> np.ndarray: + """Calculates the 'time-dependent' discounted cumulative sum over a + (reward) sequence `x`. + + Recursive equations: + + y[t] - gamma**deltas[t+1]*y[t+1] = x[t] + + reversed(y)[t] - gamma**reversed(deltas)[t-1]*reversed(y)[t-1] = + reversed(x)[t] + + Args: + x (np.ndarray): A sequence of rewards or one-step TD residuals. + deltas (np.ndarray): A sequence of time step deltas (length of time + steps). + gamma: The discount factor gamma. + + Returns: + np.ndarray: The sequence containing the 'time-dependent' discounted + cumulative sums for each individual element in `x` till the end of + the trajectory. + + .. 
testcode:: + :skipif: True + + x = np.array([0.0, 1.0, 2.0, 3.0]) + deltas = np.array([1.0, 4.0, 15.0]) + gamma = 0.9 + generalized_discount_cumsum(x, deltas, gamma) + + .. testoutput:: + + array([0.0 + 0.9^1.0*1.0 + 0.9^4.0*2.0 + 0.9^15.0*3.0, + 1.0 + 0.9^4.0*2.0 + 0.9^15.0*3.0, + 2.0 + 0.9^15.0*3.0, + 3.0]) + """ + reversed_x = x[::-1] + reversed_deltas = deltas[::-1] + reversed_y = np.empty_like(x) + reversed_y[0] = reversed_x[0] + for i in range(1, x.size): + reversed_y[i] = ( + reversed_x[i] + gamma ** reversed_deltas[i - 1] * reversed_y[i - 1] + ) + + return reversed_y[::-1] diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/agents_act_in_sequence.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/agents_act_in_sequence.py new file mode 100644 index 0000000000000000000000000000000000000000..c2872a6e4aca137da03af2fde2341b1d3f40643a --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/agents_act_in_sequence.py @@ -0,0 +1,87 @@ +"""Example of running a multi-agent experiment w/ agents taking turns (sequence). + +This example: + - demonstrates how to write your own (multi-agent) environment using RLlib's + MultiAgentEnv API. + - shows how to implement the `reset()` and `step()` methods of the env such that + the agents act in a fixed sequence (taking turns). + - shows how to configure and setup this environment class within an RLlib + Algorithm config. + - runs the experiment with the configured algo, trying to solve the environment. 
+ + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack` + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- +You should see results similar to the following in your console output: ++---------------------------+----------+--------+------------------+--------+ +| Trial name | status | iter | total time (s) | ts | +|---------------------------+----------+--------+------------------+--------+ +| PPO_TicTacToe_957aa_00000 | RUNNING | 25 | 96.7452 | 100000 | ++---------------------------+----------+--------+------------------+--------+ ++-------------------+------------------+------------------+ +| combined return | return player2 | return player1 | +|-------------------+------------------+------------------| +| -2 | 1.15 | -0.85 | ++-------------------+------------------+------------------+ + +Note that even though we are playing a zero-sum game, the overall return should start +at some negative values due to the misplacement penalty of our (simplified) TicTacToe +game. 
+""" +from ray.rllib.examples.envs.classes.multi_agent.tic_tac_toe import TicTacToe +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.tune.registry import get_trainable_cls, register_env # noqa + + +parser = add_rllib_example_script_args( + default_reward=-4.0, default_iters=50, default_timesteps=100000 +) +parser.set_defaults( + enable_new_api_stack=True, + num_agents=2, +) + + +if __name__ == "__main__": + args = parser.parse_args() + + assert args.num_agents == 2, "Must set --num-agents=2 when running this script!" + + # You can also register the env creator function explicitly with: + # register_env("tic_tac_toe", lambda cfg: TicTacToe()) + + # Or allow the RLlib user to set more c'tor options via their algo config: + # config.environment(env_config={[c'tor arg name]: [value]}) + # register_env("tic_tac_toe", lambda cfg: TicTacToe(cfg)) + + base_config = ( + get_trainable_cls(args.algo) + .get_default_config() + .environment(TicTacToe) + .multi_agent( + # Define two policies. + policies={"player1", "player2"}, + # Map agent "player1" to policy "player1" and agent "player2" to policy + # "player2". + policy_mapping_fn=lambda agent_id, episode, **kw: agent_id, + ) + ) + + run_rllib_example_script_experiment(base_config, args) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/agents_act_simultaneously.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/agents_act_simultaneously.py new file mode 100644 index 0000000000000000000000000000000000000000..1e9ce55ce29b00c4a53be231083fc2ef645fddf1 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/agents_act_simultaneously.py @@ -0,0 +1,108 @@ +"""Example of running a multi-agent experiment w/ agents always acting simultaneously. + +This example: + - demonstrates how to write your own (multi-agent) environment using RLlib's + MultiAgentEnv API. 
+ - shows how to implement the `reset()` and `step()` methods of the env such that + the agents act simultaneously. + - shows how to configure and setup this environment class within an RLlib + Algorithm config. + - runs the experiment with the configured algo, trying to solve the environment. + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack --sheldon-cooper-mode` + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- +You should see results similar to the following in your console output: + ++-----------------------------------+----------+--------+------------------+-------+ +| Trial name | status | iter | total time (s) | ts | +|-----------------------------------+----------+--------+------------------+-------+ +| PPO_RockPaperScissors_8cef7_00000 | RUNNING | 3 | 16.5348 | 12000 | ++-----------------------------------+----------+--------+------------------+-------+ ++-------------------+------------------+------------------+ +| combined return | return player2 | return player1 | +|-------------------+------------------+------------------| +| 0 | -0.15 | 0.15 | ++-------------------+------------------+------------------+ + +Note that b/c we are playing a zero-sum game, the overall return remains 0.0 at +all times. 
+""" +from ray.rllib.examples.envs.classes.multi_agent.rock_paper_scissors import ( + RockPaperScissors, +) +from ray.rllib.connectors.env_to_module.flatten_observations import FlattenObservations +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.tune.registry import get_trainable_cls, register_env # noqa + + +parser = add_rllib_example_script_args( + default_reward=0.9, default_iters=50, default_timesteps=100000 +) +parser.set_defaults( + enable_new_api_stack=True, + num_agents=2, +) +parser.add_argument( + "--sheldon-cooper-mode", + action="store_true", + help="Whether to add two more actions to the game: Lizard and Spock. " + "Watch here for more details :) https://www.youtube.com/watch?v=x5Q6-wMx-K8", +) + + +if __name__ == "__main__": + args = parser.parse_args() + + assert args.num_agents == 2, "Must set --num-agents=2 when running this script!" + + # You can also register the env creator function explicitly with: + # register_env("env", lambda cfg: RockPaperScissors({"sheldon_cooper_mode": False})) + + # Or you can hard code certain settings into the Env's constructor (`config`). + # register_env( + # "rock-paper-scissors-w-sheldon-mode-activated", + # lambda config: RockPaperScissors({**config, **{"sheldon_cooper_mode": True}}), + # ) + + # Or allow the RLlib user to set more c'tor options via their algo config: + # config.environment(env_config={[c'tor arg name]: [value]}) + # register_env("rock-paper-scissors", lambda cfg: RockPaperScissors(cfg)) + + base_config = ( + get_trainable_cls(args.algo) + .get_default_config() + .environment( + RockPaperScissors, + env_config={"sheldon_cooper_mode": args.sheldon_cooper_mode}, + ) + .env_runners( + env_to_module_connector=lambda env: FlattenObservations(multi_agent=True), + ) + .multi_agent( + # Define two policies. + policies={"player1", "player2"}, + # Map agent "player1" to policy "player1" and agent "player2" to policy + # "player2". 
+ policy_mapping_fn=lambda agent_id, episode, **kw: agent_id, + ) + ) + + run_rllib_example_script_experiment(base_config, args) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/async_gym_env_vectorization.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/async_gym_env_vectorization.py new file mode 100644 index 0000000000000000000000000000000000000000..06a2d7d0982a3961d5a9c8be09e8c0c925cde288 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/async_gym_env_vectorization.py @@ -0,0 +1,142 @@ +"""Example demo'ing async gym vector envs, in which sub-envs have their own process. + +Setting up env vectorization works through setting the `config.num_envs_per_env_runner` +value to > 1. However, by default the n sub-environments are stepped through +sequentially, rather than in parallel. + +This script shows the effect of setting the `config.gym_env_vectorize_mode` from its +default value of "SYNC" (all sub envs are located in the same EnvRunner process) +to "ASYNC" (all sub envs in each EnvRunner get their own process). + +This example: + - shows, which config settings to change in order to switch from sub-envs being + stepped in sequence to each sub-envs owning its own process (and compute resource) + and thus the vector being stepped in parallel. + - shows, how this setup can increase EnvRunner performance significantly, especially + for heavier, slower environments. + - uses an artificially slow CartPole-v1 environment for demonstration purposes. + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack ` + +Use the `--vectorize-mode=BOTH` option to run both modes (SYNC and ASYNC) +through Tune at the same time and get a better comparison of the throughputs +achieved. 
+ +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- +You should see results similar to the following in your console output +when using the + ++--------------------------+------------+------------------------+------+ +| Trial name | status | gym_env_vectorize_mode | iter | +| | | | | +|--------------------------+------------+------------------------+------+ +| PPO_slow-env_6ddf4_00000 | TERMINATED | SYNC | 4 | +| PPO_slow-env_6ddf4_00001 | TERMINATED | ASYNC | 4 | ++--------------------------+------------+------------------------+------+ ++------------------+----------------------+------------------------+ +| total time (s) | episode_return_mean | num_env_steps_sample | +| | | d_lifetime | +|------------------+----------------------+------------------------+ +| 60.8794 | 73.53 | 16040 | +| 19.1203 | 73.86 | 16037 | ++------------------+----------------------+------------------------+ + +You can see that the ASYNC mode, given that the env is sufficiently slow, +achieves much better results when using vectorization. + +You should see no difference, however, when only using +`--num-envs-per-env-runner=1`. 
+""" +import time + +import gymnasium as gym + +from ray.rllib.algorithms.ppo import PPOConfig +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray import tune + +parser = add_rllib_example_script_args(default_reward=60.0) +parser.set_defaults( + enable_new_api_stack=True, + env="CartPole-v1", + num_envs_per_env_runner=6, +) +parser.add_argument( + "--vectorize-mode", + type=str, + default="ASYNC", + help="The value `gym.envs.registration.VectorizeMode` to use for env " + "vectorization. SYNC steps through all sub-envs in sequence. ASYNC (default) " + "parallelizes sub-envs through multiprocessing and can speed up EnvRunners " + "significantly. Use the special value `BOTH` to run both ASYNC and SYNC through a " + "Tune grid-search.", +) + + +class SlowEnv(gym.ObservationWrapper): + def observation(self, observation): + time.sleep(0.005) + return observation + + +if __name__ == "__main__": + args = parser.parse_args() + + if args.no_tune and args.vectorize_mode == "BOTH": + raise ValueError( + "Can't run this script with both --no-tune and --vectorize-mode=BOTH!" + ) + + # Wrap the env with the slowness wrapper. + def _env_creator(cfg): + return SlowEnv(gym.make(args.env, **cfg)) + + tune.register_env("slow-env", _env_creator) + + if args.vectorize_mode == "BOTH" and args.no_tune: + raise ValueError( + "`--vectorize-mode=BOTH` and `--no-tune` not allowed in combination!" + ) + + base_config = ( + PPOConfig() + .environment("slow-env") + .env_runners( + gym_env_vectorize_mode=( + tune.grid_search(["SYNC", "ASYNC"]) + if args.vectorize_mode == "BOTH" + else args.vectorize_mode + ), + ) + ) + + results = run_rllib_example_script_experiment(base_config, args) + + # Compare the throughputs and assert that ASYNC is much faster than SYNC. 
+ if args.vectorize_mode == "BOTH": + throughput_sync = ( + results[0].metrics["num_env_steps_sampled_lifetime"] + / results[0].metrics["time_total_s"] + ) + throughput_async = ( + results[1].metrics["num_env_steps_sampled_lifetime"] + / results[1].metrics["time_total_s"] + ) + assert throughput_async > throughput_sync diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/action_mask_env.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/action_mask_env.py new file mode 100644 index 0000000000000000000000000000000000000000..7c67db342f72805fff52672ccb767b62a92c3167 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/action_mask_env.py @@ -0,0 +1,42 @@ +from gymnasium.spaces import Box, Dict, Discrete +import numpy as np + +from ray.rllib.examples.envs.classes.random_env import RandomEnv + + +class ActionMaskEnv(RandomEnv): + """A randomly acting environment that publishes an action-mask each step.""" + + def __init__(self, config): + super().__init__(config) + # Masking only works for Discrete actions. + assert isinstance(self.action_space, Discrete) + # Add action_mask to observations. + self.observation_space = Dict( + { + "action_mask": Box(0.0, 1.0, shape=(self.action_space.n,)), + "observations": self.observation_space, + } + ) + self.valid_actions = None + + def reset(self, *, seed=None, options=None): + obs, info = super().reset() + self._fix_action_mask(obs) + return obs, info + + def step(self, action): + # Check whether action is valid. + if not self.valid_actions[action]: + raise ValueError( + f"Invalid action ({action}) sent to env! 
" + f"valid_actions={self.valid_actions}" + ) + obs, rew, done, truncated, info = super().step(action) + self._fix_action_mask(obs) + return obs, rew, done, truncated, info + + def _fix_action_mask(self, obs): + # Fix action-mask: Everything larger 0.5 is 1.0, everything else 0.0. + self.valid_actions = np.round(obs["action_mask"]) + obs["action_mask"] = self.valid_actions diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/cartpole_crashing.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/cartpole_crashing.py new file mode 100644 index 0000000000000000000000000000000000000000..fe5e4f14b4f4dd386242d0105e2fd5b52103bcc8 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/cartpole_crashing.py @@ -0,0 +1,182 @@ +import logging +from gymnasium.envs.classic_control import CartPoleEnv +import numpy as np +import time + +from ray.rllib.examples.envs.classes.multi_agent import make_multi_agent +from ray.rllib.utils.annotations import override +from ray.rllib.utils.error import EnvError + +logger = logging.getLogger(__name__) + + +class CartPoleCrashing(CartPoleEnv): + """A CartPole env that crashes (or stalls) from time to time. + + Useful for testing faulty sub-env (within a vectorized env) handling by + EnvRunners. + + After crashing, the env expects a `reset()` call next (calling `step()` will + result in yet another error), which may or may not take a very long time to + complete. This simulates the env having to reinitialize some sub-processes, e.g. + an external connection. + + The env can also be configured to stall (and do nothing during a call to `step()`) + from time to time for a configurable amount of time. + """ + + def __init__(self, config=None): + super().__init__() + + self.config = config if config is not None else {} + + # Crash probability (in each `step()`). + self.p_crash = config.get("p_crash", 0.005) + # Crash probability when `reset()` is called. 
+ self.p_crash_reset = config.get("p_crash_reset", 0.0) + # Crash exactly after every n steps. If a 2-tuple, will uniformly sample + # crash timesteps from in between the two given values. + self.crash_after_n_steps = config.get("crash_after_n_steps") + self._crash_after_n_steps = None + assert ( + self.crash_after_n_steps is None + or isinstance(self.crash_after_n_steps, int) + or ( + isinstance(self.crash_after_n_steps, tuple) + and len(self.crash_after_n_steps) == 2 + ) + ) + # Only ever crash, if on certain worker indices. + faulty_indices = config.get("crash_on_worker_indices", None) + if faulty_indices and config.worker_index not in faulty_indices: + self.p_crash = 0.0 + self.p_crash_reset = 0.0 + self.crash_after_n_steps = None + + # Stall probability (in each `step()`). + self.p_stall = config.get("p_stall", 0.0) + # Stall probability when `reset()` is called. + self.p_stall_reset = config.get("p_stall_reset", 0.0) + # Stall exactly after every n steps. + self.stall_after_n_steps = config.get("stall_after_n_steps") + self._stall_after_n_steps = None + # Amount of time to stall. If a 2-tuple, will uniformly sample from in between + # the two given values. + self.stall_time_sec = config.get("stall_time_sec") + assert ( + self.stall_time_sec is None + or isinstance(self.stall_time_sec, (int, float)) + or ( + isinstance(self.stall_time_sec, tuple) and len(self.stall_time_sec) == 2 + ) + ) + + # Only ever stall, if on certain worker indices. + faulty_indices = config.get("stall_on_worker_indices", None) + if faulty_indices and config.worker_index not in faulty_indices: + self.p_stall = 0.0 + self.p_stall_reset = 0.0 + self.stall_after_n_steps = None + + # Timestep counter for the ongoing episode. + self.timesteps = 0 + + # Time in seconds to initialize (in this c'tor). 
+ sample = 0.0 + if "init_time_s" in config: + sample = ( + config["init_time_s"] + if not isinstance(config["init_time_s"], tuple) + else np.random.uniform( + config["init_time_s"][0], config["init_time_s"][1] + ) + ) + + print(f"Initializing crashing env (with init-delay of {sample}sec) ...") + time.sleep(sample) + + # Make sure envs don't crash at the same time. + self._rng = np.random.RandomState() + + @override(CartPoleEnv) + def reset(self, *, seed=None, options=None): + # Reset timestep counter for the new episode. + self.timesteps = 0 + self._crash_after_n_steps = None + + # Should we crash? + if self._should_crash(p=self.p_crash_reset): + raise EnvError( + f"Simulated env crash on worker={self.config.worker_index} " + f"env-idx={self.config.vector_index} during `reset()`! " + "Feel free to use any other exception type here instead." + ) + # Should we stall for a while? + self._stall_if_necessary(p=self.p_stall_reset) + + return super().reset() + + @override(CartPoleEnv) + def step(self, action): + # Increase timestep counter for the ongoing episode. + self.timesteps += 1 + + # Should we crash? + if self._should_crash(p=self.p_crash): + raise EnvError( + f"Simulated env crash on worker={self.config.worker_index} " + f"env-idx={self.config.vector_index} during `step()`! " + "Feel free to use any other exception type here instead." + ) + # Should we stall for a while? 
+ self._stall_if_necessary(p=self.p_stall) + + return super().step(action) + + def _should_crash(self, p): + rnd = self._rng.rand() + if rnd < p: + print("Crashing due to p(crash)!") + return True + elif self.crash_after_n_steps is not None: + if self._crash_after_n_steps is None: + self._crash_after_n_steps = ( + self.crash_after_n_steps + if not isinstance(self.crash_after_n_steps, tuple) + else np.random.randint( + self.crash_after_n_steps[0], self.crash_after_n_steps[1] + ) + ) + if self._crash_after_n_steps == self.timesteps: + print("Crashing due to n timesteps reached!") + return True + + return False + + def _stall_if_necessary(self, p): + stall = False + if self._rng.rand() < p: + stall = True + elif self.stall_after_n_steps is not None: + if self._stall_after_n_steps is None: + self._stall_after_n_steps = ( + self.stall_after_n_steps + if not isinstance(self.stall_after_n_steps, tuple) + else np.random.randint( + self.stall_after_n_steps[0], self.stall_after_n_steps[1] + ) + ) + if self._stall_after_n_steps == self.timesteps: + stall = True + + if stall: + sec = ( + self.stall_time_sec + if not isinstance(self.stall_time_sec, tuple) + else np.random.uniform(self.stall_time_sec[0], self.stall_time_sec[1]) + ) + print(f" -> will stall for {sec}sec ...") + time.sleep(sec) + + +MultiAgentCartPoleCrashing = make_multi_agent(lambda config: CartPoleCrashing(config)) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/cartpole_sparse_rewards.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/cartpole_sparse_rewards.py new file mode 100644 index 0000000000000000000000000000000000000000..d68f7614c1033ad8c60fe5736e66bdf88f14fc45 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/cartpole_sparse_rewards.py @@ -0,0 +1,51 @@ +from copy import deepcopy + +import gymnasium as gym +import numpy as np +from gymnasium.spaces import Box, Dict, Discrete + + +class 
CartPoleSparseRewards(gym.Env): + """Wrapper for gym CartPole environment where reward is accumulated to the end.""" + + def __init__(self, config=None): + self.env = gym.make("CartPole-v1") + self.action_space = Discrete(2) + self.observation_space = Dict( + { + "obs": self.env.observation_space, + "action_mask": Box( + low=0, high=1, shape=(self.action_space.n,), dtype=np.int8 + ), + } + ) + self.running_reward = 0 + + def reset(self, *, seed=None, options=None): + self.running_reward = 0 + obs, infos = self.env.reset() + return { + "obs": obs, + "action_mask": np.array([1, 1], dtype=np.int8), + }, infos + + def step(self, action): + obs, rew, terminated, truncated, info = self.env.step(action) + self.running_reward += rew + score = self.running_reward if terminated else 0 + return ( + {"obs": obs, "action_mask": np.array([1, 1], dtype=np.int8)}, + score, + terminated, + truncated, + info, + ) + + def set_state(self, state): + self.running_reward = state[1] + self.env = deepcopy(state[0]) + obs = np.array(list(self.env.unwrapped.state)) + return {"obs": obs, "action_mask": np.array([1, 1], dtype=np.int8)} + + def get_state(self): + return deepcopy(self.env), self.running_reward diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/cartpole_with_dict_observation_space.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/cartpole_with_dict_observation_space.py new file mode 100644 index 0000000000000000000000000000000000000000..e334f09296da31ecf96fb044cf52891a13f3d11b --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/cartpole_with_dict_observation_space.py @@ -0,0 +1,74 @@ +import gymnasium as gym +from gymnasium.envs.classic_control import CartPoleEnv +import numpy as np + + +class CartPoleWithDictObservationSpace(CartPoleEnv): + """CartPole gym environment that has a dict observation space. + + However, otherwise, the information content in each observation remains the same. 
+ + https://github.com/Farama-Foundation/Gymnasium/blob/main/gymnasium/envs/classic_control/cartpole.py # noqa + + The new observation space looks as follows (a little quirky, but this is + for testing purposes only): + + gym.spaces.Dict({ + "x-pos": [x-pos], + "angular-pos": gym.spaces.Dict({"test": [angular-pos]}), + "velocs": gym.spaces.Tuple([x-veloc, angular-veloc]), + }) + """ + + def __init__(self, config=None): + super().__init__() + + # Fix our observation-space as described above. + low = self.observation_space.low + high = self.observation_space.high + + # Test as many quirks and oddities as possible: Dict, Dict inside a Dict, + # Tuple inside a Dict, and both (1,)-shapes as well as ()-shapes for Boxes. + # Also add a random discrete variable here. + self.observation_space = gym.spaces.Dict( + { + "x-pos": gym.spaces.Box(low[0], high[0], (1,), dtype=np.float32), + "angular-pos": gym.spaces.Dict( + { + "value": gym.spaces.Box(low[2], high[2], (), dtype=np.float32), + # Add some random non-essential information. 
+ "some_random_stuff": gym.spaces.Discrete(3), + } + ), + "velocs": gym.spaces.Tuple( + [ + # x-veloc + gym.spaces.Box(low[1], high[1], (1,), dtype=np.float32), + # angular-veloc + gym.spaces.Box(low[3], high[3], (), dtype=np.float32), + ] + ), + } + ) + + def step(self, action): + next_obs, reward, done, truncated, info = super().step(action) + return self._compile_current_obs(next_obs), reward, done, truncated, info + + def reset(self, *, seed=None, options=None): + init_obs, init_info = super().reset(seed=seed, options=options) + return self._compile_current_obs(init_obs), init_info + + def _compile_current_obs(self, original_cartpole_obs): + # original_cartpole_obs is [x-pos, x-veloc, angle, angle-veloc] + return { + "x-pos": np.array([original_cartpole_obs[0]], np.float32), + "angular-pos": { + "value": original_cartpole_obs[2], + "some_random_stuff": np.random.randint(3), + }, + "velocs": ( + np.array([original_cartpole_obs[1]], np.float32), + np.array(original_cartpole_obs[3], np.float32), + ), + } diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/cartpole_with_large_observation_space.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/cartpole_with_large_observation_space.py new file mode 100644 index 0000000000000000000000000000000000000000..162db205658bb9bb78426300b23a6d90913fc89f --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/cartpole_with_large_observation_space.py @@ -0,0 +1,69 @@ +import gymnasium as gym +from gymnasium.envs.classic_control import CartPoleEnv +import numpy as np + + +class CartPoleWithLargeObservationSpace(CartPoleEnv): + """CartPole gym environment that has a large dict observation space. + + However, otherwise, the information content in each observation remains the same. 
+ + https://github.com/Farama-Foundation/Gymnasium/blob/main/gymnasium/envs/classic_control/cartpole.py # noqa + + The new observation space looks as follows (a little quirky, but this is + for testing purposes only): + + gym.spaces.Dict({ + "1": gym.spaces.Tuple(( + gym.spaces.Discrete(100), + gym.spaces.Box(0, 256, shape=(30,), dtype=float32), + )), + "2": gym.spaces.Tuple(( + gym.spaces.Discrete(100), + gym.spaces.Box(0, 256, shape=(30,), dtype=float32), + )), + "3": ... + "actual-obs": gym.spaces.Box(-inf, inf, (4,), float32), + }) + """ + + def __init__(self, config=None): + super().__init__() + + # Fix our observation-space as described above. + low = self.observation_space.low + high = self.observation_space.high + + # Test as many quirks and oddities as possible: Dict, Dict inside a Dict, + # Tuple inside a Dict, and both (1,)-shapes as well as ()-shapes for Boxes. + # Also add a random discrete variable here. + spaces = { + str(i): gym.spaces.Tuple( + ( + gym.spaces.Discrete(100), + gym.spaces.Box(0, 256, shape=(30,), dtype=np.float32), + ) + ) + for i in range(100) + } + spaces.update( + { + "actually-useful-stuff": ( + gym.spaces.Box(low[0], high[0], (4,), np.float32) + ) + } + ) + self.observation_space = gym.spaces.Dict(spaces) + + def step(self, action): + next_obs, reward, done, truncated, info = super().step(action) + return self._compile_current_obs(next_obs), reward, done, truncated, info + + def reset(self, *, seed=None, options=None): + init_obs, init_info = super().reset(seed=seed, options=options) + return self._compile_current_obs(init_obs), init_info + + def _compile_current_obs(self, original_cartpole_obs): + return { + str(i): self.observation_space.spaces[str(i)].sample() for i in range(100) + } | {"actually-useful-stuff": original_cartpole_obs} diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/cartpole_with_protobuf_observation_space.py 
b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/cartpole_with_protobuf_observation_space.py new file mode 100644 index 0000000000000000000000000000000000000000..f88b802d37a08ddf2c0972ace4eba8e8315521ea --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/cartpole_with_protobuf_observation_space.py @@ -0,0 +1,79 @@ +import gymnasium as gym +from gymnasium.envs.classic_control import CartPoleEnv +import numpy as np + +from ray.rllib.examples.envs.classes.utils.cartpole_observations_proto import ( + CartPoleObservation, +) + + +class CartPoleWithProtobufObservationSpace(CartPoleEnv): + """CartPole gym environment that has a protobuf observation space. + + Sometimes, it is more performant for an environment to publish its observations + as a protobuf message (instead of a heavily nested Dict). + + The protobuf message used here is originally defined in the + `./utils/cartpole_observations.proto` file. We converted this file into a python + importable module by compiling it with: + + `protoc --python_out=. cartpole_observations.proto` + + .. which yielded the `cartpole_observations_proto.py` file in the same directory + (we import this file's `CartPoleObservation` message here). + + The new observation space is a (binary) Box(0, 255, ([len of protobuf],), uint8). + + A ConnectorV2 pipeline or simpler gym.Wrapper will have to be used to convert this + observation format into an NN-readable (e.g. float32) 1D tensor. 
+ """ + + def __init__(self, config=None): + super().__init__() + dummy_obs = self._convert_observation_to_protobuf( + np.array([1.0, 1.0, 1.0, 1.0]) + ) + bin_length = len(dummy_obs) + self.observation_space = gym.spaces.Box(0, 255, (bin_length,), np.uint8) + + def step(self, action): + observation, reward, terminated, truncated, info = super().step(action) + proto_observation = self._convert_observation_to_protobuf(observation) + return proto_observation, reward, terminated, truncated, info + + def reset(self, **kwargs): + observation, info = super().reset(**kwargs) + proto_observation = self._convert_observation_to_protobuf(observation) + return proto_observation, info + + def _convert_observation_to_protobuf(self, observation): + x_pos, x_veloc, angle_pos, angle_veloc = observation + + # Create the Protobuf message + cartpole_observation = CartPoleObservation() + cartpole_observation.x_pos = x_pos + cartpole_observation.x_veloc = x_veloc + cartpole_observation.angle_pos = angle_pos + cartpole_observation.angle_veloc = angle_veloc + + # Serialize to binary string. + return np.frombuffer(cartpole_observation.SerializeToString(), np.uint8) + + +if __name__ == "__main__": + env = CartPoleWithProtobufObservationSpace() + obs, info = env.reset() + + # Test loading a protobuf object with data from the obs binary string + # (uint8 ndarray). 
+ byte_str = obs.tobytes() + obs_protobuf = CartPoleObservation() + obs_protobuf.ParseFromString(byte_str) + print(obs_protobuf) + + terminated = truncated = False + while not terminated and not truncated: + action = env.action_space.sample() + obs, reward, terminated, truncated, info = env.step(action) + + print(obs) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/cliff_walking_wall_env.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/cliff_walking_wall_env.py new file mode 100644 index 0000000000000000000000000000000000000000..496e86e3f90782a6227c570c157e74c3c03741a6 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/cliff_walking_wall_env.py @@ -0,0 +1,71 @@ +import gymnasium as gym +from gymnasium import spaces + +ACTION_UP = 0 +ACTION_RIGHT = 1 +ACTION_DOWN = 2 +ACTION_LEFT = 3 + + +class CliffWalkingWallEnv(gym.Env): + """Modified version of the CliffWalking environment from Farama-Foundation's + Gymnasium with walls instead of a cliff. + + ### Description + The board is a 4x12 matrix, with (using NumPy matrix indexing): + - [3, 0] or obs==36 as the start at bottom-left + - [3, 11] or obs==47 as the goal at bottom-right + - [3, 1..10] or obs==37...46 as the cliff at bottom-center + + An episode terminates when the agent reaches the goal. + + ### Actions + There are 4 discrete deterministic actions: + - 0: move up + - 1: move right + - 2: move down + - 3: move left + You can also use the constants ACTION_UP, ACTION_RIGHT, ... defined above. + + ### Observations + There are 3x12 + 2 possible states, not including the walls. If an action + would move an agent into one of the walls, it simply stays in the same position. + + ### Reward + Each time step incurs -1 reward, except reaching the goal which gives +10 reward. 
+ """ + + def __init__(self, seed=42) -> None: + self.observation_space = spaces.Discrete(48) + self.action_space = spaces.Discrete(4) + self.observation_space.seed(seed) + self.action_space.seed(seed) + + def reset(self, *, seed=None, options=None): + self.position = 36 + return self.position, {} + + def step(self, action): + x = self.position // 12 + y = self.position % 12 + # UP + if action == ACTION_UP: + x = max(x - 1, 0) + # RIGHT + elif action == ACTION_RIGHT: + if self.position != 36: + y = min(y + 1, 11) + # DOWN + elif action == ACTION_DOWN: + if self.position < 25 or self.position > 34: + x = min(x + 1, 3) + # LEFT + elif action == ACTION_LEFT: + if self.position != 47: + y = max(y - 1, 0) + else: + raise ValueError(f"action {action} not in {self.action_space}") + self.position = x * 12 + y + done = self.position == 47 + reward = -1 if not done else 10 + return self.position, reward, done, False, {} diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/correlated_actions_env.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/correlated_actions_env.py new file mode 100644 index 0000000000000000000000000000000000000000..8b0bdb882fc0bb05fe801ed4999fa8a1a74f5585 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/correlated_actions_env.py @@ -0,0 +1,79 @@ +from typing import Any, Dict, Optional + +import gymnasium as gym +import numpy as np + + +class CorrelatedActionsEnv(gym.Env): + """Environment that can only be solved through an autoregressive action model. + + In each step, the agent observes a random number (between -1 and 1) and has + to choose two actions, a1 (discrete, 0, 1, or 2) and a2 (cont. between -1 and 1). + + The reward is constructed such that actions need to be correlated to succeed. It's + impossible for the network to learn each action head separately. 
+ + There are two reward components: + The first is the negative absolute value of the delta between 1.0 and the sum of + obs + a1. For example, if obs is -0.3 and a1 was sampled to be 1, then the value of + the first reward component is: + r1 = -abs(1.0 - [obs+a1]) = -abs(1.0 - (-0.3 + 1)) = -abs(0.3) = -0.3 + The second reward component is computed as the negative absolute value + of `obs + a1 + a2`. For example, if obs is 0.5, a1 was sampled to be 0, + and a2 was sampled to be -0.7, then the value of the second reward component is: + r2 = -abs(obs + a1 + a2) = -abs(0.5 + 0 - 0.7)) = -abs(-0.2) = -0.2 + + Because of this specific reward function, the agent must learn to optimally sample + a1 based on the observation and to optimally sample a2, based on the observation + AND the sampled value of a1. + + One way to effectively learn this is through correlated action + distributions, e.g., in examples/actions/auto_regressive_actions.py + + The game ends after the first step. + """ + + def __init__(self, config=None): + super().__init__() + # Observation space (single continuous value between -1. and 1.). + self.observation_space = gym.spaces.Box(-1.0, 1.0, shape=(1,), dtype=np.float32) + + # Action space (discrete action a1 and continuous action a2). + self.action_space = gym.spaces.Tuple( + [gym.spaces.Discrete(3), gym.spaces.Box(-2.0, 2.0, (1,), np.float32)] + ) + + # Internal state for the environment (e.g., could represent a factor + # influencing the relationship) + self.obs = None + + def reset( + self, seed: Optional[int] = None, options: Optional[Dict[str, Any]] = None + ): + """Reset the environment to an initial state.""" + super().reset(seed=seed, options=options) + + # Randomly initialize the observation between -1 and 1. + self.obs = np.random.uniform(-1, 1, size=(1,)) + + return self.obs, {} + + def step(self, action): + """Apply the autoregressive action and return step information.""" + + # Extract individual action components, a1 and a2. 
+ a1, a2 = action + a2 = a2[0] # dissolve shape=(1,) + + # r1 depends on how well a1 is aligned to obs: + r1 = -abs(1.0 - (self.obs[0] + a1)) + # r2 depends on how well a2 is aligned to both, obs and a1. + r2 = -abs(self.obs[0] + a1 + a2) + + reward = r1 + r2 + + # Optionally: add some noise or complexity to the reward function + # reward += np.random.normal(0, 0.01) # Small noise can be added + + # Terminate after each step (no episode length in this simple example) + return self.obs, reward, True, False, {} diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/d4rl_env.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/d4rl_env.py new file mode 100644 index 0000000000000000000000000000000000000000..f77434589b92b05635e40907d2686c5bd8e82ee3 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/d4rl_env.py @@ -0,0 +1,46 @@ +""" +8 Environments from D4RL Environment. +Use fully qualified class-path in your configs: +e.g. "env": "ray.rllib.examples.envs.classes.d4rl_env.halfcheetah_random". +""" + +import gymnasium as gym + +try: + import d4rl + + d4rl.__name__ # Fool LINTer. 
def halfcheetah_medium():
    """Return the D4RL `halfcheetah-medium-v0` environment."""
    env_id = "halfcheetah-medium-v0"
    return gym.make(env_id)
+ """ + + def __init__(self, config=None): + config = config or {} + self.action_space = gym.spaces.Discrete(2) + self.observation_space = gym.spaces.Box(0, 100, (1,), dtype=np.float32) + self.start_at_t = int(config.get("start_at_t", 0)) + self.i = self.start_at_t + + def reset(self, *, seed=None, options=None): + self.i = self.start_at_t + return self._get_obs(), {} + + def step(self, action): + self.i += 1 + terminated = False + truncated = self.i >= 15 + self.start_at_t + return self._get_obs(), float(self.i % 3), terminated, truncated, {} + + def _get_obs(self): + return np.array([self.i], dtype=np.float32) + + +class MultiAgentDebugCounterEnv(MultiAgentEnv): + def __init__(self, config): + super().__init__() + self.num_agents = config["num_agents"] + self.base_episode_len = config.get("base_episode_len", 103) + + # Observation dims: + # 0=agent ID. + # 1=episode ID (0.0 for obs after reset). + # 2=env ID (0.0 for obs after reset). + # 3=ts (of the agent). + self.observation_space = gym.spaces.Dict( + { + aid: gym.spaces.Box(float("-inf"), float("inf"), (4,)) + for aid in range(self.num_agents) + } + ) + + # Actions are always: + # (episodeID, envID) as floats. 
+ self.action_space = gym.spaces.Dict( + { + aid: gym.spaces.Box(-float("inf"), float("inf"), shape=(2,)) + for aid in range(self.num_agents) + } + ) + + self.timesteps = [0] * self.num_agents + self.terminateds = set() + self.truncateds = set() + + def reset(self, *, seed=None, options=None): + self.timesteps = [0] * self.num_agents + self.terminateds = set() + self.truncateds = set() + return { + i: np.array([i, 0.0, 0.0, 0.0], dtype=np.float32) + for i in range(self.num_agents) + }, {} + + def step(self, action_dict): + obs, rew, terminated, truncated = {}, {}, {}, {} + for i, action in action_dict.items(): + self.timesteps[i] += 1 + obs[i] = np.array([i, action[0], action[1], self.timesteps[i]]) + rew[i] = self.timesteps[i] % 3 + terminated[i] = False + truncated[i] = ( + True if self.timesteps[i] > self.base_episode_len + i else False + ) + if terminated[i]: + self.terminateds.add(i) + if truncated[i]: + self.truncateds.add(i) + terminated["__all__"] = len(self.terminateds) == self.num_agents + truncated["__all__"] = len(self.truncateds) == self.num_agents + return obs, rew, terminated, truncated, {} diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/deterministic_envs.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/deterministic_envs.py new file mode 100644 index 0000000000000000000000000000000000000000..51f41f29fb3b0981f4d97106b678eec4e230eccc --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/deterministic_envs.py @@ -0,0 +1,13 @@ +import gymnasium as gym + + +def create_cartpole_deterministic(config): + env = gym.make("CartPole-v1") + env.reset(seed=config.get("seed", 0)) + return env + + +def create_pendulum_deterministic(config): + env = gym.make("Pendulum-v1") + env.reset(seed=config.get("seed", 0)) + return env diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/dm_control_suite.py 
def cheetah_run(
    from_pixels=True, height=64, width=64, frame_skip=2, channels_first=True
):
    """Create the DM-Control Suite `cheetah / run` task wrapped as a DMCEnv."""
    render_kwargs = dict(
        from_pixels=from_pixels,
        height=height,
        width=width,
        frame_skip=frame_skip,
        channels_first=channels_first,
    )
    return DMCEnv("cheetah", "run", **render_kwargs)
def pendulum_swingup(
    from_pixels=True, height=64, width=64, frame_skip=2, channels_first=True
):
    """Create the DM-Control Suite `pendulum / swingup` task wrapped as a DMCEnv."""
    render_kwargs = dict(
        from_pixels=from_pixels,
        height=height,
        width=width,
        frame_skip=frame_skip,
        channels_first=channels_first,
    )
    return DMCEnv("pendulum", "swingup", **render_kwargs)
+ self._handler = ray.get_actor(env_config.get("param_server", "param-server")) + self.rng_seed = None + self.np_random, _ = seeding.np_random(self.rng_seed) + + def reset(self, *, seed=None, options=None): + if seed is not None: + self.rng_seed = int(seed) + self.np_random, _ = seeding.np_random(seed) + print( + f"Seeding env (worker={self.env_config.worker_index}) " f"with {seed}" + ) + + # Pass in our RNG to guarantee no race conditions. + # If `self._handler` had its own RNG, this may clash with other + # envs trying to use the same param-server. + params = ray.get(self._handler.get_params.remote(self.np_random)) + + # IMPORTANT: Advance the state of our RNG (self._rng was passed + # above via ray (serialized) and thus not altered locally here!). + # Or create a new RNG from another random number: + # Seed the RNG with a deterministic seed if set, otherwise, create + # a random one. + new_seed = int( + self.np_random.integers(0, 1000000) if not self.rng_seed else self.rng_seed + ) + self.np_random, _ = seeding.np_random(new_seed) + + print( + f"Env worker-idx={self.env_config.worker_index} " + f"mass={params['MASSCART']}" + ) + + self.masscart = params["MASSCART"] + self.total_mass = self.masspole + self.masscart + self.polemass_length = self.masspole * self.length + + return super().reset() diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/env_with_subprocess.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/env_with_subprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..424f1eb095074a834f39d9159581dfddb190996f --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/env_with_subprocess.py @@ -0,0 +1,42 @@ +import atexit +import gymnasium as gym +from gymnasium.spaces import Discrete +import os +import subprocess + + +class EnvWithSubprocess(gym.Env): + """An env that spawns a subprocess.""" + + # Dummy command to run as a subprocess with a unique name 
+ UNIQUE_CMD = "sleep 20" + + def __init__(self, config): + self.UNIQUE_FILE_0 = config["tmp_file1"] + self.UNIQUE_FILE_1 = config["tmp_file2"] + self.UNIQUE_FILE_2 = config["tmp_file3"] + self.UNIQUE_FILE_3 = config["tmp_file4"] + + self.action_space = Discrete(2) + self.observation_space = Discrete(2) + # Subprocess that should be cleaned up. + self.subproc = subprocess.Popen(self.UNIQUE_CMD.split(" "), shell=False) + self.config = config + # Exit handler should be called. + atexit.register(lambda: self.subproc.kill()) + if config.worker_index == 0: + atexit.register(lambda: os.unlink(self.UNIQUE_FILE_0)) + else: + atexit.register(lambda: os.unlink(self.UNIQUE_FILE_1)) + + def close(self): + if self.config.worker_index == 0: + os.unlink(self.UNIQUE_FILE_2) + else: + os.unlink(self.UNIQUE_FILE_3) + + def reset(self, *, seed=None, options=None): + return 0, {} + + def step(self, action): + return 0, 0, True, False, {} diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/fast_image_env.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/fast_image_env.py new file mode 100644 index 0000000000000000000000000000000000000000..1eaad9a8fe81966ad42971b598bfcf6d125c11c7 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/fast_image_env.py @@ -0,0 +1,20 @@ +import gymnasium as gym +from gymnasium.spaces import Box, Discrete +import numpy as np + + +class FastImageEnv(gym.Env): + def __init__(self, config): + self.zeros = np.zeros((84, 84, 4)) + self.action_space = Discrete(2) + self.observation_space = Box(0.0, 1.0, shape=(84, 84, 4), dtype=np.float32) + self.i = 0 + + def reset(self, *, seed=None, options=None): + self.i = 0 + return self.zeros, {} + + def step(self, action): + self.i += 1 + done = truncated = self.i > 1000 + return self.zeros, 1, done, truncated, {} diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/gpu_requiring_env.py 
b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/gpu_requiring_env.py new file mode 100644 index 0000000000000000000000000000000000000000..e8e08f3489a901dd6b4cc3612c0e520c5684462f --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/gpu_requiring_env.py @@ -0,0 +1,37 @@ +import numpy as np + +import ray +from ray.rllib.examples.envs.classes.simple_corridor import SimpleCorridor +from ray.rllib.utils.framework import try_import_torch + +torch, _ = try_import_torch() + + +class GPURequiringEnv(SimpleCorridor): + """A dummy env that requires a GPU in order to work. + + The env here is a simple corridor env that additionally simulates a GPU + check in its constructor via `ray.get_gpu_ids()`. If this returns an + empty list, we raise an error. + + To make this env work, use `num_gpus_per_env_runner > 0` (RolloutWorkers + requesting this many GPUs each) and - maybe - `num_gpus > 0` in case + your local worker/driver must have an env as well. However, this is + only the case if `create_env_on_driver`=True (default is False). + """ + + def __init__(self, config=None): + super().__init__(config) + + # Fake-require some GPUs (at least one). + # If your local worker's env (`create_env_on_driver`=True) does not + # necessarily require a GPU, you can perform the below assertion only + # if `config.worker_index != 0`. + gpus_available = ray.get_gpu_ids() + print(f"{type(self).__name__} can see GPUs={gpus_available}") + + # Create a dummy tensor on the GPU. 
+ if len(gpus_available) > 0 and torch: + self._tensor = torch.from_numpy(np.random.random_sample(size=(42, 42))).to( + f"cuda:{gpus_available[0]}" + ) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/look_and_push.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/look_and_push.py new file mode 100644 index 0000000000000000000000000000000000000000..46c77b620e8a472499bf23a32c8da178c49a929e --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/look_and_push.py @@ -0,0 +1,65 @@ +import gymnasium as gym +import numpy as np + + +class LookAndPush(gym.Env): + """Memory-requiring Env: Best sequence of actions depends on prev. states. + + Optimal behavior: + 0) a=0 -> observe next state (s'), which is the "hidden" state. + If a=1 here, the hidden state is not observed. + 1) a=1 to always jump to s=2 (not matter what the prev. state was). + 2) a=1 to move to s=3. + 3) a=1 to move to s=4. + 4) a=0 OR 1 depending on s' observed after 0): +10 reward and done. + otherwise: -10 reward and done. 
+ """ + + def __init__(self): + self.action_space = gym.spaces.Discrete(2) + self.observation_space = gym.spaces.Discrete(5) + self._state = None + self._case = None + + def reset(self, *, seed=None, options=None): + self._state = 2 + self._case = np.random.choice(2) + return self._state, {} + + def step(self, action): + assert self.action_space.contains(action) + + if self._state == 4: + if action and self._case: + return self._state, 10.0, True, {} + else: + return self._state, -10, True, {} + else: + if action: + if self._state == 0: + self._state = 2 + else: + self._state += 1 + elif self._state == 2: + self._state = self._case + + return self._state, -1, False, False, {} + + +class OneHot(gym.Wrapper): + def __init__(self, env): + super(OneHot, self).__init__(env) + self.observation_space = gym.spaces.Box(0.0, 1.0, (env.observation_space.n,)) + + def reset(self, *, seed=None, options=None): + obs, info = self.env.reset(seed=seed, options=options) + return self._encode_obs(obs), info + + def step(self, action): + obs, reward, terminated, truncated, info = self.env.step(action) + return self._encode_obs(obs), reward, terminated, truncated, info + + def _encode_obs(self, obs): + new_obs = np.ones(self.env.observation_space.n) + new_obs[obs] = 1.0 + return new_obs diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/memory_leaking_env.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/memory_leaking_env.py new file mode 100644 index 0000000000000000000000000000000000000000..c97edb296691f14a9566c7e6645d47049410cd9f --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/memory_leaking_env.py @@ -0,0 +1,35 @@ +import logging +import uuid + +from ray.rllib.examples.envs.classes.random_env import RandomEnv +from ray.rllib.utils.annotations import override + +logger = logging.getLogger(__name__) + + +class MemoryLeakingEnv(RandomEnv): + """An env that leaks very little memory. 
class MockEnv(gym.Env):
    """Mock environment for testing purposes.

    Always observes 0 and yields a reward of 1.0; actions are ignored.
    The episode length is configurable via the constructor.
    """

    def __init__(self, episode_length, config=None):
        self.episode_length = episode_length
        self.config = config
        # Step counter within the current episode.
        self.i = 0
        self.observation_space = gym.spaces.Discrete(1)
        self.action_space = gym.spaces.Discrete(2)

    def reset(self, *, seed=None, options=None):
        self.i = 0
        # Constant observation, empty infos.
        return 0, {}

    def step(self, action):
        self.i += 1
        # Flag both terminated and truncated once the configured length is hit.
        episode_over = self.i >= self.episode_length
        return 0, 1.0, episode_over, episode_over, {}
+ """ + + metadata = { + "render.modes": ["rgb_array"], + } + render_mode: Optional[str] = "rgb_array" + + def __init__(self, episode_length): + self.episode_length = episode_length + self.i = 0 + self.observation_space = gym.spaces.Discrete(self.episode_length + 1) + self.action_space = gym.spaces.Discrete(2) + self.rng_seed = None + + def reset(self, *, seed=None, options=None): + self.i = 0 + if seed is not None: + self.rng_seed = seed + return self.i, {} + + def step(self, action): + self.i += 1 + terminated = truncated = self.i >= self.episode_length + return self.i, 100.0, terminated, truncated, {} + + def render(self): + # Just generate a random image here for demonstration purposes. + # Also see `gym/envs/classic_control/cartpole.py` for + # an example on how to use a Viewer object. + return np.random.randint(0, 256, size=(300, 400, 3), dtype=np.uint8) + + +class MockEnv3(gym.Env): + """Mock environment for testing purposes. + + Observation=ts (discrete space!), reward=100.0, episode-len is + configurable. Actions are ignored. + """ + + def __init__(self, episode_length): + self.episode_length = episode_length + self.i = 0 + self.observation_space = gym.spaces.Discrete(100) + self.action_space = gym.spaces.Discrete(2) + + def reset(self, *, seed=None, options=None): + self.i = 0 + return self.i, {"timestep": 0} + + def step(self, action): + self.i += 1 + terminated = truncated = self.i >= self.episode_length + return self.i, self.i, terminated, truncated, {"timestep": self.i} + + +class VectorizedMockEnv(VectorEnv): + """Vectorized version of the MockEnv. + + Contains `num_envs` MockEnv instances, each one having its own + `episode_length` horizon. 
+ """ + + def __init__(self, episode_length, num_envs): + super().__init__( + observation_space=gym.spaces.Discrete(1), + action_space=gym.spaces.Discrete(2), + num_envs=num_envs, + ) + self.envs = [MockEnv(episode_length) for _ in range(num_envs)] + + @override(VectorEnv) + def vector_reset(self, *, seeds=None, options=None): + seeds = seeds or [None] * self.num_envs + options = options or [None] * self.num_envs + obs_and_infos = [ + e.reset(seed=seeds[i], options=options[i]) for i, e in enumerate(self.envs) + ] + return [oi[0] for oi in obs_and_infos], [oi[1] for oi in obs_and_infos] + + @override(VectorEnv) + def reset_at(self, index, *, seed=None, options=None): + return self.envs[index].reset(seed=seed, options=options) + + @override(VectorEnv) + def vector_step(self, actions): + obs_batch, rew_batch, terminated_batch, truncated_batch, info_batch = ( + [], + [], + [], + [], + [], + ) + for i in range(len(self.envs)): + obs, rew, terminated, truncated, info = self.envs[i].step(actions[i]) + obs_batch.append(obs) + rew_batch.append(rew) + terminated_batch.append(terminated) + truncated_batch.append(truncated) + info_batch.append(info) + return obs_batch, rew_batch, terminated_batch, truncated_batch, info_batch + + @override(VectorEnv) + def get_sub_environments(self): + return self.envs + + +class MockVectorEnv(VectorEnv): + """A custom vector env that uses a single(!) CartPole sub-env. + + However, this env pretends to be a vectorized one to illustrate how one + could create custom VectorEnvs w/o the need for actual vectorizations of + sub-envs under the hood. 
+ """ + + def __init__(self, episode_length, mocked_num_envs): + self.env = gym.make("CartPole-v1") + super().__init__( + observation_space=self.env.observation_space, + action_space=self.env.action_space, + num_envs=mocked_num_envs, + ) + self.episode_len = episode_length + self.ts = 0 + + @override(VectorEnv) + def vector_reset(self, *, seeds=None, options=None): + # Since we only have one underlying sub-environment, just use the first seed + # and the first options dict (the user of this env thinks, there are + # `self.num_envs` sub-environments and sends that many seeds/options). + seeds = seeds or [None] + options = options or [None] + obs, infos = self.env.reset(seed=seeds[0], options=options[0]) + # Simply repeat the single obs/infos to pretend we really have + # `self.num_envs` sub-environments. + return ( + [obs for _ in range(self.num_envs)], + [infos for _ in range(self.num_envs)], + ) + + @override(VectorEnv) + def reset_at(self, index, *, seed=None, options=None): + self.ts = 0 + return self.env.reset(seed=seed, options=options) + + @override(VectorEnv) + def vector_step(self, actions): + self.ts += 1 + # Apply all actions sequentially to the same env. + # Whether this would make a lot of sense is debatable. + obs_batch, rew_batch, terminated_batch, truncated_batch, info_batch = ( + [], + [], + [], + [], + [], + ) + for i in range(self.num_envs): + obs, rew, terminated, truncated, info = self.env.step(actions[i]) + # Artificially truncate once time step limit has been reached. + # Note: Also terminate/truncate, when underlying CartPole is + # terminated/truncated. 
# NOTE(review): the original chunk opens mid-definition of a VectorEnv
# subclass whose header lies outside this view; that fragment is not
# reproduced here. The three bandit envs below are reconstructed in full.
import copy
import random

import gymnasium as gym
from gymnasium.spaces import Box, Discrete
import numpy as np


class SimpleContextualBandit(gym.Env):
    """Simple env w/ 2 states and 3 actions (arms): 0, 1, and 2.

    Episodes last only for one timestep, possible observations are:
    [-1.0, 1.0] and [1.0, -1.0], where the first element is the "current
    context". The highest reward (+10.0) is received for selecting arm 0 for
    context=1.0 and arm 2 for context=-1.0. Action 1 always yields 0.0 reward.
    """

    def __init__(self, config=None):
        self.action_space = Discrete(3)
        self.observation_space = Box(low=-1.0, high=1.0, shape=(2,))
        self.cur_context = None

    def reset(self, *, seed=None, options=None):
        # Pick a random context; the second obs element is its negation.
        self.cur_context = random.choice([-1.0, 1.0])
        return np.array([self.cur_context, -self.cur_context]), {}

    def step(self, action):
        # Reward table per context; arm 1 is always neutral (0.0).
        rewards_for_context = {
            -1.0: [-10, 0, 10],
            1.0: [10, 0, -10],
        }
        reward = rewards_for_context[self.cur_context][action]
        # Single-step episodes: always terminated=True.
        return (
            np.array([-self.cur_context, self.cur_context]),
            reward,
            True,
            False,
            {"regret": 10 - reward},
        )


class LinearDiscreteEnv(gym.Env):
    """Samples data from linearly parameterized arms.

    The reward for context X and arm i is given by X^T * theta_i, for some
    latent set of parameters {theta_i : i = 1, ..., k}.
    The thetas are sampled uniformly at random, the contexts are Gaussian,
    and Gaussian noise is added to the rewards.
    """

    DEFAULT_CONFIG_LINEAR = {
        "feature_dim": 8,
        "num_actions": 4,
        "reward_noise_std": 0.01,
    }

    def __init__(self, config=None):
        self.config = copy.copy(self.DEFAULT_CONFIG_LINEAR)
        # `isinstance` instead of `type(...) is dict` (idiomatic; also
        # accepts dict subclasses such as RLlib's EnvContext).
        if isinstance(config, dict):
            self.config.update(config)

        self.feature_dim = self.config["feature_dim"]
        self.num_actions = self.config["num_actions"]
        self.sigma = self.config["reward_noise_std"]

        self.action_space = Discrete(self.num_actions)
        self.observation_space = Box(low=-10, high=10, shape=(self.feature_dim,))

        # Latent, unit-norm arm parameters.
        self.thetas = np.random.uniform(-1, 1, (self.num_actions, self.feature_dim))
        self.thetas /= np.linalg.norm(self.thetas, axis=1, keepdims=True)

        self._elapsed_steps = 0
        self._current_context = None

    def _sample_context(self):
        return np.random.normal(scale=1 / 3, size=(self.feature_dim,))

    def reset(self, *, seed=None, options=None):
        self._current_context = self._sample_context()
        return self._current_context, {}

    def step(self, action):
        # NOTE(review): `_elapsed_steps` is initialized to 0 in __init__, so
        # this guard can only fire if a caller explicitly sets it to None.
        assert (
            self._elapsed_steps is not None
        ), "Cannot call env.step() before calling reset()"
        assert action < self.num_actions, "Invalid action."

        action = int(action)
        # Fixed: the step counter was never advanced (WheelBanditEnv does).
        self._elapsed_steps += 1
        context = self._current_context
        rewards = self.thetas.dot(context)

        opt_action = rewards.argmax()

        # Regret is measured against the noise-free optimal arm.
        regret = rewards.max() - rewards[action]

        # Add Gaussian noise.
        rewards += np.random.normal(scale=self.sigma, size=rewards.shape)

        reward = rewards[action]
        self._current_context = self._sample_context()
        return (
            self._current_context,
            reward,
            True,
            False,
            {"regret": regret, "opt_action": opt_action},
        )

    def render(self, mode="human"):
        raise NotImplementedError


class WheelBanditEnv(gym.Env):
    """Wheel bandit environment for 2D contexts
    (see https://arxiv.org/abs/1802.09127).
    """

    DEFAULT_CONFIG_WHEEL = {
        "delta": 0.5,
        "mu_1": 1.2,
        "mu_2": 1,
        "mu_3": 50,
        "std": 0.01,
    }

    feature_dim = 2
    num_actions = 5

    def __init__(self, config=None):
        self.config = copy.copy(self.DEFAULT_CONFIG_WHEEL)
        # Same isinstance fix as LinearDiscreteEnv.
        if isinstance(config, dict):
            self.config.update(config)

        self.delta = self.config["delta"]
        self.mu_1 = self.config["mu_1"]
        self.mu_2 = self.config["mu_2"]
        self.mu_3 = self.config["mu_3"]
        self.std = self.config["std"]

        self.action_space = Discrete(self.num_actions)
        self.observation_space = Box(low=-1, high=1, shape=(self.feature_dim,))

        # Mean reward of arm 0 is mu_1, of arms 1-4 is mu_2 (unless boosted).
        self.means = [self.mu_1] + 4 * [self.mu_2]
        self._elapsed_steps = 0
        self._current_context = None

    def _sample_context(self):
        # Rejection-sample a point from the unit disk.
        while True:
            state = np.random.uniform(-1, 1, self.feature_dim)
            if np.linalg.norm(state) <= 1:
                return state

    def reset(self, *, seed=None, options=None):
        self._current_context = self._sample_context()
        return self._current_context, {}

    def step(self, action):
        assert (
            self._elapsed_steps is not None
        ), "Cannot call env.step() before calling reset()"

        action = int(action)
        self._elapsed_steps += 1
        rewards = [
            np.random.normal(self.means[j], self.std)
            for j in range(self.num_actions)
        ]
        context = self._current_context
        r_big = np.random.normal(self.mu_3, self.std)

        if np.linalg.norm(context) >= self.delta:
            # Outer ring: the optimal arm depends on the quadrant.
            if context[0] > 0:
                if context[1] > 0:
                    rewards[1] = r_big  # first quadrant
                    opt_action = 1
                else:
                    rewards[4] = r_big  # fourth quadrant
                    opt_action = 4
            else:
                if context[1] > 0:
                    rewards[2] = r_big  # second quadrant
                    opt_action = 2
                else:
                    rewards[3] = r_big  # third quadrant
                    opt_action = 3
        else:
            # Inner disk (norm < delta): action 0 is optimal.
            opt_action = 0

        reward = rewards[action]
        regret = rewards[opt_action] - reward

        self._current_context = self._sample_context()
        return (
            self._current_context,
            reward,
            True,
            False,
            {"regret": regret, "opt_action": opt_action},
        )

    def render(self, mode="human"):
        raise NotImplementedError
import gymnasium as gym

from ray.rllib.env.multi_agent_env import MultiAgentEnv


class GuessTheNumberGame(MultiAgentEnv):
    """Two-player number guessing game.

    Agent 0 picks a number between 0 and MAX-1 at reset time. Agent 1 then
    has to find it by asking up to N questions via actions from
    MultiDiscrete([3, MAX]): the first component is the question type
    (0="is lower", 1="is higher", 2="is equal") and the second is the number
    compared against. The env answers yes (1) or no (0) through agent 1's
    reward. Every step agent 1 wastes gives agent 0 a reward of 1. If agent 1
    guesses the number exactly, it receives 100 and agent 0 receives -100;
    if the step budget runs out first, agent 0 receives 100 instead. The
    optimal policy for agent 1 converges to a binary search strategy.
    """

    MAX_NUMBER = 3
    MAX_STEPS = 20

    def __init__(self, config=None):
        super().__init__()
        # Fixed: the default `config=None` used to crash on `.get()` below.
        config = config or {}
        self._agent_ids = {0, 1}

        self.max_number = config.get("max_number", self.MAX_NUMBER)
        self.max_steps = config.get("max_steps", self.MAX_STEPS)

        self._number = None
        self.observation_space = gym.spaces.Discrete(2)
        self.action_space = gym.spaces.MultiDiscrete([3, self.max_number])

    def reset(self, *, seed=None, options=None):
        self._step = 0
        self._number = None
        # Agent 0 has to pick a number, so the returned obs does not matter.
        return {0: 0}, {}

    def step(self, action_dict):
        # Phase 1: agent 0 picks the number.
        agent_0_action = action_dict.get(0)
        if agent_0_action is not None:
            # Ignore the question-type part of the action; only the number
            # matters here.
            self._number = agent_0_action[1]
            # Next obs tells agent 1 to start guessing; rewards/dones are
            # reported for agent 0, who just acted.
            return (
                {1: 0},
                {0: 0},
                {0: False, "__all__": False},
                {0: False, "__all__": False},
                {},
            )

        if self._number is None:
            raise ValueError(
                "No number is selected by agent 0. Have you restarted "
                "the environment?"
            )

        # Phase 2: agent 1 guesses.
        direction, number = action_dict.get(1)
        info = {}
        # Agent 0 never needs to act again; agent 1 keeps guessing.
        obs = {1: 0}
        guessed_correctly = False
        terminated = {1: False, "__all__": False}
        truncated = {1: False, "__all__": False}
        # Every step agent 1 does not guess correctly, agent 0 gets reward 1.
        if direction == 0:  # lower
            reward = {1: int(number > self._number), 0: 1}
        elif direction == 1:  # higher
            reward = {1: int(number < self._number), 0: 1}
        else:  # equal
            # `bool()` guards against a numpy bool leaking into the dicts.
            guessed_correctly = bool(number == self._number)
            reward = {1: guessed_correctly * 100, 0: guessed_correctly * -100}
            terminated = {1: guessed_correctly, "__all__": guessed_correctly}

        self._step += 1
        if self._step >= self.max_steps:
            # Step budget exhausted: truncate; agent 0 wins if the number was
            # never guessed.
            truncated["__all__"] = True
            if not guessed_correctly:
                reward[0] = 100
        return obs, reward, terminated, truncated, info
import copy
from typing import Any, Dict

import chess as ch
import numpy as np
from pettingzoo import AECEnv
from pettingzoo.classic.chess.chess import raw_env as chess_v5

from ray.rllib.env.multi_agent_env import MultiAgentEnv


class MultiAgentChess(MultiAgentEnv):
    """MultiAgentEnv wrapper around the PettingZoo chess AEC environment.

    Exposes an AEC (actor-environment-cycle) game from the PettingZoo project
    (https://github.com/Farama-Foundation/PettingZoo) via the MultiAgentEnv
    public API. Important limitations:
    1. All agents must share identical action- and observation spaces
       (SuperSuit's padding wrappers can help otherwise).
    2. Environments are assumed to be positive-sum games (agents cooperate to
       maximize reward); standard algorithms aren't expected to work well in
       highly competitive games.

    `reset()` and `step()` only return entries for the agent whose turn it is
    next. Optionally, `config["random_start"]` (default 4) random half-moves
    are played onto a fresh board at every reset.
    """

    def __init__(
        self,
        config: Dict[Any, Any] = None,
        env: AECEnv = None,
    ):
        super().__init__()
        self.env = chess_v5() if env is None else env
        self.env.reset()

        # `setdefault` replaces the original try/except-KeyError dance.
        self.config = {} if config is None else config
        self.config.setdefault("random_start", 4)

        # Get first observation/action space, assuming all agents share them.
        self.observation_space = self.env.observation_space(self.env.agents[0])
        self.action_space = self.env.action_space(self.env.agents[0])

        assert all(
            self.env.observation_space(agent) == self.observation_space
            for agent in self.env.agents
        ), (
            "Observation spaces for all agents must be identical. Perhaps "
            "SuperSuit's pad_observations wrapper can help (usage: "
            "`supersuit.aec_wrappers.pad_observations(env)`"
        )

        assert all(
            self.env.action_space(agent) == self.action_space
            for agent in self.env.agents
        ), (
            "Action spaces for all agents must be identical. Perhaps "
            "SuperSuit's pad_action_space wrapper can help (usage: "
            "`supersuit.aec_wrappers.pad_action_space(env)`"
        )
        self._agent_ids = set(self.env.agents)

    def random_start(self, random_moves):
        """Reset the board and play `random_moves` uniformly random moves."""
        self.env.board = ch.Board()
        for _ in range(random_moves):
            self.env.board.push(np.random.choice(list(self.env.board.legal_moves)))
        return self.env.board

    def observe(self):
        # Observation of the agent to act next, plus a deep-copied env state.
        return {
            self.env.agent_selection: self.env.observe(self.env.agent_selection),
            "state": self.get_state(),
        }

    def reset(self, *args, **kwargs):
        self.env.reset()
        if self.config["random_start"] > 0:
            self.random_start(self.config["random_start"])
        return (
            {self.env.agent_selection: self.env.observe(self.env.agent_selection)},
            {self.env.agent_selection: {}},
        )

    def step(self, action):
        # Accept either a {agent_id: action} dict or a bare action.
        try:
            self.env.step(action[self.env.agent_selection])
        except (KeyError, IndexError):
            self.env.step(action)
        except AssertionError as e:
            # PettingZoo raises AssertionError on illegal moves; re-raise with
            # the offending action attached, keeping the original traceback
            # (the original printed the action and raised a bare error).
            raise AssertionError(f"Illegal action: {action}") from e

        obs_d = {}
        rew_d = {}
        done_d = {}
        truncated_d = {}
        info_d = {}
        while self.env.agents:
            obs, rew, done, trunc, info = self.env.last()
            agent = self.env.agent_selection
            obs_d[agent] = obs
            rew_d[agent] = rew
            done_d[agent] = done
            truncated_d[agent] = trunc
            info_d[agent] = info
            if self.env.terminations[self.env.agent_selection]:
                # Current agent is terminated: advance past it and flag the
                # whole episode as done.
                self.env.step(None)
                done_d["__all__"] = True
                truncated_d["__all__"] = True
            else:
                done_d["__all__"] = False
                truncated_d["__all__"] = False
                break

        return obs_d, rew_d, done_d, truncated_d, info_d

    def close(self):
        self.env.close()

    def seed(self, seed=None):
        self.env.seed(seed)

    def render(self, mode="human"):
        return self.env.render(mode)

    @property
    def agent_selection(self):
        return self.env.agent_selection

    @property
    def get_sub_environments(self):
        return self.env.unwrapped

    def get_state(self):
        # Deep copy so the caller gets an independent snapshot.
        return copy.deepcopy(self.env)

    def set_state(self, state):
        self.env = copy.deepcopy(state)
        return self.env.observe(self.env.agent_selection)
import copy
from typing import Any, Dict

from pettingzoo import AECEnv
from pettingzoo.classic.connect_four_v3 import raw_env as connect_four_v3

from ray.rllib.env.multi_agent_env import MultiAgentEnv


class MultiAgentConnect4(MultiAgentEnv):
    """MultiAgentEnv wrapper around the PettingZoo Connect-Four AEC env.

    Exposes an AEC (actor-environment-cycle) game from the PettingZoo project
    (https://github.com/Farama-Foundation/PettingZoo) via the MultiAgentEnv
    public API. Important limitations:
    1. All agents must share identical action- and observation spaces
       (SuperSuit's padding wrappers can help otherwise).
    2. Environments are assumed to be positive-sum games (agents cooperate to
       maximize reward); standard algorithms aren't expected to work well in
       highly competitive games.

    `reset()` and `step()` only return entries for the agent whose turn it is
    next.
    """

    def __init__(
        self,
        config: Dict[Any, Any] = None,
        env: AECEnv = None,
    ):
        super().__init__()
        self.env = connect_four_v3() if env is None else env
        self.env.reset()

        # Kept for API parity with MultiAgentChess; currently unused here.
        self.config = config
        # Get first observation/action space, assuming all agents share them.
        self.observation_space = self.env.observation_space(self.env.agents[0])
        self.action_space = self.env.action_space(self.env.agents[0])

        assert all(
            self.env.observation_space(agent) == self.observation_space
            for agent in self.env.agents
        ), (
            "Observation spaces for all agents must be identical. Perhaps "
            "SuperSuit's pad_observations wrapper can help (usage: "
            "`supersuit.aec_wrappers.pad_observations(env)`"
        )

        assert all(
            self.env.action_space(agent) == self.action_space
            for agent in self.env.agents
        ), (
            "Action spaces for all agents must be identical. Perhaps "
            "SuperSuit's pad_action_space wrapper can help (usage: "
            "`supersuit.aec_wrappers.pad_action_space(env)`"
        )
        self._agent_ids = set(self.env.agents)

    def observe(self):
        # Observation of the agent to act next, plus a deep-copied env state.
        return {
            self.env.agent_selection: self.env.observe(self.env.agent_selection),
            "state": self.get_state(),
        }

    def reset(self, *args, **kwargs):
        self.env.reset()
        return (
            {self.env.agent_selection: self.env.observe(self.env.agent_selection)},
            {self.env.agent_selection: {}},
        )

    def step(self, action):
        # Accept either a {agent_id: action} dict or a bare action.
        try:
            self.env.step(action[self.env.agent_selection])
        except (KeyError, IndexError):
            self.env.step(action)
        except AssertionError as e:
            # PettingZoo raises AssertionError on illegal moves; re-raise with
            # the offending action attached, keeping the original traceback.
            raise AssertionError(f"Illegal action: {action}") from e

        obs_d = {}
        rew_d = {}
        done_d = {}
        trunc_d = {}
        info_d = {}
        while self.env.agents:
            obs, rew, done, trunc, info = self.env.last()
            agent = self.env.agent_selection
            obs_d[agent] = obs
            rew_d[agent] = rew
            done_d[agent] = done
            trunc_d[agent] = trunc
            info_d[agent] = info
            if self.env.terminations[self.env.agent_selection]:
                # Current agent is terminated: advance past it and flag the
                # whole episode as done.
                self.env.step(None)
                done_d["__all__"] = True
                trunc_d["__all__"] = True
            else:
                done_d["__all__"] = False
                trunc_d["__all__"] = False
                break

        return obs_d, rew_d, done_d, trunc_d, info_d

    def close(self):
        self.env.close()

    def seed(self, seed=None):
        self.env.seed(seed)

    def render(self, mode="human"):
        return self.env.render(mode)

    @property
    def agent_selection(self):
        return self.env.agent_selection

    @property
    def get_sub_environments(self):
        return self.env.unwrapped

    def get_state(self):
        # Deep copy so the caller gets an independent snapshot.
        return copy.deepcopy(self.env)

    def set_state(self, state):
        self.env = copy.deepcopy(state)
        return self.env.observe(self.env.agent_selection)
# __sphinx_doc_1_begin__
import gymnasium as gym

from ray.rllib.env.multi_agent_env import MultiAgentEnv


class RockPaperScissors(MultiAgentEnv):
    """Two-player environment for the famous rock paper scissors game.

    # __sphinx_doc_1_end__
    Optionally, the "Sheldon Cooper extension" can be activated by passing
    `sheldon_cooper_mode=True` into the constructor, in which case two more moves
    are allowed: Spock and Lizard. Spock is poisoned by Lizard, disproven by Paper, but
    crushes Rock and smashes Scissors. Lizard poisons Spock and eats Paper, but is
    decapitated by Scissors and crushed by Rock.

    # __sphinx_doc_2_begin__
    Both players always move simultaneously over a course of 10 timesteps in total.
    The winner of each timestep receives reward of +1, the losing player -1.0.

    The observation of each player is the last opponent action.
    """

    ROCK = 0
    PAPER = 1
    SCISSORS = 2
    LIZARD = 3
    SPOCK = 4

    # (my_move, opponent_move) -> (my_reward, opponent_reward).
    WIN_MATRIX = {
        (ROCK, ROCK): (0, 0),
        (ROCK, PAPER): (-1, 1),
        (ROCK, SCISSORS): (1, -1),
        (PAPER, ROCK): (1, -1),
        (PAPER, PAPER): (0, 0),
        (PAPER, SCISSORS): (-1, 1),
        (SCISSORS, ROCK): (-1, 1),
        (SCISSORS, PAPER): (1, -1),
        (SCISSORS, SCISSORS): (0, 0),
    }
    # __sphinx_doc_2_end__

    WIN_MATRIX.update(
        {
            # Sheldon Cooper mode:
            (LIZARD, LIZARD): (0, 0),
            (LIZARD, SPOCK): (1, -1),  # Lizard poisons Spock
            (LIZARD, ROCK): (-1, 1),  # Rock crushes lizard
            (LIZARD, PAPER): (1, -1),  # Lizard eats paper
            (LIZARD, SCISSORS): (-1, 1),  # Scissors decapitate lizard
            (ROCK, LIZARD): (1, -1),  # Rock crushes lizard
            (PAPER, LIZARD): (-1, 1),  # Lizard eats paper
            (SCISSORS, LIZARD): (1, -1),  # Scissors decapitate lizard
            (SPOCK, SPOCK): (0, 0),
            (SPOCK, LIZARD): (-1, 1),  # Lizard poisons Spock
            (SPOCK, ROCK): (1, -1),  # Spock vaporizes rock
            (SPOCK, PAPER): (-1, 1),  # Paper disproves Spock
            (SPOCK, SCISSORS): (1, -1),  # Spock smashes scissors
            (ROCK, SPOCK): (-1, 1),  # Spock vaporizes rock
            (PAPER, SPOCK): (1, -1),  # Paper disproves Spock
            (SCISSORS, SPOCK): (-1, 1),  # Spock smashes scissors
        }
    )

    # __sphinx_doc_3_begin__
    def __init__(self, config=None):
        super().__init__()
        # Fixed: the default `config=None` used to crash on `.get()` below.
        config = config or {}

        self.agents = self.possible_agents = ["player1", "player2"]

        # The observations are always the last taken actions. Hence observation- and
        # action spaces are identical.
        self.observation_spaces = self.action_spaces = {
            "player1": gym.spaces.Discrete(3),
            "player2": gym.spaces.Discrete(3),
        }
        self.last_move = None
        self.num_moves = 0
        # __sphinx_doc_3_end__

        self.sheldon_cooper_mode = False
        if config.get("sheldon_cooper_mode"):
            self.sheldon_cooper_mode = True
            # Five moves (incl. Lizard and Spock) instead of three.
            self.action_spaces = self.observation_spaces = {
                "player1": gym.spaces.Discrete(5),
                "player2": gym.spaces.Discrete(5),
            }

    # __sphinx_doc_4_begin__
    def reset(self, *, seed=None, options=None):
        self.num_moves = 0

        # The first observation should not matter (none of the agents has moved yet).
        # Set them to 0.
        return {
            "player1": 0,
            "player2": 0,
        }, {}  # <- empty infos dict

    # __sphinx_doc_4_end__

    # __sphinx_doc_5_begin__
    def step(self, action_dict):
        self.num_moves += 1

        move1 = action_dict["player1"]
        move2 = action_dict["player2"]

        # Set the next observations (simply use the other player's action).
        # Note that because we are publishing both players in the observations dict,
        # we expect both players to act in the next `step()` (simultaneous stepping).
        observations = {"player1": move2, "player2": move1}

        # Compute rewards for each player based on the win-matrix.
        r1, r2 = self.WIN_MATRIX[move1, move2]
        rewards = {"player1": r1, "player2": r2}

        # Terminate the entire episode (for all agents) once 10 moves have been made.
        terminateds = {"__all__": self.num_moves >= 10}

        # Leave truncateds and infos empty.
        return observations, rewards, terminateds, {}, {}


# __sphinx_doc_5_end__
# __sphinx_doc_1_begin__
import gymnasium as gym
import numpy as np

from ray.rllib.env.multi_agent_env import MultiAgentEnv


class TicTacToe(MultiAgentEnv):
    """A two-player game in which any player tries to complete one row in a 3x3 field.

    The observation space is Box(-1.0, 1.0, (9,)), where each index represents
    a distinct field on a 3x3 board and values of 0.0 mean the field is empty,
    -1.0 means the opponent owns the field, and 1.0 means we occupy the field:
    ----------
    | 0| 1| 2|
    ----------
    | 3| 4| 5|
    ----------
    | 6| 7| 8|
    ----------

    The action space is Discrete(9). Placing a piece on an already occupied
    field leaves the board unchanged and costs the acting player -5.0 reward.

    Once a player completes a row, they receive +5.0 reward and the losing
    player receives -5.0. In all other cases, both players receive 0.0 reward.
    """

    # __sphinx_doc_1_end__

    # __sphinx_doc_2_begin__
    def __init__(self, config=None):
        super().__init__()

        # Define the agents in the game.
        self.agents = self.possible_agents = ["player1", "player2"]

        # Each agent observes a 9D tensor, representing the 3x3 fields of the board.
        # A 0 means an empty field, a 1 represents a piece of player 1, a -1 a piece of
        # player 2.
        self.observation_spaces = {
            "player1": gym.spaces.Box(-1.0, 1.0, (9,), np.float32),
            "player2": gym.spaces.Box(-1.0, 1.0, (9,), np.float32),
        }
        # Each player has 9 actions, encoding the 9 fields each player can place a piece
        # on during their turn.
        self.action_spaces = {
            "player1": gym.spaces.Discrete(9),
            "player2": gym.spaces.Discrete(9),
        }

        self.board = None
        self.current_player = None

    # __sphinx_doc_2_end__

    # __sphinx_doc_3_begin__
    def reset(self, *, seed=None, options=None):
        self.board = [
            0,
            0,
            0,
            0,
            0,
            0,
            0,
            0,
            0,
        ]
        # Pick a random player to start the game.
        self.current_player = np.random.choice(["player1", "player2"])
        # Return observations dict (only with the starting player, which is the one
        # we expect to act next).
        return {
            self.current_player: np.array(self.board, np.float32),
        }, {}

    # __sphinx_doc_3_end__

    # __sphinx_doc_4_begin__
    def step(self, action_dict):
        action = action_dict[self.current_player]

        # Create a rewards-dict (containing the rewards of the agent that just acted).
        rewards = {self.current_player: 0.0}
        # Create a terminateds-dict with the special `__all__` agent ID, indicating that
        # if True, the episode ends for all agents.
        terminateds = {"__all__": False}

        opponent = "player1" if self.current_player == "player2" else "player2"

        # Penalize trying to place a piece on an already occupied field.
        if self.board[action] != 0:
            rewards[self.current_player] -= 5.0
        # Change the board according to the (valid) action taken.
        else:
            self.board[action] = 1 if self.current_player == "player1" else -1

            # After having placed a new piece, figure out whether the current player
            # won or not.
            if self.current_player == "player1":
                win_val = [1, 1, 1]
            else:
                win_val = [-1, -1, -1]
            if (
                # Horizontal win.
                self.board[:3] == win_val
                or self.board[3:6] == win_val
                or self.board[6:] == win_val
                # Vertical win.
                or self.board[0:7:3] == win_val
                or self.board[1:8:3] == win_val
                or self.board[2:9:3] == win_val
                # Diagonal win.
                or self.board[::3] == win_val
                or self.board[2:7:2] == win_val
            ):
                # Final reward is +5 for victory and -5 for a loss.
                rewards[self.current_player] += 5.0
                rewards[opponent] = -5.0

                # Episode is done and needs to be reset for a new game.
                terminateds["__all__"] = True

            # The board might also be full w/o any player having won/lost.
            # In this case, we simply end the episode and none of the players receives
            # +5 or -5 reward.
            elif 0 not in self.board:
                terminateds["__all__"] = True

        # Flip players and return an observations dict with only the next player to
        # make a move in it.
        self.current_player = opponent

        return (
            {self.current_player: np.array(self.board, np.float32)},
            rewards,
            terminateds,
            {},
            {},
        )


# __sphinx_doc_4_end__
from gymnasium.spaces import Dict, Discrete, MultiDiscrete, Tuple
import numpy as np

from ray.rllib.env.multi_agent_env import MultiAgentEnv, ENV_STATE


class TwoStepGame(MultiAgentEnv):
    """Two-agent, three-state cooperative matrix game (QMIX paper).

    State 0: agent 1's action selects the payoff matrix (state 1 or 2).
    State 1: both agents receive a shared reward of 7.
    State 2: shared reward of 8 if both agents pick action 1, 0 if both pick
    action 0, else 1. The shared reward is split evenly between the agents.
    """

    action_space = Discrete(2)

    def __init__(self, env_config):
        super().__init__()
        self.action_space = Discrete(2)
        self.state = None
        self.agent_1 = 0
        self.agent_2 = 1
        # MADDPG emits action logits instead of actual discrete actions.
        self.actions_are_logits = env_config.get("actions_are_logits", False)
        self.one_hot_state_encoding = env_config.get("one_hot_state_encoding", False)
        self.with_state = env_config.get("separate_state_space", False)
        self._agent_ids = {0, 1}
        if not self.one_hot_state_encoding:
            self.observation_space = Discrete(6)
            self.with_state = False
        else:
            # Each agent gets the full state (one-hot encoding of which of the
            # three states are active) as input with the receiving agent's
            # ID (1 or 2) concatenated onto the end.
            if self.with_state:
                self.observation_space = Dict(
                    {
                        "obs": MultiDiscrete([2, 2, 2, 3]),
                        ENV_STATE: MultiDiscrete([2, 2, 2]),
                    }
                )
            else:
                self.observation_space = MultiDiscrete([2, 2, 2, 3])

    def reset(self, *, seed=None, options=None):
        if seed is not None:
            # NOTE: seeds the global numpy RNG (legacy behavior).
            np.random.seed(seed)
        self.state = np.array([1, 0, 0])
        return self._obs(), {}

    def step(self, action_dict):
        if self.actions_are_logits:
            # Sample discrete actions from the provided action distributions.
            action_dict = {
                k: np.random.choice([0, 1], p=v) for k, v in action_dict.items()
            }

        # Fixed: index to a scalar; comparing the size-1 ndarray returned by
        # `flatnonzero` against ints relied on deprecated truthiness behavior.
        state_index = np.flatnonzero(self.state)[0]
        if state_index == 0:
            action = action_dict[self.agent_1]
            assert action in [0, 1], action
            if action == 0:
                self.state = np.array([0, 1, 0])
            else:
                self.state = np.array([0, 0, 1])
            global_rew = 0
            terminated = False
        elif state_index == 1:
            global_rew = 7
            terminated = True
        else:
            if action_dict[self.agent_1] == 0 and action_dict[self.agent_2] == 0:
                global_rew = 0
            elif action_dict[self.agent_1] == 1 and action_dict[self.agent_2] == 1:
                global_rew = 8
            else:
                global_rew = 1
            terminated = True

        # Split the shared reward evenly between the two agents.
        rewards = {self.agent_1: global_rew / 2.0, self.agent_2: global_rew / 2.0}
        obs = self._obs()
        terminateds = {"__all__": terminated}
        truncateds = {"__all__": False}
        infos = {
            self.agent_1: {"done": terminateds["__all__"]},
            self.agent_2: {"done": terminateds["__all__"]},
        }
        return obs, rewards, terminateds, truncateds, infos

    def _obs(self):
        if self.with_state:
            return {
                self.agent_1: {"obs": self.agent_1_obs(), ENV_STATE: self.state},
                self.agent_2: {"obs": self.agent_2_obs(), ENV_STATE: self.state},
            }
        else:
            return {self.agent_1: self.agent_1_obs(), self.agent_2: self.agent_2_obs()}

    def agent_1_obs(self):
        if self.one_hot_state_encoding:
            return np.concatenate([self.state, [1]])
        else:
            return np.flatnonzero(self.state)[0]

    def agent_2_obs(self):
        if self.one_hot_state_encoding:
            return np.concatenate([self.state, [2]])
        else:
            # Agent 2's discrete obs is offset by 3 from agent 1's.
            return np.flatnonzero(self.state)[0] + 3


class TwoStepGameWithGroupedAgents(MultiAgentEnv):
    """TwoStepGame with both agents grouped into a single 'agents' unit."""

    def __init__(self, env_config):
        super().__init__()
        env = TwoStepGame(env_config)
        tuple_obs_space = Tuple([env.observation_space, env.observation_space])
        tuple_act_space = Tuple([env.action_space, env.action_space])
        self._agent_ids = {"agents"}
        # Group both agents under one ID with tuple obs/action spaces.
        self.env = env.with_agent_groups(
            groups={"agents": [0, 1]},
            obs_space=tuple_obs_space,
            act_space=tuple_act_space,
        )
        self.observation_space = Dict({"agents": self.env.observation_space})
        self.action_space = Dict({"agents": self.env.action_space})

    def reset(self, *, seed=None, options=None):
        return self.env.reset(seed=seed, options=options)

    def step(self, actions):
        return self.env.step(actions)
import gymnasium as gym
from gymnasium.spaces import Box, Dict, Discrete, Tuple
import numpy as np
import tree  # pip install dm_tree

from ray.rllib.utils.spaces.space_utils import flatten_space


class NestedSpaceRepeatAfterMeEnv(gym.Env):
    """Env in which the policy must repeat the (possibly complex) observation.

    Action- and observation space are always identical and may be arbitrarily
    nested Dict/Tuple spaces. Rewards are given for exactly matching Discrete
    sub-actions and for being as close as possible on Box sub-actions.
    """

    def __init__(self, config=None):
        cfg = config or {}
        self.observation_space = cfg.get(
            "space", Tuple([Discrete(2), Dict({"a": Box(-1.0, 1.0, (2,))})])
        )
        # The agent has to echo the observation, so both spaces coincide.
        self.action_space = self.observation_space
        self.flattened_action_space = flatten_space(self.action_space)
        self.episode_len = cfg.get("episode_len", 100)

    def reset(self, *, seed=None, options=None):
        self.steps = 0
        return self._next_obs(), {}

    def step(self, action):
        self.steps += 1
        flat_action = tree.flatten(action)
        reward = 0.0
        for act, target, subspace in zip(
            flat_action, self.current_obs_flattened, self.flattened_action_space
        ):
            # Box: negative absolute distance to the target observation.
            if isinstance(subspace, gym.spaces.Box):
                reward -= np.sum(np.abs(act - target))
            # Discrete: +1.0 only on an exact match.
            if isinstance(subspace, gym.spaces.Discrete):
                reward += 1.0 if act == target else 0.0
        truncated = self.steps >= self.episode_len
        return self._next_obs(), reward, False, truncated, {}

    def _next_obs(self):
        # Sample a fresh observation and cache its flattened form for the
        # reward computation in `step()`.
        self.current_obs = self.observation_space.sample()
        self.current_obs_flattened = tree.flatten(self.current_obs)
        return self.current_obs
+ + In this env there are only ever two valid actions, but we pretend there are + actually up to `max_avail_actions` actions that can be taken, and the two + valid actions are randomly hidden among this set. + + At each step, we emit a dict of: + - the actual cart observation + - a mask of valid actions (e.g., [0, 0, 1, 0, 0, 1] for 6 max avail) + - the list of action embeddings (w/ zeroes for invalid actions) (e.g., + [[0, 0], + [0, 0], + [-0.2322, -0.2569], + [0, 0], + [0, 0], + [0.7878, 1.2297]] for max_avail_actions=6) + + In a real environment, the actions embeddings would be larger than two + units of course, and also there would be a variable number of valid actions + per step instead of always [LEFT, RIGHT]. + """ + + def __init__(self, max_avail_actions): + # Use simple random 2-unit action embeddings for [LEFT, RIGHT] + self.left_action_embed = np.random.randn(2) + self.right_action_embed = np.random.randn(2) + self.action_space = Discrete(max_avail_actions) + self.wrapped = gym.make("CartPole-v1") + self.observation_space = Dict( + { + "action_mask": Box(0, 1, shape=(max_avail_actions,), dtype=np.int8), + "avail_actions": Box(-10, 10, shape=(max_avail_actions, 2)), + "cart": self.wrapped.observation_space, + } + ) + + def update_avail_actions(self): + self.action_assignments = np.array( + [[0.0, 0.0]] * self.action_space.n, dtype=np.float32 + ) + self.action_mask = np.array([0.0] * self.action_space.n, dtype=np.int8) + self.left_idx, self.right_idx = random.sample(range(self.action_space.n), 2) + self.action_assignments[self.left_idx] = self.left_action_embed + self.action_assignments[self.right_idx] = self.right_action_embed + self.action_mask[self.left_idx] = 1 + self.action_mask[self.right_idx] = 1 + + def reset(self, *, seed=None, options=None): + self.update_avail_actions() + obs, infos = self.wrapped.reset() + return { + "action_mask": self.action_mask, + "avail_actions": self.action_assignments, + "cart": obs, + }, infos + + def step(self, 
action): + if action == self.left_idx: + actual_action = 0 + elif action == self.right_idx: + actual_action = 1 + else: + raise ValueError( + "Chosen action was not one of the non-zero action embeddings", + action, + self.action_assignments, + self.action_mask, + self.left_idx, + self.right_idx, + ) + orig_obs, rew, done, truncated, info = self.wrapped.step(actual_action) + self.update_avail_actions() + self.action_mask = self.action_mask.astype(np.int8) + obs = { + "action_mask": self.action_mask, + "avail_actions": self.action_assignments, + "cart": orig_obs, + } + return obs, rew, done, truncated, info + + +class ParametricActionsCartPoleNoEmbeddings(gym.Env): + """Same as the above ParametricActionsCartPole. + + However, action embeddings are not published inside observations, + but will be learnt by the model. + + At each step, we emit a dict of: + - the actual cart observation + - a mask of valid actions (e.g., [0, 0, 1, 0, 0, 1] for 6 max avail) + - action embeddings (w/ "dummy embedding" for invalid actions) are + outsourced in the model and will be learned. + """ + + def __init__(self, max_avail_actions): + # Randomly set which two actions are valid and available. 
+ self.left_idx, self.right_idx = random.sample(range(max_avail_actions), 2) + self.valid_avail_actions_mask = np.array( + [0.0] * max_avail_actions, dtype=np.int8 + ) + self.valid_avail_actions_mask[self.left_idx] = 1 + self.valid_avail_actions_mask[self.right_idx] = 1 + self.action_space = Discrete(max_avail_actions) + self.wrapped = gym.make("CartPole-v1") + self.observation_space = Dict( + { + "valid_avail_actions_mask": Box(0, 1, shape=(max_avail_actions,)), + "cart": self.wrapped.observation_space, + } + ) + + def reset(self, *, seed=None, options=None): + obs, infos = self.wrapped.reset() + return { + "valid_avail_actions_mask": self.valid_avail_actions_mask, + "cart": obs, + }, infos + + def step(self, action): + if action == self.left_idx: + actual_action = 0 + elif action == self.right_idx: + actual_action = 1 + else: + raise ValueError( + "Chosen action was not one of the non-zero action embeddings", + action, + self.valid_avail_actions_mask, + self.left_idx, + self.right_idx, + ) + orig_obs, rew, done, truncated, info = self.wrapped.step(actual_action) + obs = { + "valid_avail_actions_mask": self.valid_avail_actions_mask, + "cart": orig_obs, + } + return obs, rew, done, truncated, info diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/random_env.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/random_env.py new file mode 100644 index 0000000000000000000000000000000000000000..5f413a597c9abc17180aa5900bd49f5077f45d93 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/random_env.py @@ -0,0 +1,125 @@ +import copy +import gymnasium as gym +from gymnasium.spaces import Discrete, Tuple +import numpy as np + +from ray.rllib.examples.envs.classes.multi_agent import make_multi_agent + + +class RandomEnv(gym.Env): + """A randomly acting environment. + + Can be instantiated with arbitrary action-, observation-, and reward + spaces. 
Observations and rewards are generated by simply sampling from the + observation/reward spaces. The probability of a `terminated=True` after each + action can be configured, as well as the max episode length. + """ + + def __init__(self, config=None): + config = config or {} + + # Action space. + self.action_space = config.get("action_space", Discrete(2)) + # Observation space from which to sample. + self.observation_space = config.get("observation_space", Discrete(2)) + # Reward space from which to sample. + self.reward_space = config.get( + "reward_space", + gym.spaces.Box(low=-1.0, high=1.0, shape=(), dtype=np.float32), + ) + self.static_samples = config.get("static_samples", False) + if self.static_samples: + self.observation_sample = self.observation_space.sample() + self.reward_sample = self.reward_space.sample() + + # Chance that an episode ends at any step. + # Note that a max episode length can be specified via + # `max_episode_len`. + self.p_terminated = config.get("p_terminated") + if self.p_terminated is None: + self.p_terminated = config.get("p_done", 0.1) + # A max episode length. Even if the `p_terminated` sampling does not lead + # to a terminus, the episode will end after at most this many + # timesteps. + # Set to 0 or None for using no limit on the episode length. + self.max_episode_len = config.get("max_episode_len", None) + # Whether to check action bounds. + self.check_action_bounds = config.get("check_action_bounds", False) + # Steps taken so far (after last reset). 
+ self.steps = 0 + + def reset(self, *, seed=None, options=None): + self.steps = 0 + if not self.static_samples: + return self.observation_space.sample(), {} + else: + return copy.deepcopy(self.observation_sample), {} + + def step(self, action): + if self.check_action_bounds and not self.action_space.contains(action): + raise ValueError( + "Illegal action for {}: {}".format(self.action_space, action) + ) + if isinstance(self.action_space, Tuple) and len(action) != len( + self.action_space.spaces + ): + raise ValueError( + "Illegal action for {}: {}".format(self.action_space, action) + ) + + self.steps += 1 + terminated = False + truncated = False + # We are `truncated` as per our max-episode-len. + if self.max_episode_len and self.steps >= self.max_episode_len: + truncated = True + # Max episode length not reached yet -> Sample `terminated` via `p_terminated`. + elif self.p_terminated > 0.0: + terminated = bool( + np.random.choice( + [True, False], p=[self.p_terminated, 1.0 - self.p_terminated] + ) + ) + + if not self.static_samples: + return ( + self.observation_space.sample(), + self.reward_space.sample(), + terminated, + truncated, + {}, + ) + else: + return ( + copy.deepcopy(self.observation_sample), + copy.deepcopy(self.reward_sample), + terminated, + truncated, + {}, + ) + + +# Multi-agent version of the RandomEnv. +RandomMultiAgentEnv = make_multi_agent(lambda c: RandomEnv(c)) + + +# Large observation space "pre-compiled" random env (for testing). +class RandomLargeObsSpaceEnv(RandomEnv): + def __init__(self, config=None): + config = config or {} + config.update({"observation_space": gym.spaces.Box(-1.0, 1.0, (5000,))}) + super().__init__(config=config) + + +# Large observation space + cont. actions "pre-compiled" random env +# (for testing). 
+class RandomLargeObsSpaceEnvContActions(RandomEnv): + def __init__(self, config=None): + config = config or {} + config.update( + { + "observation_space": gym.spaces.Box(-1.0, 1.0, (5000,)), + "action_space": gym.spaces.Box(-1.0, 1.0, (5,)), + } + ) + super().__init__(config=config) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/recommender_system_envs_with_recsim.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/recommender_system_envs_with_recsim.py new file mode 100644 index 0000000000000000000000000000000000000000..f2f7a28e4b39ca6587dffdd89a065ce9226187ea --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/recommender_system_envs_with_recsim.py @@ -0,0 +1,108 @@ +"""Examples for RecSim envs ready to be used by RLlib Algorithms. + +RecSim is a configurable recommender systems simulation platform. +Source: https://github.com/google-research/recsim +""" + +from recsim import choice_model +from recsim.environments import ( + long_term_satisfaction as lts, + interest_evolution as iev, + interest_exploration as iex, +) + +from ray.rllib.env.wrappers.recsim import make_recsim_env +from ray.tune import register_env + +# Some built-in RecSim envs to test with. +# --------------------------------------- + +# Long-term satisfaction env: User has to pick from items that are either +# a) unhealthy, but taste good, or b) healthy, but have bad taste. +# Best strategy is to pick a mix of both to ensure long-term +# engagement. 
+ + +def lts_user_model_creator(env_ctx): + return lts.LTSUserModel( + env_ctx["slate_size"], + user_state_ctor=lts.LTSUserState, + response_model_ctor=lts.LTSResponse, + ) + + +def lts_document_sampler_creator(env_ctx): + return lts.LTSDocumentSampler() + + +LongTermSatisfactionRecSimEnv = make_recsim_env( + recsim_user_model_creator=lts_user_model_creator, + recsim_document_sampler_creator=lts_document_sampler_creator, + reward_aggregator=lts.clicked_engagement_reward, +) + + +# Interest exploration env: Models the problem of active exploration +# of user interests. It is meant to illustrate popularity bias in +# recommender systems, where myopic maximization of engagement leads +# to bias towards documents that have wider appeal, +# whereas niche user interests remain unexplored. +def iex_user_model_creator(env_ctx): + return iex.IEUserModel( + env_ctx["slate_size"], + user_state_ctor=iex.IEUserState, + response_model_ctor=iex.IEResponse, + seed=env_ctx["seed"], + ) + + +def iex_document_sampler_creator(env_ctx): + return iex.IETopicDocumentSampler(seed=env_ctx["seed"]) + + +InterestExplorationRecSimEnv = make_recsim_env( + recsim_user_model_creator=iex_user_model_creator, + recsim_document_sampler_creator=iex_document_sampler_creator, + reward_aggregator=iex.total_clicks_reward, +) + + +# Interest evolution env: See https://github.com/google-research/recsim +# for more information. +def iev_user_model_creator(env_ctx): + return iev.IEvUserModel( + env_ctx["slate_size"], + choice_model_ctor=choice_model.MultinomialProportionalChoiceModel, + response_model_ctor=iev.IEvResponse, + user_state_ctor=iev.IEvUserState, + seed=env_ctx["seed"], + ) + + +# Extend IEvVideo to fix a bug caused by None cluster_ids. +class SingleClusterIEvVideo(iev.IEvVideo): + def __init__(self, doc_id, features, video_length=None, quality=None): + super(SingleClusterIEvVideo, self).__init__( + doc_id=doc_id, + features=features, + cluster_id=0, # single cluster. 
+ video_length=video_length, + quality=quality, + ) + + +def iev_document_sampler_creator(env_ctx): + return iev.UtilityModelVideoSampler(doc_ctor=iev.IEvVideo, seed=env_ctx["seed"]) + + +InterestEvolutionRecSimEnv = make_recsim_env( + recsim_user_model_creator=iev_user_model_creator, + recsim_document_sampler_creator=iev_document_sampler_creator, + reward_aggregator=iev.clicked_watchtime_reward, +) + + +# Backward compatibility. +register_env( + name="RecSim-v1", env_creator=lambda env_ctx: InterestEvolutionRecSimEnv(env_ctx) +) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/repeat_after_me_env.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/repeat_after_me_env.py new file mode 100644 index 0000000000000000000000000000000000000000..0a87f60ac6c55c6783205e3b9e9ce7392b5f9396 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/repeat_after_me_env.py @@ -0,0 +1,47 @@ +import gymnasium as gym +from gymnasium.spaces import Box, Discrete +import numpy as np + + +class RepeatAfterMeEnv(gym.Env): + """Env in which the observation at timestep minus n must be repeated.""" + + def __init__(self, config=None): + config = config or {} + if config.get("continuous"): + self.observation_space = Box(-1.0, 1.0, (2,)) + else: + self.observation_space = Discrete(2) + + self.action_space = self.observation_space + # Note: Set `repeat_delay` to 0 for simply repeating the seen + # observation (no delay). + self.delay = config.get("repeat_delay", 1) + self.episode_len = config.get("episode_len", 100) + self.history = [] + + def reset(self, *, seed=None, options=None): + self.history = [0] * self.delay + return self._next_obs(), {} + + def step(self, action): + obs = self.history[-(1 + self.delay)] + + reward = 0.0 + # Box: -abs(diff). + if isinstance(self.action_space, Box): + reward = -np.sum(np.abs(action - obs)) + # Discrete: +1.0 if exact match, -1.0 otherwise. 
+ if isinstance(self.action_space, Discrete): + reward = 1.0 if action == obs else -1.0 + + done = truncated = len(self.history) > self.episode_len + return self._next_obs(), reward, done, truncated, {} + + def _next_obs(self): + if isinstance(self.observation_space, Box): + token = np.random.random(size=(2,)) + else: + token = np.random.choice([0, 1]) + self.history.append(token) + return token diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/repeat_initial_obs_env.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/repeat_initial_obs_env.py new file mode 100644 index 0000000000000000000000000000000000000000..d1d43c560424caaa8fac5b433f536a72d5f4adc5 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/repeat_initial_obs_env.py @@ -0,0 +1,32 @@ +import gymnasium as gym +from gymnasium.spaces import Discrete +import random + + +class RepeatInitialObsEnv(gym.Env): + """Env in which the initial observation has to be repeated all the time. + + Runs for n steps. + r=1 if action correct, -1 otherwise (max. R=100). 
+ """ + + def __init__(self, episode_len=100): + self.observation_space = Discrete(2) + self.action_space = Discrete(2) + self.token = None + self.episode_len = episode_len + self.num_steps = 0 + + def reset(self, *, seed=None, options=None): + self.token = random.choice([0, 1]) + self.num_steps = 0 + return self.token, {} + + def step(self, action): + if action == self.token: + reward = 1 + else: + reward = -1 + self.num_steps += 1 + done = truncated = self.num_steps >= self.episode_len + return 0, reward, done, truncated, {} diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/simple_corridor.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/simple_corridor.py new file mode 100644 index 0000000000000000000000000000000000000000..9088f73dbd374da7f7d1312e6ed68c1d5c25444e --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/simple_corridor.py @@ -0,0 +1,42 @@ +import gymnasium as gym +from gymnasium.spaces import Box, Discrete +import numpy as np + + +class SimpleCorridor(gym.Env): + """Example of a custom env in which you have to walk down a corridor. + + You can configure the length of the corridor via the env config.""" + + def __init__(self, config=None): + config = config or {} + + self.action_space = Discrete(2) + self.observation_space = Box(0.0, 999.0, shape=(1,), dtype=np.float32) + + self.set_corridor_length(config.get("corridor_length", 10)) + + self._cur_pos = 0 + + def set_corridor_length(self, length): + self.end_pos = length + print(f"Set corridor length to {self.end_pos}") + assert self.end_pos <= 999, "The maximum `corridor_length` allowed is 999!" 
+ + def reset(self, *, seed=None, options=None): + self._cur_pos = 0.0 + return self._get_obs(), {} + + def step(self, action): + assert action in [0, 1], action + if action == 0 and self._cur_pos > 0: + self._cur_pos -= 1.0 + elif action == 1: + self._cur_pos += 1.0 + terminated = self._cur_pos >= self.end_pos + truncated = False + reward = 1.0 if terminated else -0.01 + return self._get_obs(), reward, terminated, truncated, {} + + def _get_obs(self): + return np.array([self._cur_pos], np.float32) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/simple_rpg.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/simple_rpg.py new file mode 100644 index 0000000000000000000000000000000000000000..7de7390bd96dd72e00c3871885ecc600a359e0b5 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/simple_rpg.py @@ -0,0 +1,49 @@ +import gymnasium as gym +from gymnasium.spaces import Discrete, Box, Dict + +from ray.rllib.utils.spaces.repeated import Repeated + +# Constraints on the Repeated space. +MAX_PLAYERS = 4 +MAX_ITEMS = 7 +MAX_EFFECTS = 2 + + +class SimpleRPG(gym.Env): + """Example of a custom env with a complex, structured observation. + + The observation is a list of players, each of which is a Dict of + attributes, and may further hold a list of items (categorical space). + + Note that the env doesn't train, it's just a dummy example to show how to + use spaces.Repeated in a custom model (see CustomRPGModel below). + """ + + def __init__(self, config): + self.cur_pos = 0 + self.action_space = Discrete(4) + + # Represents an item. + self.item_space = Discrete(5) + + # Represents an effect on the player. + self.effect_space = Box(9000, 9999, shape=(4,)) + + # Represents a player. 
+ self.player_space = Dict( + { + "location": Box(-100, 100, shape=(2,)), + "status": Box(-1, 1, shape=(10,)), + "items": Repeated(self.item_space, max_len=MAX_ITEMS), + "effects": Repeated(self.effect_space, max_len=MAX_EFFECTS), + } + ) + + # Observation is a list of players. + self.observation_space = Repeated(self.player_space, max_len=MAX_PLAYERS) + + def reset(self, *, seed=None, options=None): + return self.observation_space.sample(), {} + + def step(self, action): + return self.observation_space.sample(), 1, True, False, {} diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/six_room_env.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/six_room_env.py new file mode 100644 index 0000000000000000000000000000000000000000..2a4b1a2a41d51084e44e6b6e267ddb309bc1c22d --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/six_room_env.py @@ -0,0 +1,315 @@ +import gymnasium as gym + +from ray.rllib.env.multi_agent_env import MultiAgentEnv + + +# Map representation: Always six rooms (as the name suggests) with doors in between. +MAPS = { + "small": [ + "WWWWWWWWWWWWW", + "W W W W", + "W W W", + "W W W", + "W WWWW WWWW W", + "W W W W", + "W W W", + "W W GW", + "WWWWWWWWWWWWW", + ], + "medium": [ + "WWWWWWWWWWWWWWWWWWW", + "W W W W", + "W W W", + "W W W", + "W WWWWWWW WWWWWWW W", + "W W W W", + "W W W", + "W W GW", + "WWWWWWWWWWWWWWWWWWW", + ], + "large": [ + "WWWWWWWWWWWWWWWWWWWWWWWWW", + "W W W W", + "W W W W", + "W W W", + "W W W", + "W W W W", + "WW WWWWWWWWW WWWWWWWWWW W", + "W W W W", + "W W W", + "W W W W", + "W W W", + "W W W GW", + "WWWWWWWWWWWWWWWWWWWWWWWWW", + ], +} + + +class SixRoomEnv(gym.Env): + """A grid-world with six rooms (arranged as 2x3), which are connected by doors. + + The agent starts in the upper left room and has to reach a designated goal state + in one of the rooms using primitive actions up, left, down, and right. 
+ + The agent receives a small penalty of -0.01 on each step and a reward of +10.0 when + reaching the goal state. + """ + + def __init__(self, config=None): + super().__init__() + + # User can provide a custom map or a recognized map name (small, medium, large). + self.map = config.get("custom_map", MAPS.get(config.get("map"), MAPS["small"])) + self.time_limit = config.get("time_limit", 50) + + # Define observation space: Discrete, index fields. + self.observation_space = gym.spaces.Discrete(len(self.map) * len(self.map[0])) + # Primitive actions: up, down, left, right. + self.action_space = gym.spaces.Discrete(4) + + # Initialize environment state. + self.reset() + + def reset(self, *, seed=None, options=None): + self._agent_pos = (1, 1) + self._ts = 0 + # Return high-level observation. + return self._agent_discrete_pos, {} + + def step(self, action): + next_pos = _get_next_pos(action, self._agent_pos) + + self._ts += 1 + + # Check if the move ends up in a wall. If so -> Ignore the move and stay + # where we are right now. + if self.map[next_pos[0]][next_pos[1]] != "W": + self._agent_pos = next_pos + + # Check if the agent has reached the global goal state. + if self.map[self._agent_pos[0]][self._agent_pos[1]] == "G": + return self._agent_discrete_pos, 10.0, True, False, {} + + # Small step penalty. + return self._agent_discrete_pos, -0.01, False, self._ts >= self.time_limit, {} + + @property + def _agent_discrete_pos(self): + x = self._agent_pos[0] + y = self._agent_pos[1] + # discrete position = row idx * columns + col idx + return x * len(self.map[0]) + y + + +class HierarchicalSixRoomEnv(MultiAgentEnv): + def __init__(self, config=None): + super().__init__() + + # User can provide a custom map or a recognized map name (small, medium, large). 
+ self.map = config.get("custom_map", MAPS.get(config.get("map"), MAPS["small"])) + self.max_steps_low_level = config.get("max_steps_low_level", 15) + self.time_limit = config.get("time_limit", 50) + self.num_low_level_agents = config.get("num_low_level_agents", 3) + + self.agents = self.possible_agents = ["high_level_agent"] + [ + f"low_level_agent_{i}" for i in range(self.num_low_level_agents) + ] + + # Define basic observation space: Discrete, index fields. + observation_space = gym.spaces.Discrete(len(self.map) * len(self.map[0])) + # Low level agents always see where they are right now and what the target + # state should be. + low_level_observation_space = gym.spaces.Tuple( + (observation_space, observation_space) + ) + # Primitive actions: up, down, left, right. + low_level_action_space = gym.spaces.Discrete(4) + + self.observation_spaces = {"high_level_agent": observation_space} + self.observation_spaces.update( + { + f"low_level_agent_{i}": low_level_observation_space + for i in range(self.num_low_level_agents) + } + ) + self.action_spaces = { + "high_level_agent": gym.spaces.Tuple( + ( + # The new target observation. + observation_space, + # Low-level policy that should get us to the new target observation. + gym.spaces.Discrete(self.num_low_level_agents), + ) + ) + } + self.action_spaces.update( + { + f"low_level_agent_{i}": low_level_action_space + for i in range(self.num_low_level_agents) + } + ) + + # Initialize environment state. + self.reset() + + def reset(self, *, seed=None, options=None): + self._agent_pos = (1, 1) + self._low_level_steps = 0 + self._high_level_action = None + # Number of times the low-level agent reached the given target (by the high + # level agent). + self._num_targets_reached = 0 + + self._ts = 0 + + # Return high-level observation. 
+ return { + "high_level_agent": self._agent_discrete_pos, + }, {} + + def step(self, action_dict): + self._ts += 1 + + terminateds = {"__all__": self._ts >= self.time_limit} + truncateds = {"__all__": False} + + # High-level agent acted: Set next goal and next low-level policy to use. + # Note that the agent does not move in this case and stays at its current + # location. + if "high_level_agent" in action_dict: + self._high_level_action = action_dict["high_level_agent"] + low_level_agent = f"low_level_agent_{self._high_level_action[1]}" + self._low_level_steps = 0 + # Return next low-level observation for the now-active agent. + # We want this agent to act next. + return ( + { + low_level_agent: ( + self._agent_discrete_pos, # current + self._high_level_action[0], # target + ) + }, + # Penalty for a target state that's close to the current state. + { + "high_level_agent": ( + self.eucl_dist( + self._agent_discrete_pos, + self._high_level_action[0], + self.map, + ) + / (len(self.map) ** 2 + len(self.map[0]) ** 2) ** 0.5 + ) + - 1.0, + }, + terminateds, + truncateds, + {}, + ) + # Low-level agent made a move (primitive action). + else: + assert len(action_dict) == 1 + + # Increment low-level step counter. + self._low_level_steps += 1 + + target_discrete_pos, low_level_agent = self._high_level_action + low_level_agent = f"low_level_agent_{low_level_agent}" + next_pos = _get_next_pos(action_dict[low_level_agent], self._agent_pos) + + # Check if the move ends up in a wall. If so -> Ignore the move and stay + # where we are right now. + if self.map[next_pos[0]][next_pos[1]] != "W": + self._agent_pos = next_pos + + # Check if the agent has reached the global goal state. + if self.map[self._agent_pos[0]][self._agent_pos[1]] == "G": + rewards = { + "high_level_agent": 10.0, + # +1.0 if the goal position was also the target position for the + # low level agent. 
+ low_level_agent: float( + self._agent_discrete_pos == target_discrete_pos + ), + } + terminateds["__all__"] = True + return ( + {"high_level_agent": self._agent_discrete_pos}, + rewards, + terminateds, + truncateds, + {}, + ) + + # Low-level agent has reached its target location (given by the high-level): + # - Hand back control to high-level agent. + # - Reward low level agent and high-level agent with small rewards. + elif self._agent_discrete_pos == target_discrete_pos: + self._num_targets_reached += 1 + rewards = { + "high_level_agent": 1.0, + low_level_agent: 1.0, + } + return ( + {"high_level_agent": self._agent_discrete_pos}, + rewards, + terminateds, + truncateds, + {}, + ) + + # Low-level agent has not reached anything. + else: + # Small step penalty for low-level agent. + rewards = {low_level_agent: -0.01} + # Reached time budget -> Hand back control to high level agent. + if self._low_level_steps >= self.max_steps_low_level: + rewards["high_level_agent"] = -0.01 + return ( + {"high_level_agent": self._agent_discrete_pos}, + rewards, + terminateds, + truncateds, + {}, + ) + else: + return ( + { + low_level_agent: ( + self._agent_discrete_pos, # current + target_discrete_pos, # target + ), + }, + rewards, + terminateds, + truncateds, + {}, + ) + + @property + def _agent_discrete_pos(self): + x = self._agent_pos[0] + y = self._agent_pos[1] + # discrete position = row idx * columns + col idx + return x * len(self.map[0]) + y + + @staticmethod + def eucl_dist(pos1, pos2, map): + x1, y1 = pos1 % len(map[0]), pos1 // len(map) + x2, y2 = pos2 % len(map[0]), pos2 // len(map) + return ((x1 - x2) ** 2 + (y1 - y2) ** 2) ** 0.5 + + +def _get_next_pos(action, pos): + x, y = pos + # Up. + if action == 0: + return x - 1, y + # Down. + elif action == 1: + return x + 1, y + # Left. + elif action == 2: + return x, y - 1 + # Right. 
+ else: + return x, y + 1 diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/stateless_cartpole.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/stateless_cartpole.py new file mode 100644 index 0000000000000000000000000000000000000000..cacc95bd7057120324e3c5460884db7028b6306c --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/stateless_cartpole.py @@ -0,0 +1,39 @@ +from gymnasium.spaces import Box +import numpy as np + +from gymnasium.envs.classic_control import CartPoleEnv + + +class StatelessCartPole(CartPoleEnv): + """Partially observable variant of the CartPole gym environment. + + https://github.com/openai/gym/blob/master/gym/envs/classic_control/ + cartpole.py + + We delete the x- and angular velocity components of the state, so that it + can only be solved by a memory enhanced model (policy). + """ + + def __init__(self, config=None): + super().__init__() + + # Fix our observation-space (remove 2 velocity components). 
+ high = np.array( + [ + self.x_threshold * 2, + self.theta_threshold_radians * 2, + ], + dtype=np.float32, + ) + + self.observation_space = Box(low=-high, high=high, dtype=np.float32) + + def step(self, action): + next_obs, reward, done, truncated, info = super().step(action) + # next_obs is [x-pos, x-veloc, angle, angle-veloc] + return np.array([next_obs[0], next_obs[2]]), reward, done, truncated, info + + def reset(self, *, seed=None, options=None): + init_obs, init_info = super().reset(seed=seed, options=options) + # init_obs is [x-pos, x-veloc, angle, angle-veloc] + return np.array([init_obs[0], init_obs[2]]), init_info diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/stateless_pendulum.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/stateless_pendulum.py new file mode 100644 index 0000000000000000000000000000000000000000..36c6018229a5b510baeb9ea11bf0f68b1a7946ac --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/stateless_pendulum.py @@ -0,0 +1,35 @@ +from gymnasium.spaces import Box +import numpy as np + +from gymnasium.envs.classic_control import PendulumEnv + + +class StatelessPendulum(PendulumEnv): + """Partially observable variant of the Pendulum gym environment. + + https://github.com/Farama-Foundation/Gymnasium/blob/main/gymnasium/envs/ + classic_control/pendulum.py + + We delete the angular velocity component of the state, so that it + can only be solved by a memory enhanced model (policy). + """ + + def __init__(self, config=None): + config = config or {} + g = config.get("g", 10.0) + + super().__init__(g=g) + + # Fix our observation-space (remove angular velocity component). 
+ high = np.array([1.0, 1.0], dtype=np.float32) + self.observation_space = Box(low=-high, high=high, dtype=np.float32) + + def step(self, action): + next_obs, reward, done, truncated, info = super().step(action) + # next_obs is [cos(theta), sin(theta), theta-dot (angular velocity)] + return next_obs[:-1], reward, done, truncated, info + + def reset(self, *, seed=None, options=None): + init_obs, init_info = super().reset(seed=seed, options=options) + # init_obs is [cos(theta), sin(theta), theta-dot (angular velocity)] + return init_obs[:-1], init_info diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/transformed_action_space_env.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/transformed_action_space_env.py new file mode 100644 index 0000000000000000000000000000000000000000..1dce1051cbf30861fb196e6c8fbc0cf1522c871a --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/transformed_action_space_env.py @@ -0,0 +1,61 @@ +import gymnasium as gym +from typing import Type + + +class ActionTransform(gym.ActionWrapper): + def __init__(self, env, low, high): + super().__init__(env) + self._low = low + self._high = high + self.action_space = type(env.action_space)( + self._low, self._high, env.action_space.shape, env.action_space.dtype + ) + + def action(self, action): + return (action - self._low) / (self._high - self._low) * ( + self.env.action_space.high - self.env.action_space.low + ) + self.env.action_space.low + + +def transform_action_space(env_name_or_creator) -> Type[gym.Env]: + """Wrapper for gym.Envs to have their action space transformed. + + Args: + env_name_or_creator (Union[str, Callable[]]: String specifier or + env_maker function. + + Returns: + New transformed_action_space_env function that returns an environment + wrapped by the ActionTransform wrapper. 
The constructor takes a + config dict with `low` and `high` keys specifying the new action + range (default -1.0 to 1.0). The rest of the config dict will be + passed on to the underlying/wrapped env's constructor. + + .. testcode:: + :skipif: True + + # By gym string: + pendulum_300_to_500_cls = transform_action_space("Pendulum-v1") + # Create a transformed pendulum env. + pendulum_300_to_500 = pendulum_300_to_500_cls({"low": -15.0}) + pendulum_300_to_500.action_space + + .. testoutput:: + + gym.spaces.Box(-15.0, 1.0, (1, ), "float32") + """ + + def transformed_action_space_env(config): + if isinstance(env_name_or_creator, str): + inner_env = gym.make(env_name_or_creator) + else: + inner_env = env_name_or_creator(config) + _low = config.pop("low", -1.0) + _high = config.pop("high", 1.0) + env = ActionTransform(inner_env, _low, _high) + return env + + return transformed_action_space_env + + +TransformedActionPendulum = transform_action_space("Pendulum-v1") diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/windy_maze_env.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/windy_maze_env.py new file mode 100644 index 0000000000000000000000000000000000000000..0a86fe4f9069a50ffb338b7ad3e7779a3260fb1f --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/windy_maze_env.py @@ -0,0 +1,159 @@ +import gymnasium as gym +from gymnasium.spaces import Box, Discrete, Tuple +import logging +import random + +from ray.rllib.env import MultiAgentEnv + +logger = logging.getLogger(__name__) + +# Agent has to traverse the maze from the starting position S -> F +# Observation space [x_pos, y_pos, wind_direction] +# Action space: stay still OR move in current wind direction +MAP_DATA = """ +######### +#S # +####### # + # # + # # +####### # +#F # +#########""" + + +class WindyMazeEnv(gym.Env): + def __init__(self, env_config): + self.map = [m for m in MAP_DATA.split("\n") if m] + self.x_dim =
len(self.map) + self.y_dim = len(self.map[0]) + logger.info("Loaded map {} {}".format(self.x_dim, self.y_dim)) + for x in range(self.x_dim): + for y in range(self.y_dim): + if self.map[x][y] == "S": + self.start_pos = (x, y) + elif self.map[x][y] == "F": + self.end_pos = (x, y) + logger.info("Start pos {} end pos {}".format(self.start_pos, self.end_pos)) + self.observation_space = Tuple( + [ + Box(0, 100, shape=(2,)), # (x, y) + Discrete(4), # wind direction (N, E, S, W) + ] + ) + self.action_space = Discrete(2) # whether to move or not + + def reset(self, *, seed=None, options=None): + self.wind_direction = random.choice([0, 1, 2, 3]) + self.pos = self.start_pos + self.num_steps = 0 + return [[self.pos[0], self.pos[1]], self.wind_direction], {} + + def step(self, action): + if action == 1: + self.pos = self._get_new_pos(self.pos, self.wind_direction) + self.num_steps += 1 + self.wind_direction = random.choice([0, 1, 2, 3]) + at_goal = self.pos == self.end_pos + truncated = self.num_steps >= 200 + done = at_goal or truncated + return ( + [[self.pos[0], self.pos[1]], self.wind_direction], + 100 * int(at_goal), + done, + truncated, + {}, + ) + + def _get_new_pos(self, pos, direction): + if direction == 0: + new_pos = (pos[0] - 1, pos[1]) + elif direction == 1: + new_pos = (pos[0], pos[1] + 1) + elif direction == 2: + new_pos = (pos[0] + 1, pos[1]) + elif direction == 3: + new_pos = (pos[0], pos[1] - 1) + if ( + new_pos[0] >= 0 + and new_pos[0] < self.x_dim + and new_pos[1] >= 0 + and new_pos[1] < self.y_dim + and self.map[new_pos[0]][new_pos[1]] != "#" + ): + return new_pos + else: + return pos # did not move + + +class HierarchicalWindyMazeEnv(MultiAgentEnv): + def __init__(self, env_config): + super().__init__() + self.flat_env = WindyMazeEnv(env_config) + + def reset(self, *, seed=None, options=None): + self.cur_obs, infos = self.flat_env.reset() + self.current_goal = None + self.steps_remaining_at_level = None + self.num_high_level_steps = 0 + # current low level 
agent id. This must be unique for each high level + # step since agent ids cannot be reused. + self.low_level_agent_id = "low_level_{}".format(self.num_high_level_steps) + return { + "high_level_agent": self.cur_obs, + }, {"high_level_agent": infos} + + def step(self, action_dict): + assert len(action_dict) == 1, action_dict + if "high_level_agent" in action_dict: + return self._high_level_step(action_dict["high_level_agent"]) + else: + return self._low_level_step(list(action_dict.values())[0]) + + def _high_level_step(self, action): + logger.debug("High level agent sets goal") + self.current_goal = action + self.steps_remaining_at_level = 25 + self.num_high_level_steps += 1 + self.low_level_agent_id = "low_level_{}".format(self.num_high_level_steps) + obs = {self.low_level_agent_id: [self.cur_obs, self.current_goal]} + rew = {self.low_level_agent_id: 0} + done = truncated = {"__all__": False} + return obs, rew, done, truncated, {} + + def _low_level_step(self, action): + logger.debug("Low level agent step {}".format(action)) + self.steps_remaining_at_level -= 1 + cur_pos = tuple(self.cur_obs[0]) + goal_pos = self.flat_env._get_new_pos(cur_pos, self.current_goal) + + # Step in the actual env + f_obs, f_rew, f_terminated, f_truncated, info = self.flat_env.step(action) + new_pos = tuple(f_obs[0]) + self.cur_obs = f_obs + + # Calculate low-level agent observation and reward + obs = {self.low_level_agent_id: [f_obs, self.current_goal]} + if new_pos != cur_pos: + if new_pos == goal_pos: + rew = {self.low_level_agent_id: 1} + else: + rew = {self.low_level_agent_id: -1} + else: + rew = {self.low_level_agent_id: 0} + + # Handle env termination & transitions back to higher level. 
+ terminated = {"__all__": False} + truncated = {"__all__": False} + if f_terminated or f_truncated: + terminated["__all__"] = f_terminated + truncated["__all__"] = f_truncated + logger.debug("high level final reward {}".format(f_rew)) + rew["high_level_agent"] = f_rew + obs["high_level_agent"] = f_obs + elif self.steps_remaining_at_level == 0: + terminated[self.low_level_agent_id] = True + truncated[self.low_level_agent_id] = False + rew["high_level_agent"] = 0 + obs["high_level_agent"] = f_obs + + return obs, rew, terminated, truncated, {self.low_level_agent_id: info} diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/custom_env_render_method.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/custom_env_render_method.py new file mode 100644 index 0000000000000000000000000000000000000000..77216ea179cc77bf270016f7960ebda616faafa7 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/custom_env_render_method.py @@ -0,0 +1,200 @@ +"""Example of implementing a custom `render()` method for your gymnasium RL environment. + +This example: + - shows how to write a simple gym.Env class yourself, in this case a corridor env, + in which the agent starts at the left side of the corridor and has to reach the + goal state all the way at the right. + - in particular, the new class overrides the Env's `render()` method to show, how + you can write your own rendering logic. + - furthermore, we use the RLlib callbacks class introduced in this example here: + https://github.com/ray-project/ray/blob/master/rllib/examples/envs/env_rendering_and_recording.py # noqa + in order to compile videos of the worst and best performing episodes in each + iteration and log these videos to your WandB account, so you can view them. 
+ + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack +--wandb-key=[your WandB API key] --wandb-project=[some WandB project name] +--wandb-run-name=[optional: WandB run name within --wandb-project]` + +In order to see the actual videos, you need to have a WandB account and provide your +API key and a project name on the command line (see above). + +Use the `--num-agents` argument to set up the env as a multi-agent env. If +`--num-agents` > 0, RLlib will simply run as many of the defined single-agent +environments in parallel and with different policies to be trained for each agent. + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + + +Results to expect +----------------- +After the first training iteration, you should see the videos in your WandB account +under the provided `--wandb-project` name. Filter for "videos_best" or "videos_worst". + +Note that the default Tune TensorboardX (TBX) logger might complain about the videos +being logged. This is ok, the TBX logger will simply ignore these. The WandB logger, +however, will recognize the video tensors shaped +(1 [batch], T [video len], 3 [rgb], [height], [width]) and properly create a WandB video +object to be sent to their server. 
+ +Your terminal output should look similar to this (the following is for a +`--num-agents=2` run; expect similar results for the other `--num-agents` +settings): ++---------------------+------------+----------------+--------+------------------+ +| Trial name | status | loc | iter | total time (s) | +|---------------------+------------+----------------+--------+------------------+ +| PPO_env_fb1c0_00000 | TERMINATED | 127.0.0.1:8592 | 3 | 21.1876 | ++---------------------+------------+----------------+--------+------------------+ ++-------+-------------------+-------------+-------------+ +| ts | combined return | return p1 | return p0 | +|-------+-------------------+-------------+-------------| +| 12000 | 12.7655 | 7.3605 | 5.4095 | ++-------+-------------------+-------------+-------------+ +""" + +import gymnasium as gym +import numpy as np +from gymnasium.spaces import Box, Discrete +from PIL import Image, ImageDraw + +from ray.rllib.algorithms.ppo import PPOConfig +from ray.rllib.env.multi_agent_env import make_multi_agent +from ray.rllib.examples.envs.env_rendering_and_recording import EnvRenderCallback +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray import tune + +parser = add_rllib_example_script_args( + default_iters=10, + default_reward=9.0, + default_timesteps=10000, +) +parser.set_defaults(enable_new_api_stack=True) + + +class CustomRenderedCorridorEnv(gym.Env): + """Example of a custom env, for which we specify rendering behavior.""" + + def __init__(self, config): + self.end_pos = config.get("corridor_length", 10) + self.max_steps = config.get("max_steps", 100) + self.cur_pos = 0 + self.steps = 0 + self.action_space = Discrete(2) + self.observation_space = Box(0.0, 999.0, shape=(1,), dtype=np.float32) + + def reset(self, *, seed=None, options=None): + self.cur_pos = 0.0 + self.steps = 0 + return np.array([self.cur_pos], np.float32), {} + + def step(self, action): + self.steps 
+= 1 + assert action in [0, 1], action + if action == 0 and self.cur_pos > 0: + self.cur_pos -= 1.0 + elif action == 1: + self.cur_pos += 1.0 + truncated = self.steps >= self.max_steps + terminated = self.cur_pos >= self.end_pos + return ( + np.array([self.cur_pos], np.float32), + 10.0 if terminated else -0.1, + terminated, + truncated, + {}, + ) + + def render(self) -> np._typing.NDArray[np.uint8]: + """Implements rendering logic for this env (given the current observation). + + You should return a numpy RGB image like so: + np.array([height, width, 3], dtype=np.uint8). + + Returns: + np.ndarray: A numpy uint8 3D array (image) to render. + """ + # Image dimensions. + # Each position in the corridor is 50 pixels wide. + width = (self.end_pos + 2) * 50 + # Fixed height of the image. + height = 100 + + # Create a new image with white background + image = Image.new("RGB", (width, height), "white") + draw = ImageDraw.Draw(image) + + # Draw the corridor walls + # Grey rectangle for the corridor. + draw.rectangle([50, 30, width - 50, 70], fill="grey") + + # Draw the agent. + # Calculate the x coordinate of the agent. + agent_x = (self.cur_pos + 1) * 50 + # Blue rectangle for the agent. + draw.rectangle([agent_x + 10, 40, agent_x + 40, 60], fill="blue") + + # Draw the goal state. + # Calculate the x coordinate of the goal. + goal_x = self.end_pos * 50 + # Green rectangle for the goal state. + draw.rectangle([goal_x + 10, 40, goal_x + 40, 60], fill="green") + + # Convert the image to a uint8 numpy array. + return np.array(image, dtype=np.uint8) + + +# Create a simple multi-agent version of the above Env by duplicating the single-agent +# env n (n=num agents) times and having the agents act independently, each one in a +# different corridor. 
+MultiAgentCustomRenderedCorridorEnv = make_multi_agent( + lambda config: CustomRenderedCorridorEnv(config) +) + + +if __name__ == "__main__": + args = parser.parse_args() + + # The `config` arg passed into our Env's constructor (see the class' __init__ method + # above). Feel free to change these. + env_options = { + "corridor_length": 10, + "max_steps": 100, + "num_agents": args.num_agents, # <- only used by the multi-agent version. + } + + env_cls_to_use = ( + CustomRenderedCorridorEnv + if args.num_agents == 0 + else MultiAgentCustomRenderedCorridorEnv + ) + + tune.register_env("env", lambda _: env_cls_to_use(env_options)) + + # Example config switching on rendering. + base_config = ( + PPOConfig() + # Configure our env to be the above-registered one. + .environment("env") + # Plug in our env-rendering (and logging) callback. This callback class allows + # you to fully customize your rendering behavior (which workers should render, + # which episodes, which (vector) env indices, etc..). We refer to this example + # script here for further details: + # https://github.com/ray-project/ray/blob/master/rllib/examples/envs/env_rendering_and_recording.py # noqa + .callbacks(EnvRenderCallback) + ) + + if args.num_agents > 0: + base_config.multi_agent( + policies={f"p{i}" for i in range(args.num_agents)}, + policy_mapping_fn=lambda aid, eps, **kw: f"p{aid}", + ) + + run_rllib_example_script_experiment(base_config, args) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/custom_gym_env.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/custom_gym_env.py new file mode 100644 index 0000000000000000000000000000000000000000..2612575adb63749e536db0e3e6b79a5fbdb39247 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/custom_gym_env.py @@ -0,0 +1,162 @@ +"""Example of defining a custom gymnasium Env to be learned by an RLlib Algorithm.
+ +This example: + - demonstrates how to write your own (single-agent) gymnasium Env class, define its + physics and mechanics, the reward function used, the allowed actions (action space), + and the type of observations (observation space), etc.. + - shows how to configure and setup this environment class within an RLlib + Algorithm config. + - runs the experiment with the configured algo, trying to solve the environment. + +To see more details on which env we are building for this example, take a look at the +`SimpleCorridor` class defined below. + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack` + +Use the `--corridor-length` option to set a custom length for the corridor. Note that +for extremely long corridors, the algorithm should take longer to learn. + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. 
+ +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- +You should see results similar to the following in your console output: + ++--------------------------------+------------+-----------------+--------+ +| Trial name | status | loc | iter | +|--------------------------------+------------+-----------------+--------+ +| PPO_SimpleCorridor_78714_00000 | TERMINATED | 127.0.0.1:85794 | 7 | ++--------------------------------+------------+-----------------+--------+ + ++------------------+-------+----------+--------------------+ +| total time (s) | ts | reward | episode_len_mean | +|------------------+-------+----------+--------------------| +| 18.3034 | 28000 | 0.908918 | 12.9676 | ++------------------+-------+----------+--------------------+ +""" +# These tags allow extracting portions of this script on Anyscale. +# ws-template-imports-start +import gymnasium as gym +from gymnasium.spaces import Discrete, Box +import numpy as np +import random + +from typing import Optional + +# ws-template-imports-end + +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.tune.registry import get_trainable_cls, register_env # noqa + + +parser = add_rllib_example_script_args( + default_reward=0.9, default_iters=50, default_timesteps=100000 +) +parser.add_argument( + "--corridor-length", + type=int, + default=10, + help="The length of the corridor in fields. Note that this number includes the " + "starting- and goal states.", +) + + +# These tags allow extracting portions of this script on Anyscale. +# ws-template-code-start +class SimpleCorridor(gym.Env): + """Example of a custom env in which the agent has to walk down a corridor. 
+ + ------------ + |S........G| + ------------ + , where S is the starting position, G is the goal position, and fields with '.' + mark free spaces, over which the agent may step. The length of the above example + corridor is 10. + Allowed actions are left (0) and right (1). + The reward function is -0.01 per step taken and a uniform random value between + 0.5 and 1.5 when reaching the goal state. + + You can configure the length of the corridor via the env's config. Thus, in your + AlgorithmConfig, you can do: + `config.environment(env_config={"corridor_length": ..})`. + """ + + def __init__(self, config: Optional[dict] = None): + config = config or {} + self.end_pos = config.get("corridor_length", 7) + self.cur_pos = 0 + self.action_space = Discrete(2) + self.observation_space = Box(0.0, self.end_pos, shape=(1,), dtype=np.float32) + + def reset(self, *, seed=None, options=None): + random.seed(seed) + self.cur_pos = 0 + # Return obs and (empty) info dict. + return np.array([self.cur_pos], np.float32), {"env_state": "reset"} + + def step(self, action): + assert action in [0, 1], action + # Move left. + if action == 0 and self.cur_pos > 0: + self.cur_pos -= 1 + # Move right. + elif action == 1: + self.cur_pos += 1 + + # The environment only ever terminates when we reach the goal state. + terminated = self.cur_pos >= self.end_pos + truncated = False + # Produce a random reward from [0.5, 1.5] when we reach the goal. + reward = random.uniform(0.5, 1.5) if terminated else -0.01 + infos = {} + return ( + np.array([self.cur_pos], np.float32), + reward, + terminated, + truncated, + infos, + ) + + +# ws-template-code-end + +if __name__ == "__main__": + args = parser.parse_args() + + # Can also register the env creator function explicitly with: + # register_env("corridor-env", lambda config: SimpleCorridor()) + + # Or you can hard code certain settings into the Env's constructor (`config`). 
+ # register_env( + # "corridor-env-w-len-100", + # lambda config: SimpleCorridor({**config, **{"corridor_length": 100}}), + # ) + + # Or allow the RLlib user to set more c'tor options via their algo config: + # config.environment(env_config={[c'tor arg name]: [value]}) + # register_env("corridor-env", lambda config: SimpleCorridor(config)) + + base_config = ( + get_trainable_cls(args.algo) + .get_default_config() + .environment( + SimpleCorridor, # or provide the registered string: "corridor-env" + env_config={"corridor_length": args.corridor_length}, + ) + ) + + run_rllib_example_script_experiment(base_config, args) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/env_connecting_to_rllib_w_tcp_client.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/env_connecting_to_rllib_w_tcp_client.py new file mode 100644 index 0000000000000000000000000000000000000000..7d71ad95573f052a9ff03258086c6be787f22fbc --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/env_connecting_to_rllib_w_tcp_client.py @@ -0,0 +1,130 @@ +"""Example of running against a TCP-connected external env performing its own inference. + +The example uses a custom EnvRunner (TcpClientInferenceEnvRunner) to allow +connections from one or more TCP clients to RLlib's EnvRunner actors, which act as +RL servers. +In this example, action inference for stepping the env is performed on the client's +side, meaning the client computes all actions itself, applies them to the env logic, +collects episodes of experiences, and sends these (in bulk) back to RLlib for training. +Also, from time to time, the updated model weights have to be sent from RLlib (server) +back to the connected clients. +Note that RLlib's new API stack does not yet support individual action requests, where +action computations happen on the RLlib (server) side. 
+ +This example: + - demonstrates how RLlib can be hooked up to an externally running complex simulator + through TCP connections. + - shows how a custom EnvRunner subclass can be configured allowing users to + implement their own logic of connecting to external processes and customize the + messaging protocols. + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack --port 5555 + +Use the `--port` option to change the default port (5555) to some other value. +Make sure that you do the same on the client side. + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- +You should see something like this on your terminal. Note that the dummy CartPole +client (which runs in a thread for the purpose of this example here) might throw +a disconnection error at the end, b/c RLlib closes the server socket when done training. + ++----------------------+------------+--------+------------------+ +| Trial name | status | iter | total time (s) | +| | | | | +|----------------------+------------+--------+------------------+ +| PPO_None_3358e_00000 | TERMINATED | 40 | 32.2649 | ++----------------------+------------+--------+------------------+ ++------------------------+------------------------+ +| episode_return_mean | num_env_steps_sample | +| | d_lifetime | +|-----------------------+------------------------| +| 458.68 | 160000 | ++-----------------------+------------------------+ + +From the dummy client (thread), you should see at the end: +``` +ConnectionError: Error receiving message from peer on socket ... 
+``` +""" +from functools import partial +import threading + +import gymnasium as gym +import numpy as np + +from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig +from ray.rllib.env.tcp_client_inference_env_runner import ( + _dummy_client, + TcpClientInferenceEnvRunner, +) +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.tune.registry import get_trainable_cls + +parser = add_rllib_example_script_args( + default_reward=450.0, default_iters=200, default_timesteps=2000000 +) +parser.set_defaults( + enable_new_api_stack=True, + num_env_runners=1, +) +parser.add_argument( + "--port", + type=int, + default=5555, + help="The port for RLlib's EnvRunner to listen to for incoming UE5 connections. " + "You need to specify the same port inside your UE5 `RLlibClient` plugin.", +) + + +if __name__ == "__main__": + args = parser.parse_args() + + # Start the dummy CartPole client in a thread (and do its thing in parallel). + client_thread = threading.Thread( + target=partial( + _dummy_client, + port=args.port + + (args.num_env_runners if args.num_env_runners is not None else 1), + ), + ) + client_thread.start() + + # Define the RLlib (server) config. + base_config = ( + get_trainable_cls(args.algo) + .get_default_config() + .environment( + observation_space=gym.spaces.Box(-1.0, 1.0, (4,), np.float32), + action_space=gym.spaces.Discrete(2), + # EnvRunners listen on `port` + their worker index. + env_config={"port": args.port}, + ) + .env_runners( + # Point RLlib to the custom EnvRunner to be used here. 
+ env_runner_cls=TcpClientInferenceEnvRunner, + ) + .training( + num_epochs=10, + vf_loss_coeff=0.01, + ) + .rl_module(model_config=DefaultModelConfig(vf_share_layers=True)) + ) + + run_rllib_example_script_experiment(base_config, args) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/env_rendering_and_recording.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/env_rendering_and_recording.py new file mode 100644 index 0000000000000000000000000000000000000000..a74687ea92a5936f4f775e650b07e96b2183e152 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/env_rendering_and_recording.py @@ -0,0 +1,300 @@ +"""Example of using a custom Callback to render and log episode videos from a gym.Env. + +This example: + - shows how to set up your (Atari) gym.Env for human-friendly rendering inside the + `AlgorithmConfig.environment()` method. + - demonstrates how to write an RLlib custom callback class that renders all envs on + all timesteps, stores the individual images temporarily in the Episode + objects, and compiles a video from these images once the Episode terminates. + - furthermore, in each sampling cycle (iteration), the callback uses the unified + `MetricsLogger` facility - available in all RLlib components - to log the video of + the best performing and worst performing episode and sends these videos to WandB. + - configures the above callbacks class within the AlgorithmConfig. + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack --env [env name e.g. 'ALE/Pong-v5'] +--wandb-key=[your WandB API key] --wandb-project=[some WandB project name] +--wandb-run-name=[optional: WandB run name within --wandb-project]` + +In order to see the actual videos, you need to have a WandB account and provide your +API key and a project name on the command line (see above). 
To log the videos in WandB +you need to have the `wandb` and `moviepy` packages installed (`pip install wandb +moviepy`). + +Use the `--env` flag to control, which Atari env is used. Note that this example +only works with Atari envs. + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + + +Results to expect +----------------- +After the first training iteration, you should see the videos in your WandB account +under the provided `--wandb-project` name. Filter for "videos_best" or "videos_worst". + +Note that the default Tune TensorboardX (TBX) logger might complain about the videos +being logged. This is ok, the TBX logger will simply ignore these. The WandB logger, +however, will recognize the video tensors shaped +(1 [batch], T [video len], 3 [rgb], [height], [width]) and properly create a WandB video +object to be sent to their server. 
+ +Your terminal output should look similar to this: ++---------------------+----------+-----------------+--------+------------------+ +| Trial name | status | loc | iter | total time (s) | +| | | | | | +|---------------------+----------+-----------------+--------+------------------+ +| PPO_env_8d3f3_00000 | RUNNING | 127.0.0.1:89991 | 1 | 239.633 | ++---------------------+----------+-----------------+--------+------------------+ ++------------------------+------------------------+------------------------+ +| num_env_steps_sample | num_env_steps_traine | num_episodes_lifetim | +| d_lifetime | d_lifetime | e | ++------------------------+------------------------+------------------------| +| 4000 | 4000 | 24 | ++------------------------+------------------------+------------------------+ +""" +import gymnasium as gym +import numpy as np +from typing import Optional, Sequence + +from ray.rllib.callbacks.callbacks import RLlibCallback +from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig +from ray.rllib.env.wrappers.atari_wrappers import wrap_atari_for_new_api_stack +from ray.rllib.utils.images import resize +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.tune.registry import get_trainable_cls, register_env +from ray import tune + +parser = add_rllib_example_script_args(default_reward=20.0) +parser.set_defaults( + enable_new_api_stack=True, + env="ale_py:ALE/Pong-v5", +) + + +class EnvRenderCallback(RLlibCallback): + """A custom callback to render the environment. + + This can be used to create videos of the episodes for some or all EnvRunners + and some or all env indices (in a vectorized env). These videos can then + be sent to e.g. WandB as shown in this example script here. + + We override the `on_episode_step` method to create a single ts render image + and temporarily store it in the Episode object. 
+ """ + + def __init__(self, env_runner_indices: Optional[Sequence[int]] = None): + """Initializes an EnvRenderCallback instance. + + Args: + env_runner_indices: The (optional) EnvRunner indices, for this callback + should be active. If None, activates the rendering for all EnvRunners. + If a Sequence type, only renders, if the EnvRunner index is found in + `env_runner_indices`. + """ + super().__init__() + # Only render and record on certain EnvRunner indices? + self._env_runner_indices = env_runner_indices + # Per sample round (on this EnvRunner), we want to only log the best- and + # worst performing episode's videos in the custom metrics. Otherwise, too much + # data would be sent to WandB. + self.best_episode_and_return = (None, float("-inf")) + self.worst_episode_and_return = (None, float("inf")) + + def on_episode_step( + self, + *, + episode, + env_runner, + metrics_logger, + env, + env_index, + rl_module, + **kwargs, + ) -> None: + """Adds current render image to episode's temporary data. + + Note that this would work with MultiAgentEpisodes as well. + """ + # Skip, if this EnvRunner's index is not in `self._env_runner_indices`. + if ( + self._env_runner_indices is not None + and env_runner.worker_index not in self._env_runner_indices + ): + return + + # If we have a vector env, only render the sub-env at index 0. + if isinstance(env.unwrapped, gym.vector.VectorEnv): + image = env.unwrapped.envs[0].render() + # Render the gym.Env. + else: + image = env.unwrapped.render() + + # Original render images for CartPole are 400x600 (hxw). We'll downsize here to + # a very small dimension (to save space and bandwidth). + image = resize(image, 64, 96) + # For WandB videos, we need to put channels first. + image = np.transpose(image, axes=[2, 0, 1]) + # Add the compiled single-step image as temp. data to our Episode object. 
+ # Once the episode is done, we'll compile the video from all logged images + # and log the video with the EnvRunner's `MetricsLogger.log_...()` APIs. + # See below: + # `on_episode_end()`: We compile the video and maybe store it). + # `on_sample_end()` We log the best and worst video to the `MetricsLogger`. + episode.add_temporary_timestep_data("render_images", image) + + def on_episode_end( + self, + *, + episode, + env_runner, + metrics_logger, + env, + env_index, + rl_module, + **kwargs, + ) -> None: + """Computes episode's return and compiles a video, iff best/worst in this iter. + + Note that the actual logging to the EnvRunner's MetricsLogger only happens + at the very env of sampling (when we know, which episode was the best and + worst). See `on_sample_end` for the implemented logging logic. + """ + if ( + self._env_runner_indices is not None + and env_runner.worker_index not in self._env_runner_indices + ): + return + + # Get the episode's return. + episode_return = episode.get_return() + + # Better than the best or worse than worst Episode thus far? + if ( + episode_return > self.best_episode_and_return[1] + or episode_return < self.worst_episode_and_return[1] + ): + # Pull all images from the temp. data of the episode. + images = episode.get_temporary_timestep_data("render_images") + # `images` is now a list of 3D ndarrays + + # Create a video from the images by simply stacking them AND + # adding an extra B=1 dimension. Note that Tune's WandB logger currently + # knows how to log the different data types by the following rules: + # array is shape=3D -> An image (c, h, w). + # array is shape=4D -> A batch of images (B, c, h, w). + # array is shape=5D -> A video (1, L, c, h, w), where L is the length of the + # video. + # -> Make our video ndarray a 5D one. + video = np.expand_dims(np.stack(images, axis=0), axis=0) + + # `video` is from the best episode in this cycle (iteration). 
+ if episode_return > self.best_episode_and_return[1]: + self.best_episode_and_return = (video, episode_return) + # `video` is worst in this cycle (iteration). + else: + self.worst_episode_and_return = (video, episode_return) + + def on_sample_end( + self, + *, + env_runner, + metrics_logger, + samples, + **kwargs, + ) -> None: + """Logs the best and worst video to this EnvRunner's MetricsLogger.""" + # Best video. + if self.best_episode_and_return[0] is not None: + metrics_logger.log_value( + "episode_videos_best", + self.best_episode_and_return[0], + # Do not reduce the videos (across the various parallel EnvRunners). + # This would not make sense (mean over the pixels?). Instead, we want to + # log all best videos of all EnvRunners per iteration. + reduce=None, + # B/c we do NOT reduce over the video data (mean/min/max), we need to + # make sure the list of videos in our MetricsLogger does not grow + # infinitely and gets cleared after each `reduce()` operation, meaning + # every time, the EnvRunner is asked to send its logged metrics. + clear_on_reduce=True, + ) + self.best_episode_and_return = (None, float("-inf")) + # Worst video. + if self.worst_episode_and_return[0] is not None: + metrics_logger.log_value( + "episode_videos_worst", + self.worst_episode_and_return[0], + # Same logging options as above. + reduce=None, + clear_on_reduce=True, + ) + self.worst_episode_and_return = (None, float("inf")) + + +if __name__ == "__main__": + args = parser.parse_args() + + # Register our environment with tune. + def _env_creator(cfg): + cfg.update({"render_mode": "rgb_array"}) + if args.env.startswith("ale_py:ALE/"): + cfg.update( + { + # Make analogous to old v4 + NoFrameskip. 
+ "frameskip": 1, + "full_action_space": False, + "repeat_action_probability": 0.0, + } + ) + return wrap_atari_for_new_api_stack(gym.make(args.env, **cfg), framestack=4) + else: + return gym.make(args.env, **cfg) + + register_env("env", _env_creator) + + base_config = ( + get_trainable_cls(args.algo).get_default_config() + # Use the above-registered environment. + .environment("env") + # Plug in our custom callback that controls, which videos are created (best, + # and worst per sampling cycle per EnvRunner) and then logged via the + # `MetricsLogger` API. + .callbacks(EnvRenderCallback) + # Switch off RLlib's logging to avoid having the large videos show up in any log + # files. + .debugging(logger_config={"type": tune.logger.NoopLogger}) + # The following settings are beneficial for Atari-type environments. Feel free + # to adjust these when providing a non-Atari `--env` option. + .training( + lambda_=0.95, + kl_coeff=0.5, + clip_param=0.1, + vf_clip_param=10.0, + entropy_coeff=0.01, + num_epochs=10, + # Linearly adjust learning rate based on number of GPUs. + lr=0.00015 * (args.num_learners or 1), + grad_clip=100.0, + grad_clip_by="global_norm", + ) + ) + + if base_config.is_atari: + base_config.rl_module( + model_config=DefaultModelConfig( + conv_filters=[[16, 4, 2], [32, 4, 2], [64, 4, 2], [128, 4, 2]], + conv_activation="relu", + head_fcnet_hiddens=[256], + vf_share_layers=True, + ), + ) + + run_rllib_example_script_experiment(base_config, args) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/env_w_protobuf_observations.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/env_w_protobuf_observations.py new file mode 100644 index 0000000000000000000000000000000000000000..c1fec7c753161f9e7a89a5f1d78ce4560d47f13d --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/env_w_protobuf_observations.py @@ -0,0 +1,78 @@ +"""Example of handling an Env that outputs protobuf observations. 
+ +This example: + - demonstrates how a custom Env can use protobufs to compress its observation into + a binary format to save space and gain performance. + - shows how to use a very simple ConnectorV2 piece that translates these protobuf + binary observation strings into proper more NN-readable observations (like a 1D + float32 tensor). + +To see more details on which env we are building for this example, take a look at the +`CartPoleWithProtobufObservationSpace` class imported below. +To see more details on which ConnectorV2 piece we are plugging into the config +below, take a look at the `ProtobufCartPoleObservationDecoder` class imported below. + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack` + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- +You should see results similar to the following in your console output: + ++------------------------------------------------------+------------+-----------------+ +| Trial name | status | loc | +| | | | +|------------------------------------------------------+------------+-----------------+ +| PPO_CartPoleWithProtobufObservationSpace_47dd2_00000 | TERMINATED | 127.0.0.1:67325 | ++------------------------------------------------------+------------+-----------------+ ++--------+------------------+------------------------+------------------------+ +| iter | total time (s) | episode_return_mean | num_episodes_lifetim | +| | | | e | ++--------+------------------+------------------------+------------------------+ +| 17 | 39.9011 | 513.29 | 465 | 
++--------+------------------+------------------------+------------------------+ +""" +from ray.rllib.examples.connectors.classes.protobuf_cartpole_observation_decoder import ( # noqa + ProtobufCartPoleObservationDecoder, +) +from ray.rllib.examples.envs.classes.cartpole_with_protobuf_observation_space import ( + CartPoleWithProtobufObservationSpace, +) +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.tune.registry import get_trainable_cls + + +parser = add_rllib_example_script_args(default_timesteps=200000, default_reward=400.0) +parser.set_defaults(enable_new_api_stack=True) + + +if __name__ == "__main__": + args = parser.parse_args() + + base_config = ( + get_trainable_cls(args.algo).get_default_config() + # Set up the env to be CartPole-v1, but with protobuf observations. + .environment(CartPoleWithProtobufObservationSpace) + # Plugin our custom ConnectorV2 piece to translate protobuf observations + # (box of dtype uint8) into NN-readible ones (1D tensor of dtype flaot32). + .env_runners( + env_to_module_connector=lambda env: ProtobufCartPoleObservationDecoder(), + ) + ) + + run_rllib_example_script_experiment(base_config, args) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/greyscale_env.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/greyscale_env.py new file mode 100644 index 0000000000000000000000000000000000000000..2f0e5ffc956002a9cc2fbef3569aab2911c9cafe --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/greyscale_env.py @@ -0,0 +1,127 @@ +# @OldAPIStack +""" +Example of interfacing with an environment that produces 2D observations. + +This example shows how turning 2D observations with shape (A, B) into a 3D +observations with shape (C, D, 1) can enable usage of RLlib's default models. +RLlib's default Catalog class does not provide default models for 2D observation +spaces, but it does so for 3D observations. 
+Therefore, one can either write a custom model or transform the 2D observations into 3D +observations. This enables RLlib to use one of the default CNN filters, even though the +original observation space of the environment does not fit them. + +This simple example should reach rewards of 50 within 150k timesteps. +""" + +from numpy import float32 +import argparse +from pettingzoo.butterfly import pistonball_v6 +from supersuit import ( + normalize_obs_v0, + dtype_v0, + color_reduction_v0, + reshape_v0, + resize_v1, +) + +from ray.air.constants import TRAINING_ITERATION +from ray.rllib.algorithms.ppo import PPOConfig +from ray.rllib.env import PettingZooEnv +from ray.rllib.utils.metrics import ( + ENV_RUNNER_RESULTS, + EPISODE_RETURN_MEAN, + NUM_ENV_STEPS_SAMPLED_LIFETIME, +) +from ray.tune.registry import register_env +from ray import tune +from ray import air + + +parser = argparse.ArgumentParser() +parser.add_argument( + "--framework", + choices=["tf2", "torch"], + default="torch", + help="The DL framework specifier.", +) +parser.add_argument( + "--as-test", + action="store_true", + help="Whether this script should be run as a compilation test.", +) +parser.add_argument( + "--stop-iters", type=int, default=150, help="Number of iterations to train." +) +parser.add_argument( + "--stop-timesteps", type=int, default=1000000, help="Number of timesteps to train." +) +parser.add_argument( + "--stop-reward", type=float, default=50, help="Reward at which we stop training." +) + +args = parser.parse_args() + + +# The space we down-sample and transform the greyscale pistonball images to. +# Other spaces supported by RLlib can be chosen here. 
+TRANSFORMED_OBS_SPACE = (42, 42, 1) + + +def env_creator(config): + env = pistonball_v6.env(n_pistons=5) + env = dtype_v0(env, dtype=float32) + # This gives us greyscale images for the color red + env = color_reduction_v0(env, mode="R") + env = normalize_obs_v0(env) + # This gives us images that are upsampled to the number of pixels in the + # default CNN filter + env = resize_v1( + env, x_size=TRANSFORMED_OBS_SPACE[0], y_size=TRANSFORMED_OBS_SPACE[1] + ) + # This gives us 3D images for which we have default filters + env = reshape_v0(env, shape=TRANSFORMED_OBS_SPACE) + return env + + +# Register env +register_env("pistonball", lambda config: PettingZooEnv(env_creator(config))) + +config = ( + PPOConfig() + .environment("pistonball", env_config={"local_ratio": 0.5}, clip_rewards=True) + .env_runners( + num_env_runners=15 if not args.as_test else 2, + num_envs_per_env_runner=1, + observation_filter="NoFilter", + rollout_fragment_length="auto", + ) + .framework("torch") + .training( + entropy_coeff=0.01, + vf_loss_coeff=0.1, + clip_param=0.1, + vf_clip_param=10.0, + num_epochs=10, + kl_coeff=0.5, + lr=0.0001, + grad_clip=100, + minibatch_size=500, + train_batch_size=5000 if not args.as_test else 1000, + model={"vf_share_layers": True}, + ) + .resources(num_gpus=1 if not args.as_test else 0) + .reporting(min_time_s_per_iteration=30) +) + +tune.Tuner( + "PPO", + param_space=config.to_dict(), + run_config=air.RunConfig( + stop={ + TRAINING_ITERATION: args.stop_iters, + NUM_ENV_STEPS_SAMPLED_LIFETIME: args.stop_timesteps, + f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": args.stop_reward, + }, + verbose=2, + ), +).fit() diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/unity3d_env_local.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/unity3d_env_local.py new file mode 100644 index 0000000000000000000000000000000000000000..d334125ee4e81d0021ee5a167d591ce0ed991944 --- /dev/null +++ 
b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/unity3d_env_local.py @@ -0,0 +1,213 @@ +# @OldAPIStack + +""" +Example of running an RLlib Algorithm against a locally running Unity3D editor +instance (available as Unity3DEnv inside RLlib). +For a distributed cloud setup example with Unity, +see `examples/envs/external_envs/unity3d_[server|client].py` + +To run this script against a local Unity3D engine: +1) Install Unity3D and `pip install mlagents`. + +2) Open the Unity3D Editor and load an example scene from the following + ml-agents pip package location: + `.../ml-agents/Project/Assets/ML-Agents/Examples/` + This script supports the `3DBall`, `3DBallHard`, `SoccerStrikersVsGoalie`, + `Tennis`, and `Walker` examples. + Specify the game you chose on your command line via e.g. `--env 3DBall`. + Feel free to add more supported examples here. + +3) Then run this script (you will have to press Play in your Unity editor + at some point to start the game and the learning process): +$ python unity3d_env_local.py --env 3DBall --stop-reward [..] + [--framework=torch]? 
+""" + +import argparse +import os + +import ray +from ray import air, tune +from ray.air.constants import TRAINING_ITERATION +from ray.rllib.algorithms.ppo import PPOConfig +from ray.rllib.env.wrappers.unity3d_env import Unity3DEnv +from ray.rllib.utils.metrics import ( + ENV_RUNNER_RESULTS, + EPISODE_RETURN_MEAN, + NUM_ENV_STEPS_SAMPLED_LIFETIME, +) +from ray.rllib.utils.test_utils import check_learning_achieved + +parser = argparse.ArgumentParser() +parser.add_argument( + "--env", + type=str, + default="3DBall", + choices=[ + "3DBall", + "3DBallHard", + "GridFoodCollector", + "Pyramids", + "SoccerStrikersVsGoalie", + "SoccerTwos", + "Sorter", + "Tennis", + "VisualHallway", + "Walker", + ], + help="The name of the Env to run in the Unity3D editor: `3DBall(Hard)?|" + "Pyramids|GridFoodCollector|SoccerStrikersVsGoalie|Sorter|Tennis|" + "VisualHallway|Walker` (feel free to add more and PR!)", +) +parser.add_argument( + "--file-name", + type=str, + default=None, + help="The Unity3d binary (compiled) game, e.g. " + "'/home/ubuntu/soccer_strikers_vs_goalie_linux.x86_64'. Use `None` for " + "a currently running Unity3D editor.", +) +parser.add_argument( + "--from-checkpoint", + type=str, + default=None, + help="Full path to a checkpoint file for restoring a previously saved " + "Algorithm state.", +) +parser.add_argument("--num-workers", type=int, default=0) +parser.add_argument( + "--as-test", + action="store_true", + help="Whether this script should be run as a test: --stop-reward must " + "be achieved within --stop-timesteps AND --stop-iters.", +) +parser.add_argument( + "--stop-iters", type=int, default=9999, help="Number of iterations to train." +) +parser.add_argument( + "--stop-timesteps", type=int, default=10000000, help="Number of timesteps to train." +) +parser.add_argument( + "--stop-reward", + type=float, + default=9999.0, + help="Reward at which we stop training.", +) +parser.add_argument( + "--horizon", + type=int, + default=3000, + help="The max. 
number of `step()`s for any episode (per agent) before " + "it'll be reset again automatically.", +) +parser.add_argument( + "--framework", + choices=["tf", "tf2", "torch"], + default="torch", + help="The DL framework specifier.", +) + +if __name__ == "__main__": + ray.init() + + args = parser.parse_args() + + tune.register_env( + "unity3d", + lambda c: Unity3DEnv( + file_name=c["file_name"], + no_graphics=(args.env != "VisualHallway" and c["file_name"] is not None), + episode_horizon=c["episode_horizon"], + ), + ) + + # Get policies (different agent types; "behaviors" in MLAgents) and + # the mappings from individual agents to Policies. + policies, policy_mapping_fn = Unity3DEnv.get_policy_configs_for_game(args.env) + + config = ( + PPOConfig() + .environment( + "unity3d", + env_config={ + "file_name": args.file_name, + "episode_horizon": args.horizon, + }, + ) + .framework("tf" if args.env != "Pyramids" else "torch") + # For running in editor, force to use just one Worker (we only have + # one Unity running)! + .env_runners( + num_env_runners=args.num_workers if args.file_name else 0, + rollout_fragment_length=200, + ) + .training( + lr=0.0003, + lambda_=0.95, + gamma=0.99, + minibatch_size=256, + train_batch_size=4000, + num_epochs=20, + clip_param=0.2, + model={"fcnet_hiddens": [512, 512]}, + ) + .multi_agent(policies=policies, policy_mapping_fn=policy_mapping_fn) + # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. + .resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0"))) + ) + + # Switch on Curiosity based exploration for Pyramids env + # (not solvable otherwise). + if args.env == "Pyramids": + config.env_runners( + exploration_config={ + "type": "Curiosity", + "eta": 0.1, + "lr": 0.001, + # No actual feature net: map directly from observations to feature + # vector (linearly). 
+ "feature_net_config": { + "fcnet_hiddens": [], + "fcnet_activation": "relu", + }, + "sub_exploration": { + "type": "StochasticSampling", + }, + "forward_net_activation": "relu", + "inverse_net_activation": "relu", + } + ) + elif args.env == "GridFoodCollector": + config.training( + model={ + "conv_filters": [[16, [4, 4], 2], [32, [4, 4], 2], [256, [10, 10], 1]], + } + ) + elif args.env == "Sorter": + config.training(model={"use_attention": True}) + + stop = { + TRAINING_ITERATION: args.stop_iters, + NUM_ENV_STEPS_SAMPLED_LIFETIME: args.stop_timesteps, + f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": args.stop_reward, + } + + # Run the experiment. + results = tune.Tuner( + "PPO", + param_space=config.to_dict(), + run_config=air.RunConfig( + stop=stop, + verbose=1, + checkpoint_config=air.CheckpointConfig( + checkpoint_frequency=5, + checkpoint_at_end=True, + ), + ), + ).fit() + + # And check the results. + if args.as_test: + check_learning_achieved(results, args.stop_reward) + + ray.shutdown() diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/inference/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/inference/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/inference/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/inference/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..93e13b085d4244d66aa7a768b29c94a580c4ce72 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/inference/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/inference/__pycache__/policy_inference_after_training.cpython-311.pyc 
b/.venv/lib/python3.11/site-packages/ray/rllib/examples/inference/__pycache__/policy_inference_after_training.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ce5f1d0a9331ed13b9e92b9759ab4dcd3ddf2763 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/inference/__pycache__/policy_inference_after_training.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/inference/__pycache__/policy_inference_after_training_w_connector.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/inference/__pycache__/policy_inference_after_training_w_connector.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..33cf5aeb9e0be4e53391219cf7315554ca9e75db Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/inference/__pycache__/policy_inference_after_training_w_connector.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/inference/__pycache__/policy_inference_after_training_with_attention.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/inference/__pycache__/policy_inference_after_training_with_attention.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..91927a2cf3353c0d03c695ed048e768b5eff431e Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/inference/__pycache__/policy_inference_after_training_with_attention.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/inference/__pycache__/policy_inference_after_training_with_lstm.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/inference/__pycache__/policy_inference_after_training_with_lstm.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..03980cf40af8effca84d2a49ac10f0ab2dc4a7b8 Binary files /dev/null and 
b/.venv/lib/python3.11/site-packages/ray/rllib/examples/inference/__pycache__/policy_inference_after_training_with_lstm.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/inference/policy_inference_after_training.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/inference/policy_inference_after_training.py new file mode 100644 index 0000000000000000000000000000000000000000..4ece833c3c53b0c19e7e8f9cef0ddadf35659044 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/inference/policy_inference_after_training.py @@ -0,0 +1,188 @@ +"""Example on how to compute actions in production on an already trained policy. + +This example uses the simplest setup possible: An RLModule (policy net) recovered +from a checkpoint and a manual env-loop (CartPole-v1). No ConnectorV2s or EnvRunners are +used in this example. + +This example: + - shows how to use an already existing checkpoint to extract a single-agent RLModule + from (our policy network). + - shows how to setup this recovered policy net for action computations (with or + without using exploration). + - shows have the policy run through a very simple gymnasium based env-loop, w/o + using RLlib's ConnectorV2s or EnvRunners. + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack --stop-reward=200.0` + +Use the `--explore-during-inference` option to switch on exploratory behavior +during inference. Normally, you should not explore during inference, though, +unless your environment has a stochastic optimal solution. +Use the `--num-episodes-during-inference=[int]` option to set the number of +episodes to run through during the inference phase using the restored RLModule. + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. 
+ +Note that the shown GPU settings in this script also work in case you are not +running via tune, but instead are using the `--no-tune` command line option. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + +You can visualize experiment results in ~/ray_results using TensorBoard. + + +Results to expect +----------------- + +For the training step - depending on your `--stop-reward` setting, you should see +something similar to this: + +Number of trials: 1/1 (1 TERMINATED) ++-----------------------------+------------+-----------------+--------+ +| Trial name | status | loc | iter | +| | | | | +|-----------------------------+------------+-----------------+--------+ +| PPO_CartPole-v1_6660c_00000 | TERMINATED | 127.0.0.1:43566 | 8 | ++-----------------------------+------------+-----------------+--------+ ++------------------+------------------------+------------------------+ +| total time (s) | num_env_steps_sample | num_env_steps_traine | +| | d_lifetime | d_lifetime | ++------------------+------------------------+------------------------+ +| 21.0283 | 32000 | 32000 | ++------------------+------------------------+------------------------+ + +Then, after restoring the RLModule for the inference phase, your output should +look similar to: + +Training completed. Restoring new RLModule for action inference. 
+Episode done: Total reward = 500.0 +Episode done: Total reward = 500.0 +Episode done: Total reward = 500.0 +Episode done: Total reward = 500.0 +Episode done: Total reward = 500.0 +Episode done: Total reward = 500.0 +Episode done: Total reward = 500.0 +Episode done: Total reward = 500.0 +Episode done: Total reward = 500.0 +Episode done: Total reward = 500.0 +Done performing action inference through 10 Episodes +""" +import gymnasium as gym +import numpy as np +import os + +from ray.rllib.core import DEFAULT_MODULE_ID +from ray.rllib.core.columns import Columns +from ray.rllib.core.rl_module.rl_module import RLModule +from ray.rllib.utils.framework import try_import_torch +from ray.rllib.utils.numpy import convert_to_numpy, softmax +from ray.rllib.utils.metrics import ( + ENV_RUNNER_RESULTS, + EPISODE_RETURN_MEAN, +) +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.tune.registry import get_trainable_cls + +torch, _ = try_import_torch() + +parser = add_rllib_example_script_args(default_reward=200.0) +parser.set_defaults( + # Make sure that - by default - we produce checkpoints during training. + checkpoint_freq=1, + checkpoint_at_end=True, + # Use CartPole-v1 by default. + env="CartPole-v1", + # Script only runs on new API stack. + enable_new_api_stack=True, +) +parser.add_argument( + "--explore-during-inference", + action="store_true", + help="Whether the trained policy should use exploration during action " + "inference.", +) +parser.add_argument( + "--num-episodes-during-inference", + type=int, + default=10, + help="Number of episodes to do inference over (after restoring from a checkpoint).", +) + + +if __name__ == "__main__": + args = parser.parse_args() + + assert ( + args.enable_new_api_stack + ), "Must set --enable-new-api-stack when running this script!" 
+ + base_config = get_trainable_cls(args.algo).get_default_config() + + print("Training policy until desired reward/timesteps/iterations. ...") + results = run_rllib_example_script_experiment(base_config, args) + + print("Training completed. Restoring new RLModule for action inference.") + # Get the last checkpoint from the above training run. + best_result = results.get_best_result( + metric=f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}", mode="max" + ) + # Create new RLModule and restore its state from the last algo checkpoint. + # Note that the checkpoint for the RLModule can be found deeper inside the algo + # checkpoint's subdirectories ([algo dir] -> "learner/" -> "module_state/" -> + # "[module ID]): + rl_module = RLModule.from_checkpoint( + os.path.join( + best_result.checkpoint.path, + "learner_group", + "learner", + "rl_module", + DEFAULT_MODULE_ID, + ) + ) + + # Create an env to do inference in. + env = gym.make(args.env) + obs, info = env.reset() + + num_episodes = 0 + episode_return = 0.0 + + while num_episodes < args.num_episodes_during_inference: + # Compute an action using a B=1 observation "batch". + input_dict = {Columns.OBS: torch.from_numpy(obs).unsqueeze(0)} + # No exploration. + if not args.explore_during_inference: + rl_module_out = rl_module.forward_inference(input_dict) + # Using exploration. + else: + rl_module_out = rl_module.forward_exploration(input_dict) + + # For discrete action spaces used here, normally, an RLModule "only" + # produces action logits, from which we then have to sample. + # However, you can also write custom RLModules that output actions + # directly, performing the sampling step already inside their + # `forward_...()` methods. + logits = convert_to_numpy(rl_module_out[Columns.ACTION_DIST_INPUTS]) + # Perform the sampling step in numpy for simplicity. + action = np.random.choice(env.action_space.n, p=softmax(logits[0])) + # Send the computed action `a` to the env. 
+ obs, reward, terminated, truncated, _ = env.step(action) + episode_return += reward + # Is the episode `done`? -> Reset. + if terminated or truncated: + print(f"Episode done: Total reward = {episode_return}") + obs, info = env.reset() + num_episodes += 1 + episode_return = 0.0 + + print(f"Done performing action inference through {num_episodes} Episodes") diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/inference/policy_inference_after_training_w_connector.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/inference/policy_inference_after_training_w_connector.py new file mode 100644 index 0000000000000000000000000000000000000000..f19505e22dd4963a55157489bb3c26909501675f --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/inference/policy_inference_after_training_w_connector.py @@ -0,0 +1,274 @@ +"""Example on how to compute actions in production on an already trained policy. + +This example uses a more complex setup including a gymnasium environment, an +RLModule (one or more neural networks/policies), an env-to-module/module-to-env +ConnectorV2 pair, and an Episode object to store the ongoing episode in. +The RLModule contains an LSTM that requires its own previous STATE_OUT as new input +at every episode step to compute a new action. + +This example: + - shows how to use an already existing checkpoint to extract a single-agent RLModule + from (our policy network). + - shows how to setup this recovered policy net for action computations (with or + without using exploration). + - shows how to create a more complex env-loop in which the action-computing RLModule + requires its own previous state outputs as new input and how to use RLlib's Episode + APIs to achieve this. + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack --stop-reward=200.0` + +Use the `--explore-during-inference` option to switch on exploratory behavior +during inference. 
Normally, you should not explore during inference, though, +unless your environment has a stochastic optimal solution. +Use the `--num-episodes-during-inference=[int]` option to set the number of +episodes to run through during the inference phase using the restored RLModule. + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + +Note that the shown GPU settings in this script also work in case you are not +running via tune, but instead are using the `--no-tune` command line option. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + +You can visualize experiment results in ~/ray_results using TensorBoard. + + +Results to expect +----------------- + +For the training step - depending on your `--stop-reward` setting, you should see +something similar to this: + +Number of trials: 1/1 (1 TERMINATED) ++--------------------------------+------------+-----------------+--------+ +| Trial name | status | loc | iter | +| | | | | +|--------------------------------+------------+-----------------+--------+ +| PPO_stateless-cart_cc890_00000 | TERMINATED | 127.0.0.1:72238 | 7 | ++--------------------------------+------------+-----------------+--------+ ++------------------+------------------------+------------------------+ +| total time (s) | num_env_steps_sample | num_env_steps_traine | +| | d_lifetime | d_lifetime | ++------------------+------------------------+------------------------+ +| 31.9655 | 28000 | 28000 | ++------------------+------------------------+------------------------+ + +Then, after restoring the RLModule for the inference phase, your output should +look similar to: + +Training completed. Creating an env-loop for inference ... +Env ... 
+Env-to-module ConnectorV2 ... +RLModule restored ... +Module-to-env ConnectorV2 ... +Episode done: Total reward = 103.0 +Episode done: Total reward = 90.0 +Episode done: Total reward = 100.0 +Episode done: Total reward = 111.0 +Episode done: Total reward = 85.0 +Episode done: Total reward = 90.0 +Episode done: Total reward = 100.0 +Episode done: Total reward = 102.0 +Episode done: Total reward = 97.0 +Episode done: Total reward = 81.0 +Done performing action inference through 10 Episodes +""" +import os + +from ray.rllib.connectors.env_to_module import EnvToModulePipeline +from ray.rllib.connectors.module_to_env import ModuleToEnvPipeline +from ray.rllib.core import ( + COMPONENT_ENV_RUNNER, + COMPONENT_ENV_TO_MODULE_CONNECTOR, + COMPONENT_MODULE_TO_ENV_CONNECTOR, + COMPONENT_LEARNER_GROUP, + COMPONENT_LEARNER, + COMPONENT_RL_MODULE, + DEFAULT_MODULE_ID, +) +from ray.rllib.core.columns import Columns +from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig +from ray.rllib.core.rl_module.rl_module import RLModule +from ray.rllib.env.single_agent_episode import SingleAgentEpisode +from ray.rllib.examples.envs.classes.stateless_cartpole import StatelessCartPole +from ray.rllib.utils.framework import try_import_torch +from ray.rllib.utils.metrics import ( + ENV_RUNNER_RESULTS, + EPISODE_RETURN_MEAN, +) +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.tune.registry import get_trainable_cls, register_env + +torch, _ = try_import_torch() + + +def _env_creator(cfg): + return StatelessCartPole(cfg) + + +register_env("stateless-cart", _env_creator) + + +parser = add_rllib_example_script_args(default_reward=200.0) +parser.set_defaults( + # Script only runs on new API stack. + enable_new_api_stack=True, + # Make sure that - by default - we produce checkpoints during training. + checkpoint_freq=1, + checkpoint_at_end=True, + # Use StatelessCartPole by default. 
+ env="stateless-cart", +) +parser.add_argument( + "--explore-during-inference", + action="store_true", + help="Whether the trained policy should use exploration during action " + "inference.", +) +parser.add_argument( + "--num-episodes-during-inference", + type=int, + default=10, + help="Number of episodes to do inference over (after restoring from a checkpoint).", +) + + +if __name__ == "__main__": + args = parser.parse_args() + + base_config = ( + get_trainable_cls(args.algo) + .get_default_config() + .training( + num_epochs=6, + lr=0.0003, + vf_loss_coeff=0.01, + ) + # Add an LSTM setup to the default RLModule used. + .rl_module(model_config=DefaultModelConfig(use_lstm=True)) + ) + + print("Training LSTM-policy until desired reward/timesteps/iterations. ...") + results = run_rllib_example_script_experiment(base_config, args) + + # Get the last checkpoint from the above training run. + metric_key = metric = f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}" + best_result = results.get_best_result(metric=metric_key, mode="max") + + print( + "Training completed (R=" + f"{best_result.metrics[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN]}). " + "Creating an env-loop for inference ..." + ) + + print("Env ...", end="") + env = _env_creator(base_config.env_config) + print(" ok") + + # Create the env-to-module pipeline from the checkpoint. + print("Restore env-to-module connector from checkpoint ...", end="") + env_to_module = EnvToModulePipeline.from_checkpoint( + os.path.join( + best_result.checkpoint.path, + COMPONENT_ENV_RUNNER, + COMPONENT_ENV_TO_MODULE_CONNECTOR, + ) + ) + print(" ok") + + print("Restore RLModule from checkpoint ...", end="") + # Create RLModule from a checkpoint. + rl_module = RLModule.from_checkpoint( + os.path.join( + best_result.checkpoint.path, + COMPONENT_LEARNER_GROUP, + COMPONENT_LEARNER, + COMPONENT_RL_MODULE, + DEFAULT_MODULE_ID, + ) + ) + print(" ok") + + # For the module-to-env pipeline, we will use the convenient config utility. 
+ print("Restore module-to-env connector from checkpoint ...", end="") + module_to_env = ModuleToEnvPipeline.from_checkpoint( + os.path.join( + best_result.checkpoint.path, + COMPONENT_ENV_RUNNER, + COMPONENT_MODULE_TO_ENV_CONNECTOR, + ) + ) + print(" ok") + + # Now our setup is complete: + # [gym.Env] -> env-to-module -> [RLModule] -> module-to-env -> [gym.Env] ... repeat + num_episodes = 0 + + obs, _ = env.reset() + episode = SingleAgentEpisode( + observations=[obs], + observation_space=env.observation_space, + action_space=env.action_space, + ) + + while num_episodes < args.num_episodes_during_inference: + shared_data = {} + input_dict = env_to_module( + episodes=[episode], # ConnectorV2 pipelines operate on lists of episodes. + rl_module=rl_module, + explore=args.explore_during_inference, + shared_data=shared_data, + ) + # No exploration. + if not args.explore_during_inference: + rl_module_out = rl_module.forward_inference(input_dict) + # Using exploration. + else: + rl_module_out = rl_module.forward_exploration(input_dict) + + to_env = module_to_env( + batch=rl_module_out, + episodes=[episode], # ConnectorV2 pipelines operate on lists of episodes. + rl_module=rl_module, + explore=args.explore_during_inference, + shared_data=shared_data, + ) + # Send the computed action to the env. Note that the RLModule and the + # connector pipelines work on batched data (B=1 in this case), whereas the Env + # is not vectorized here, so we need to use `action[0]`. + action = to_env.pop(Columns.ACTIONS)[0] + obs, reward, terminated, truncated, _ = env.step(action) + # Keep our `SingleAgentEpisode` instance updated at all times. + episode.add_env_step( + obs, + action, + reward, + terminated=terminated, + truncated=truncated, + # Same here: [0] b/c RLModule output is batched (w/ B=1). + extra_model_outputs={k: v[0] for k, v in to_env.items()}, + ) + + # Is the episode `done`? -> Reset. 
+ if episode.is_done: + print(f"Episode done: Total reward = {episode.get_return()}") + obs, info = env.reset() + episode = SingleAgentEpisode( + observations=[obs], + observation_space=env.observation_space, + action_space=env.action_space, + ) + num_episodes += 1 + + print(f"Done performing action inference through {num_episodes} Episodes") diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/inference/policy_inference_after_training_with_attention.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/inference/policy_inference_after_training_with_attention.py new file mode 100644 index 0000000000000000000000000000000000000000..1e594066d18f583219f665f081cd293dfd9825e0 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/inference/policy_inference_after_training_with_attention.py @@ -0,0 +1,196 @@ +# @OldAPIStack +""" +Example showing how you can use your trained policy for inference +(computing actions) in an environment. + +Includes options for LSTM-based models (--use-lstm), attention-net models +(--use-attention), and plain (non-recurrent) models. +""" +import argparse +import gymnasium as gym +import numpy as np +import os + +import ray +from ray import air, tune +from ray.air.constants import TRAINING_ITERATION +from ray.rllib.algorithms.algorithm import Algorithm +from ray.rllib.utils.metrics import ( + ENV_RUNNER_RESULTS, + EPISODE_RETURN_MEAN, + NUM_ENV_STEPS_SAMPLED_LIFETIME, +) +from ray.tune.registry import get_trainable_cls + +parser = argparse.ArgumentParser() +parser.add_argument( + "--run", type=str, default="PPO", help="The RLlib-registered algorithm to use." 
+) +parser.add_argument("--num-cpus", type=int, default=0) +parser.add_argument( + "--framework", + choices=["tf", "tf2", "torch"], + default="torch", + help="The DL framework specifier.", +) +parser.add_argument( + "--prev-n-actions", + type=int, + default=0, + help="Feed n most recent actions to the attention net as part of its input.", +) +parser.add_argument( + "--prev-n-rewards", + type=int, + default=0, + help="Feed n most recent rewards to the attention net as part of its input.", +) +parser.add_argument( + "--stop-iters", + type=int, + default=200, + help="Number of iterations to train before we do inference.", +) +parser.add_argument( + "--stop-timesteps", + type=int, + default=100000, + help="Number of timesteps to train before we do inference.", +) +parser.add_argument( + "--stop-reward", + type=float, + default=150.0, + help="Reward at which we stop training before we do inference.", +) +parser.add_argument( + "--explore-during-inference", + action="store_true", + help="Whether the trained policy should use exploration during action " + "inference.", +) +parser.add_argument( + "--num-episodes-during-inference", + type=int, + default=10, + help="Number of episodes to do inference over after training.", +) + +if __name__ == "__main__": + args = parser.parse_args() + + ray.init(num_cpus=args.num_cpus or None) + + config = ( + get_trainable_cls(args.run) + .get_default_config() + .api_stack( + enable_env_runner_and_connector_v2=False, + enable_rl_module_and_learner=False, + ) + .environment("FrozenLake-v1") + # Run with tracing enabled for tf2? + .framework(args.framework) + .training( + model={ + "use_attention": True, + "attention_num_transformer_units": 1, + "attention_use_n_prev_actions": args.prev_n_actions, + "attention_use_n_prev_rewards": args.prev_n_rewards, + "attention_dim": 32, + "attention_memory_inference": 10, + "attention_memory_training": 10, + }, + ) + # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. 
+ .resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0"))) + ) + + stop = { + TRAINING_ITERATION: args.stop_iters, + NUM_ENV_STEPS_SAMPLED_LIFETIME: args.stop_timesteps, + f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": args.stop_reward, + } + + print("Training policy until desired reward/timesteps/iterations. ...") + tuner = tune.Tuner( + args.run, + param_space=config, + run_config=air.RunConfig( + stop=stop, + verbose=2, + checkpoint_config=air.CheckpointConfig( + checkpoint_frequency=1, + checkpoint_at_end=True, + ), + ), + ) + results = tuner.fit() + + print("Training completed. Restoring new Algorithm for action inference.") + # Get the last checkpoint from the above training run. + checkpoint = results.get_best_result().checkpoint + # Create new Algorithm and restore its state from the last checkpoint. + algo = Algorithm.from_checkpoint(checkpoint) + + # Create the env to do inference in. + env = gym.make("FrozenLake-v1") + obs, info = env.reset() + + # In case the model needs previous-reward/action inputs, keep track of + # these via these variables here (we'll have to pass them into the + # compute_actions methods below). + init_prev_a = prev_a = None + init_prev_r = prev_r = None + + # Set attention net's initial internal state. + num_transformers = config["model"]["attention_num_transformer_units"] + memory_inference = config["model"]["attention_memory_inference"] + attention_dim = config["model"]["attention_dim"] + init_state = state = [ + np.zeros([memory_inference, attention_dim], np.float32) + for _ in range(num_transformers) + ] + # Do we need prev-action/reward as part of the input? + if args.prev_n_actions: + init_prev_a = prev_a = np.array([0] * args.prev_n_actions) + if args.prev_n_rewards: + init_prev_r = prev_r = np.array([0.0] * args.prev_n_rewards) + + num_episodes = 0 + + while num_episodes < args.num_episodes_during_inference: + # Compute an action (`a`). 
+ a, state_out, _ = algo.compute_single_action( + observation=obs, + state=state, + prev_action=prev_a, + prev_reward=prev_r, + explore=args.explore_during_inference, + policy_id="default_policy", # <- default value + ) + # Send the computed action `a` to the env. + obs, reward, done, truncated, _ = env.step(a) + # Is the episode `done`? -> Reset. + if done: + obs, info = env.reset() + num_episodes += 1 + state = init_state + prev_a = init_prev_a + prev_r = init_prev_r + # Episode is still ongoing -> Continue. + else: + # Append the just received state-out (most recent timestep) to the + # cascade (memory) of our state-ins and drop the oldest state-in. + state = [ + np.concatenate([state[i], [state_out[i]]], axis=0)[1:] + for i in range(num_transformers) + ] + if init_prev_a is not None: + prev_a = a + if init_prev_r is not None: + prev_r = reward + + algo.stop() + + ray.shutdown() diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/inference/policy_inference_after_training_with_lstm.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/inference/policy_inference_after_training_with_lstm.py new file mode 100644 index 0000000000000000000000000000000000000000..39c6ac6aa58874f83c49a6a539feae1fda5189c0 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/inference/policy_inference_after_training_with_lstm.py @@ -0,0 +1,185 @@ +# @OldAPIStack +""" +Example showing how you can use your trained policy for inference +(computing actions) in an environment. + +Includes options for LSTM-based models (--use-lstm), attention-net models +(--use-attention), and plain (non-recurrent) models. 
+""" +import argparse +import gymnasium as gym +import numpy as np +import os + +import ray +from ray import air, tune +from ray.air.constants import TRAINING_ITERATION +from ray.rllib.algorithms.algorithm import Algorithm +from ray.rllib.utils.metrics import ( + ENV_RUNNER_RESULTS, + EPISODE_RETURN_MEAN, + NUM_ENV_STEPS_SAMPLED_LIFETIME, +) +from ray.tune.registry import get_trainable_cls + +parser = argparse.ArgumentParser() +parser.add_argument( + "--run", type=str, default="PPO", help="The RLlib-registered algorithm to use." +) +parser.add_argument("--num-cpus", type=int, default=0) +parser.add_argument( + "--framework", + choices=["tf", "tf2", "torch"], + default="torch", + help="The DL framework specifier.", +) +parser.add_argument( + "--prev-action", + action="store_true", + help="Feed most recent action to the LSTM as part of its input.", +) +parser.add_argument( + "--prev-reward", + action="store_true", + help="Feed most recent reward to the LSTM as part of its input.", +) +parser.add_argument( + "--stop-iters", + type=int, + default=2, + help="Number of iterations to train before we do inference.", +) +parser.add_argument( + "--stop-timesteps", + type=int, + default=100000, + help="Number of timesteps to train before we do inference.", +) +parser.add_argument( + "--stop-reward", + type=float, + default=0.8, + help="Reward at which we stop training before we do inference.", +) +parser.add_argument( + "--explore-during-inference", + action="store_true", + help="Whether the trained policy should use exploration during action " + "inference.", +) +parser.add_argument( + "--num-episodes-during-inference", + type=int, + default=10, + help="Number of episodes to do inference over after training.", +) + +if __name__ == "__main__": + args = parser.parse_args() + + ray.init(num_cpus=args.num_cpus or None) + + config = ( + get_trainable_cls(args.run) + .get_default_config() + .api_stack( + enable_env_runner_and_connector_v2=False, + 
enable_rl_module_and_learner=False, + ) + .environment("FrozenLake-v1") + # Run with tracing enabled for tf2? + .framework(args.framework) + .training( + model={ + "use_lstm": True, + "lstm_cell_size": 256, + "lstm_use_prev_action": args.prev_action, + "lstm_use_prev_reward": args.prev_reward, + }, + ) + # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. + .resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0"))) + ) + + stop = { + TRAINING_ITERATION: args.stop_iters, + NUM_ENV_STEPS_SAMPLED_LIFETIME: args.stop_timesteps, + f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": args.stop_reward, + } + + print("Training policy until desired reward/timesteps/iterations. ...") + tuner = tune.Tuner( + args.run, + param_space=config, + run_config=air.RunConfig( + stop=stop, + verbose=2, + checkpoint_config=air.CheckpointConfig( + checkpoint_frequency=1, + checkpoint_at_end=True, + ), + ), + ) + results = tuner.fit() + + print("Training completed. Restoring new Algorithm for action inference.") + # Get the last checkpoint from the above training run. + checkpoint = results.get_best_result().checkpoint + # Create new Algorithm from the last checkpoint. + algo = Algorithm.from_checkpoint(checkpoint) + + # Create the env to do inference in. + env = gym.make("FrozenLake-v1") + obs, info = env.reset() + + # In case the model needs previous-reward/action inputs, keep track of + # these via these variables here (we'll have to pass them into the + # compute_actions methods below). + init_prev_a = prev_a = None + init_prev_r = prev_r = None + + # Set LSTM's initial internal state. + lstm_cell_size = config["model"]["lstm_cell_size"] + # range(2) b/c h- and c-states of the LSTM. + if algo.config.enable_rl_module_and_learner: + init_state = state = algo.get_policy().model.get_initial_state() + else: + init_state = state = [np.zeros([lstm_cell_size], np.float32) for _ in range(2)] + # Do we need prev-action/reward as part of the input? 
+ if args.prev_action: + init_prev_a = prev_a = 0 + if args.prev_reward: + init_prev_r = prev_r = 0.0 + + num_episodes = 0 + + while num_episodes < args.num_episodes_during_inference: + # Compute an action (`a`). + a, state_out, _ = algo.compute_single_action( + observation=obs, + state=state, + prev_action=prev_a, + prev_reward=prev_r, + explore=args.explore_during_inference, + policy_id="default_policy", # <- default value + ) + # Send the computed action `a` to the env. + obs, reward, done, truncated, info = env.step(a) + # Is the episode `done`? -> Reset. + if done: + obs, info = env.reset() + num_episodes += 1 + state = init_state + prev_a = init_prev_a + prev_r = init_prev_r + # Episode is still ongoing -> Continue. + else: + state = state_out + if init_prev_a is not None: + prev_a = a + if init_prev_r is not None: + prev_r = reward + + algo.stop() + + ray.shutdown() diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/quadx_waypoints.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/quadx_waypoints.py new file mode 100644 index 0000000000000000000000000000000000000000..bbd7082c92e0f289273c29abcb6f046880b94d65 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/quadx_waypoints.py @@ -0,0 +1,131 @@ +"""An example showing how to use PyFlyt gymnasium environment to train a UAV to +reach waypoints. + +For more infos about the PyFlyt gymnasium environment see the GitHub Repository: +https://github.com/jjshoots/PyFlyt/tree/master/PyFlyt + +This example + - Runs a single-agent `PyFlyt/QuadX-Waypoints-v1` experiment. + - Uses a gymnasium reward wrapper for reward scaling. + - Stops the experiment, if either `--stop-iters` (default is 200) or + `--stop-reward` (default is 90.0) is reached. + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack` + +Control the number of environments per `EnvRunner` via `--num-envs-per-env-runner`. +This will increase sampling speed. 
+ +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` which should allow you to set breakpoints +anywhere in the RLlib code and have the execution stop there for inspection +and debugging. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` +""" +import gymnasium as gym +import sys + +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.rllib.utils.metrics import ( + ENV_RUNNER_RESULTS, + EPISODE_RETURN_MEAN, + TRAINING_ITERATION_TIMER, +) +from ray.tune.registry import get_trainable_cls, register_env + +sys.setrecursionlimit(3000) + +parser = add_rllib_example_script_args( + default_iters=200, + default_timesteps=100000, + default_reward=90.0, +) +parser.add_argument( + "--run", type=str, default="PPO", help="The RLlib-registered algorithm to use." +) +parser.add_argument("--env-name", type=str, default="quadx_waypoints") +parser.add_argument("--num-envs-per-env-runner", type=int, default=4) + + +class RewardWrapper(gym.RewardWrapper): + def __init__(self, env): + super().__init__(env) + + def reward(self, reward): + # Scale rewards: + if reward >= 99.0 or reward <= -99.0: + return reward / 10 + return reward + + +def create_quadx_waypoints_env(env_config): + import PyFlyt.gym_envs # noqa + from PyFlyt.gym_envs import FlattenWaypointEnv + + env = gym.make("PyFlyt/QuadX-Waypoints-v1") + # Wrap Environment to use max 10 and -10 for rewards + env = RewardWrapper(env) + + return FlattenWaypointEnv(env, context_length=1) + + +if __name__ == "__main__": + args = parser.parse_args() + + # Register the environment with tune. + register_env(args.env_name, env_creator=create_quadx_waypoints_env) + + # Get the algorithm class to use for training. 
+ algo_cls = get_trainable_cls(args.run) + config = ( + algo_cls.get_default_config() + .environment(env=args.env_name) + .env_runners( + num_envs_per_env_runner=args.num_envs_per_env_runner, + ) + .reporting(min_time_s_per_iteration=0.1) + ) + + # If PPO set additional configurations. + if args.run == "PPO": + config.rl_module( + model_config={ + "fcnet_hiddens": [32], + "fcnet_activation": "linear", + "vf_share_layers": True, + } + ) + config.training( + minibatch_size=128, + train_batch_size_per_learner=10000, + ) + # If IMPALA set additional arguments. + elif args.run == "IMPALA": + config.env_runners(num_env_runners=2) + config.learners(num_gpus_per_learner=0) + config.training(vf_loss_coeff=0.01) + + # Set the stopping arguments. + EPISODE_RETURN_MEAN_KEY = f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}" + stop = { + TRAINING_ITERATION_TIMER: args.stop_iters, + EPISODE_RETURN_MEAN_KEY: args.stop_reward, + } + + # Run the experiment. + run_rllib_example_script_experiment( + config, + args, + stop=stop, + success_metric={ + f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": args.stop_reward, + }, + ) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/replay_buffer_api.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/replay_buffer_api.py new file mode 100644 index 0000000000000000000000000000000000000000..5d87a5ef5cd3bd0e7cf238ca23559c21780f28a8 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/replay_buffer_api.py @@ -0,0 +1,82 @@ +# @OldAPIStack + +# __sphinx_doc_replay_buffer_api_example_script_begin__ +"""Simple example of how to modify replay buffer behaviour. + +We modify DQN to utilize prioritized replay but supplying it with the +PrioritizedMultiAgentReplayBuffer instead of the standard MultiAgentReplayBuffer. +This is possible because DQN uses the DQN training iteration function, +which includes and a priority update, given that a fitting buffer is provided. 
+""" + +import argparse + +import ray +from ray import air, tune +from ray.air.constants import TRAINING_ITERATION +from ray.rllib.algorithms.dqn import DQNConfig +from ray.rllib.utils.framework import try_import_tf +from ray.rllib.utils.metrics import NUM_ENV_STEPS_SAMPLED_LIFETIME +from ray.rllib.utils.replay_buffers.replay_buffer import StorageUnit + +tf1, tf, tfv = try_import_tf() + +parser = argparse.ArgumentParser() + +parser.add_argument("--num-cpus", type=int, default=0) +parser.add_argument( + "--framework", + choices=["tf", "tf2", "torch"], + default="torch", + help="The DL framework specifier.", +) +parser.add_argument( + "--stop-iters", type=int, default=50, help="Number of iterations to train." +) +parser.add_argument( + "--stop-timesteps", type=int, default=100000, help="Number of timesteps to train." +) + +if __name__ == "__main__": + args = parser.parse_args() + + ray.init(num_cpus=args.num_cpus or None) + + # This is where we add prioritized experiences replay + # The training iteration function that is used by DQN already includes a priority + # update step. + replay_buffer_config = { + "type": "MultiAgentPrioritizedReplayBuffer", + # Although not necessary, we can modify the default constructor args of + # the replay buffer here + "prioritized_replay_alpha": 0.5, + "storage_unit": StorageUnit.SEQUENCES, + "replay_burn_in": 20, + "zero_init_states": True, + } + + config = ( + DQNConfig() + .environment("CartPole-v1") + .framework(framework=args.framework) + .env_runners(num_env_runners=4) + .training( + model=dict(use_lstm=True, lstm_cell_size=64, max_seq_len=20), + replay_buffer_config=replay_buffer_config, + ) + ) + + stop_config = { + NUM_ENV_STEPS_SAMPLED_LIFETIME: args.stop_timesteps, + TRAINING_ITERATION: args.stop_iters, + } + + results = tune.Tuner( + config.algo_class, + param_space=config, + run_config=air.RunConfig(stop=stop_config), + ).fit() + + ray.shutdown() + +# __sphinx_doc_replay_buffer_api_example_script_end__