diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9b9fcf62061fcd127d82e39682683ec9eff3b441 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/__pycache__/centralized_critic.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/__pycache__/centralized_critic.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e2cbc69685b54de79d1648106ea6eaaa544a2e1b Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/__pycache__/centralized_critic.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/__pycache__/compute_adapted_gae_on_postprocess_trajectory.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/__pycache__/compute_adapted_gae_on_postprocess_trajectory.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f0aa8e2197aad66c093af34b193dacb2d2d5f885 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/__pycache__/compute_adapted_gae_on_postprocess_trajectory.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/__pycache__/quadx_waypoints.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/__pycache__/quadx_waypoints.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..04fb7a71026ce236f71b4d07e5b17ce775801870 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/__pycache__/quadx_waypoints.cpython-311.pyc differ diff --git 
a/.venv/lib/python3.11/site-packages/ray/rllib/examples/__pycache__/replay_buffer_api.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/__pycache__/replay_buffer_api.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7f0995f945759e04fb54548f80e15d86574143de Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/__pycache__/replay_buffer_api.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/policy/__pycache__/cliff_walking_wall_policy.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/policy/__pycache__/cliff_walking_wall_policy.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c0fdab5667db51eaf83de541baf54899cab36d68 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/policy/__pycache__/cliff_walking_wall_policy.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/actions/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/actions/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/actions/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/actions/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e868f479d3977ea062144e6f35bfc0cddc165716 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/actions/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/actions/__pycache__/autoregressive_actions.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/actions/__pycache__/autoregressive_actions.cpython-311.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..0f10bb65798fb819a14e6bd06c3e935db6480acc Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/actions/__pycache__/autoregressive_actions.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/actions/autoregressive_actions.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/actions/autoregressive_actions.py new file mode 100644 index 0000000000000000000000000000000000000000..abb9f21c3333e9ce75249104a1f18457d517cbfb --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/actions/autoregressive_actions.py @@ -0,0 +1,109 @@ +"""Example on how to define and run with an RLModule with a dependent action space. + +This examples: + - Shows how to write a custom RLModule outputting autoregressive actions. + The RLModule class used here implements a prior distribution for the first couple + of actions and then uses the sampled actions to compute the parameters for and + sample from a posterior distribution. + - Shows how to configure a PPO algorithm to use the custom RLModule. + - Stops the training after 100k steps or when the mean episode return + exceeds -0.012 in evaluation, i.e. if the agent has learned to + synchronize its actions. + +For details on the environment used, take a look at the `CorrelatedActionsEnv` +class. To receive an episode return over 100, the agent must learn how to synchronize +its actions. + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack --num-env-runners 2` + +Control the number of `EnvRunner`s with the `--num-env-runners` flag. This +will increase the sampling speed. + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. 
+ +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- +You should reach an episode return of better than -0.5 quickly through a simple PPO +policy. The logic behind beating the env is roughly: + +OBS: optimal a1: r1: optimal a2: r2: +-1 2 0 -1.0 0 +-0.5 1/2 -0.5 -0.5/-1.5 0 +0 1 0 -1.0 0 +0.5 0/1 -0.5 -0.5/-1.5 0 +1 0 0 -1.0 0 + +Meaning, most of the time, you would receive a reward better than -0.5, but worse than +0.0. + ++--------------------------------------+------------+--------+------------------+ +| Trial name | status | iter | total time (s) | +| | | | | +|--------------------------------------+------------+--------+------------------+ +| PPO_CorrelatedActionsEnv_6660d_00000 | TERMINATED | 76 | 132.438 | ++--------------------------------------+------------+--------+------------------+ ++------------------------+------------------------+------------------------+ +| episode_return_mean | num_env_steps_sample | ...env_steps_sampled | +| | d_lifetime | _lifetime_throughput | +|------------------------+------------------------+------------------------| +| -0.43 | 152000 | 1283.48 | ++------------------------+------------------------+------------------------+ +""" + +from ray.rllib.algorithms.ppo import PPOConfig +from ray.rllib.core.rl_module.rl_module import RLModuleSpec +from ray.rllib.examples.envs.classes.correlated_actions_env import CorrelatedActionsEnv +from ray.rllib.examples.rl_modules.classes.autoregressive_actions_rlm import ( + AutoregressiveActionsRLM, +) +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) + + +parser = add_rllib_example_script_args( + default_iters=1000, + default_timesteps=2000000, + default_reward=-0.45, +) +parser.set_defaults(enable_new_api_stack=True) + + +if __name__ == "__main__": + 
args = parser.parse_args() + + if args.algo != "PPO": + raise ValueError( + "This example script only runs with PPO! Set --algo=PPO on the command " + "line." + ) + + base_config = ( + PPOConfig() + .environment(CorrelatedActionsEnv) + .training( + train_batch_size_per_learner=2000, + num_epochs=12, + minibatch_size=256, + entropy_coeff=0.005, + lr=0.0003, + ) + # Specify the RLModule class to be used. + .rl_module( + rl_module_spec=RLModuleSpec(module_class=AutoregressiveActionsRLM), + ) + ) + + run_rllib_example_script_experiment(base_config, args) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/actions/nested_action_spaces.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/actions/nested_action_spaces.py new file mode 100644 index 0000000000000000000000000000000000000000..5d28b3f622fdd5753f1f5ab3ef669faedd9db962 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/actions/nested_action_spaces.py @@ -0,0 +1,86 @@ +from gymnasium.spaces import Dict, Tuple, Box, Discrete, MultiDiscrete + +from ray.tune.registry import register_env +from ray.rllib.connectors.env_to_module import FlattenObservations +from ray.rllib.examples.envs.classes.multi_agent import ( + MultiAgentNestedSpaceRepeatAfterMeEnv, +) +from ray.rllib.examples.envs.classes.nested_space_repeat_after_me_env import ( + NestedSpaceRepeatAfterMeEnv, +) +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.tune.registry import get_trainable_cls + + +# Read in common example script command line arguments. +parser = add_rllib_example_script_args(default_timesteps=200000, default_reward=-500.0) +parser.set_defaults(enable_new_api_stack=True) + + +if __name__ == "__main__": + args = parser.parse_args() + + # Define env-to-module-connector pipeline for the new stack. 
+ def _env_to_module_pipeline(env): + return FlattenObservations(multi_agent=args.num_agents > 0) + + # Register our environment with tune. + if args.num_agents > 0: + register_env( + "env", + lambda c: MultiAgentNestedSpaceRepeatAfterMeEnv( + config=dict(c, **{"num_agents": args.num_agents}) + ), + ) + else: + register_env("env", lambda c: NestedSpaceRepeatAfterMeEnv(c)) + + # Define the AlgorithmConfig used. + base_config = ( + get_trainable_cls(args.algo) + .get_default_config() + .environment( + "env", + env_config={ + "space": Dict( + { + "a": Tuple( + [Dict({"d": Box(-15.0, 3.0, ()), "e": Discrete(3)})] + ), + "b": Box(-10.0, 10.0, (2,)), + "c": MultiDiscrete([3, 3]), + "d": Discrete(2), + } + ), + "episode_len": 100, + }, + ) + .env_runners(env_to_module_connector=_env_to_module_pipeline) + # No history in Env (bandit problem). + .training( + gamma=0.0, + lr=0.0005, + ) + ) + + # Add a simple multi-agent setup. + if args.num_agents > 0: + base_config.multi_agent( + policies={f"p{i}" for i in range(args.num_agents)}, + policy_mapping_fn=lambda aid, *a, **kw: f"p{aid}", + ) + + # Fix some PPO-specific settings. + if args.algo == "PPO": + base_config.training( + # We don't want high entropy in this Env. + entropy_coeff=0.00005, + num_epochs=4, + vf_loss_coeff=0.01, + ) + + # Run everything as configured. 
+ run_rllib_example_script_experiment(base_config, args) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8da4317de4a24a77d9bab9d823d646824c56f58e Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/count_based_curiosity.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/count_based_curiosity.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1828ede5a40d36666bccd88d100d04fd43dc7880 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/count_based_curiosity.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/euclidian_distance_based_curiosity.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/euclidian_distance_based_curiosity.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..18274a8c64598fb13b5ea717fe320175d67a2cbd Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/euclidian_distance_based_curiosity.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/flatten_observations_dict_space.cpython-311.pyc 
b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/flatten_observations_dict_space.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bc418a1fe230ca2b8c2615f4bf2f08257a421920 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/flatten_observations_dict_space.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/frame_stacking.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/frame_stacking.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cbb39532e2a82c212a1880865a84773e737429cd Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/frame_stacking.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/mean_std_filtering.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/mean_std_filtering.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ea038ef644619c93d8169f5aee66c103e33b5dab Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/mean_std_filtering.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/prev_actions_prev_rewards.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/prev_actions_prev_rewards.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0997013693fead49f007582cd037cffd9cef1ca9 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/prev_actions_prev_rewards.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/__init__.py 
b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3d211228568beee9d82c837c3dfd4f03ef5d4c74 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/__pycache__/count_based_curiosity.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/__pycache__/count_based_curiosity.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..907fc442cd515432fffb4a963db5b4211fcf97a3 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/__pycache__/count_based_curiosity.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/__pycache__/euclidian_distance_based_curiosity.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/__pycache__/euclidian_distance_based_curiosity.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7b86bc505ff3f2e556fa588b872c6aa578ee3286 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/__pycache__/euclidian_distance_based_curiosity.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/__pycache__/protobuf_cartpole_observation_decoder.cpython-311.pyc 
b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/__pycache__/protobuf_cartpole_observation_decoder.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ae7f158afddd5b30e6ac2445575a1b18aa2e1704 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/__pycache__/protobuf_cartpole_observation_decoder.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/count_based_curiosity.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/count_based_curiosity.py new file mode 100644 index 0000000000000000000000000000000000000000..1f865e3a8ae8f0b91161ca41449b3aa97e98776d --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/count_based_curiosity.py @@ -0,0 +1,92 @@ +from collections import Counter +from typing import Any, List, Optional + +import gymnasium as gym + +from ray.rllib.connectors.connector_v2 import ConnectorV2 +from ray.rllib.core.rl_module.rl_module import RLModule +from ray.rllib.utils.typing import EpisodeType + + +class CountBasedCuriosity(ConnectorV2): + """Learner ConnectorV2 piece to compute intrinsic rewards based on obs counts. + + Add this connector piece to your Learner pipeline, through your algo config: + ``` + config.training( + learner_connector=lambda obs_sp, act_sp: CountBasedCuriosity() + ) + ``` + + Intrinsic rewards are computed on the Learner side based on naive observation + counts, which is why this connector should only be used for simple environments + with a reasonable number of possible observations. The intrinsic reward for a given + timestep is: + r(i) = intrinsic_reward_coeff * (1 / C(obs(i))) + where C is the total (lifetime) count of the obs at timestep i. + + The intrinsic reward is added to the extrinsic reward and saved back into the + episode (under the main "rewards" key). 
+ + Note that the computation and saving back to the episode all happens before the + actual train batch is generated from the episode data. Thus, the Learner and the + RLModule used do not take notice of the extra reward added. + + If you would like to use a more sophisticated mechanism for intrinsic reward + computations, take a look at the `EuclidianDistanceBasedCuriosity` connector piece + at `ray.rllib.examples.connectors.classes.euclidian_distance_based_curiosity` + """ + + def __init__( + self, + input_observation_space: Optional[gym.Space] = None, + input_action_space: Optional[gym.Space] = None, + *, + intrinsic_reward_coeff: float = 1.0, + **kwargs, + ): + """Initializes a CountBasedCuriosity instance. + + Args: + intrinsic_reward_coeff: The weight with which to multiply the intrinsic + reward before adding (and saving) it back to the main (extrinsic) + reward of the episode at each timestep. + """ + super().__init__(input_observation_space, input_action_space) + + # Naive observation counter. + self._counts = Counter() + self.intrinsic_reward_coeff = intrinsic_reward_coeff + + def __call__( + self, + *, + rl_module: RLModule, + batch: Any, + episodes: List[EpisodeType], + explore: Optional[bool] = None, + shared_data: Optional[dict] = None, + **kwargs, + ) -> Any: + # Loop through all episodes and change the reward to + # [reward + intrinsic reward] + for sa_episode in self.single_agent_episode_iterator( + episodes=episodes, agents_that_stepped_only=False + ): + # Loop through all obs, except the last one. + observations = sa_episode.get_observations(slice(None, -1)) + # Get all respective (extrinsic) rewards. + rewards = sa_episode.get_rewards() + + for i, (obs, rew) in enumerate(zip(observations, rewards)): + obs = tuple(obs) + # Add 1 to obs counter. + self._counts[obs] += 1 + # Compute our count-based intrinsic reward and add it to the main + # (extrinsic) reward. 
+ rew += self.intrinsic_reward_coeff * (1 / self._counts[obs]) + # Store the new reward back to the episode (under the correct + # timestep/index). + sa_episode.set_rewards(new_data=rew, at_indices=i) + + return batch diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/euclidian_distance_based_curiosity.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/euclidian_distance_based_curiosity.py new file mode 100644 index 0000000000000000000000000000000000000000..c50a2caae5d744e40a95f1c78608ca0f99050398 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/euclidian_distance_based_curiosity.py @@ -0,0 +1,122 @@ +from collections import deque +from typing import Any, List, Optional + +import gymnasium as gym +import numpy as np + +from ray.rllib.connectors.connector_v2 import ConnectorV2 +from ray.rllib.core.rl_module.rl_module import RLModule +from ray.rllib.utils.typing import EpisodeType + + +class EuclidianDistanceBasedCuriosity(ConnectorV2): + """Learner ConnectorV2 piece computing intrinsic rewards with euclidian distance. + + Add this connector piece to your Learner pipeline, through your algo config: + ``` + config.training( + learner_connector=lambda obs_sp, act_sp: EuclidianDistanceBasedCuriosity() + ) + ``` + + Intrinsic rewards are computed on the Learner side based on comparing the euclidian + distance of observations vs already seen ones. A configurable number of observations + will be stored in a FIFO buffer and all incoming observations have their distance + measured against those. + + The minimum distance measured is the intrinsic reward for the incoming obs + (multiplied by a fixed coeffieicnt and added to the "main" extrinsic reward): + r(i) = intrinsic_reward_coeff * min(ED(o, o(i)) for o in stored_obs)) + where `ED` is the euclidian distance and `stored_obs` is the buffer. 
+ + The intrinsic reward is then added to the extrinsic reward and saved back into the + episode (under the main "rewards" key). + + Note that the computation and saving back to the episode all happens before the + actual train batch is generated from the episode data. Thus, the Learner and the + RLModule used do not take notice of the extra reward added. + + Only one observation per incoming episode will be stored as a new one in the buffer. + Thereby, we pick the observation with the largest `min(ED)` value over all already + stored observations to be stored per episode. + + If you would like to use a simpler, count-based mechanism for intrinsic reward + computations, take a look at the `CountBasedCuriosity` connector piece + at `ray.rllib.examples.connectors.classes.count_based_curiosity` + """ + + def __init__( + self, + input_observation_space: Optional[gym.Space] = None, + input_action_space: Optional[gym.Space] = None, + *, + intrinsic_reward_coeff: float = 1.0, + max_buffer_size: int = 100, + **kwargs, + ): + """Initializes a CountBasedCuriosity instance. + + Args: + intrinsic_reward_coeff: The weight with which to multiply the intrinsic + reward before adding (and saving) it back to the main (extrinsic) + reward of the episode at each timestep. + """ + super().__init__(input_observation_space, input_action_space) + + # Create an observation buffer + self.obs_buffer = deque(maxlen=max_buffer_size) + self.intrinsic_reward_coeff = intrinsic_reward_coeff + + self._test = 0 + + def __call__( + self, + *, + rl_module: RLModule, + batch: Any, + episodes: List[EpisodeType], + explore: Optional[bool] = None, + shared_data: Optional[dict] = None, + **kwargs, + ) -> Any: + if self._test > 10: + return batch + self._test += 1 + # Loop through all episodes and change the reward to + # [reward + intrinsic reward] + for sa_episode in self.single_agent_episode_iterator( + episodes=episodes, agents_that_stepped_only=False + ): + # Loop through all obs, except the last one. 
+ observations = sa_episode.get_observations(slice(None, -1)) + # Get all respective (extrinsic) rewards. + rewards = sa_episode.get_rewards() + + max_dist_obs = None + max_dist = float("-inf") + for i, (obs, rew) in enumerate(zip(observations, rewards)): + # Compare obs to all stored observations and compute euclidian distance. + min_dist = 0.0 + if self.obs_buffer: + min_dist = min( + np.sqrt(np.sum((obs - stored_obs) ** 2)) + for stored_obs in self.obs_buffer + ) + if min_dist > max_dist: + max_dist = min_dist + max_dist_obs = obs + + # Compute our euclidian distance-based intrinsic reward and add it to + # the main (extrinsic) reward. + rew += self.intrinsic_reward_coeff * min_dist + # Store the new reward back to the episode (under the correct + # timestep/index). + sa_episode.set_rewards(new_data=rew, at_indices=i) + + # Add the one observation of this episode with the largest (min) euclidian + # dist to all already stored obs to the buffer (maybe throwing out the + # oldest obs in there). 
+ if max_dist_obs is not None: + self.obs_buffer.append(max_dist_obs) + + return batch diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/protobuf_cartpole_observation_decoder.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/protobuf_cartpole_observation_decoder.py new file mode 100644 index 0000000000000000000000000000000000000000..2ed4a891afcd4e64cb50148446ecca5cc1e1e06c --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/protobuf_cartpole_observation_decoder.py @@ -0,0 +1,80 @@ +from typing import Any, List, Optional + +import gymnasium as gym +import numpy as np + +from ray.rllib.connectors.connector_v2 import ConnectorV2 +from ray.rllib.core.rl_module.rl_module import RLModule +from ray.rllib.examples.envs.classes.utils.cartpole_observations_proto import ( + CartPoleObservation, +) +from ray.rllib.utils.annotations import override +from ray.rllib.utils.typing import EpisodeType + + +class ProtobufCartPoleObservationDecoder(ConnectorV2): + """Env-to-module ConnectorV2 piece decoding protobuf obs into CartPole-v1 obs. + + Add this connector piece to your env-to-module pipeline, through your algo config: + ``` + config.env_runners( + env_to_module_connector=lambda env: ProtobufCartPoleObservationDecoder() + ) + ``` + + The incoming observation space must be a 1D Box of dtype uint8 + (which is the same as a binary string). The outgoing observation space is the + normal CartPole-v1 1D space: Box(-inf, inf, (4,), float32). + """ + + @override(ConnectorV2) + def recompute_output_observation_space( + self, + input_observation_space: gym.Space, + input_action_space: gym.Space, + ) -> gym.Space: + # Make sure the incoming observation space is a protobuf (binary string). 
+ assert ( + isinstance(input_observation_space, gym.spaces.Box) + and len(input_observation_space.shape) == 1 + and input_observation_space.dtype.name == "uint8" + ) + # Return CartPole-v1's natural observation space. + return gym.spaces.Box(float("-inf"), float("inf"), (4,), np.float32) + + def __call__( + self, + *, + rl_module: RLModule, + batch: Any, + episodes: List[EpisodeType], + explore: Optional[bool] = None, + shared_data: Optional[dict] = None, + **kwargs, + ) -> Any: + # Loop through all episodes and change the observation from a binary string + # to an actual 1D np.ndarray (normal CartPole-v1 obs). + for sa_episode in self.single_agent_episode_iterator(episodes=episodes): + # Get last obs (binary string). + obs = sa_episode.get_observations(-1) + obs_bytes = obs.tobytes() + obs_protobuf = CartPoleObservation() + obs_protobuf.ParseFromString(obs_bytes) + + # Set up the natural CartPole-v1 observation tensor from the protobuf + # values. + new_obs = np.array( + [ + obs_protobuf.x_pos, + obs_protobuf.x_veloc, + obs_protobuf.angle_pos, + obs_protobuf.angle_veloc, + ], + np.float32, + ) + + # Write the new observation (1D tensor) back into the Episode. + sa_episode.set_observations(new_data=new_obs, at_indices=-1) + + # Return `data` as-is. + return batch diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/count_based_curiosity.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/count_based_curiosity.py new file mode 100644 index 0000000000000000000000000000000000000000..ad09e4ceb6bf2665d0766ebe218d071ca2ada19f --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/count_based_curiosity.py @@ -0,0 +1,14 @@ +"""Placeholder for training with count-based curiosity. + +The actual script can be found at a different location (see code below). 
+""" + +if __name__ == "__main__": + import subprocess + import sys + + # Forward to "python ../curiosity/[same script name].py [same options]" + command = [sys.executable, "../curiosity/", sys.argv[0]] + sys.argv[1:] + + # Run the script. + subprocess.run(command, capture_output=True) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/euclidian_distance_based_curiosity.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/euclidian_distance_based_curiosity.py new file mode 100644 index 0000000000000000000000000000000000000000..6e52de76791304545eebcf1aea76fe93cb2c6f39 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/euclidian_distance_based_curiosity.py @@ -0,0 +1,14 @@ +"""Placeholder for training with euclidian distance-based curiosity. + +The actual script can be found at a different location (see code below). +""" + +if __name__ == "__main__": + import subprocess + import sys + + # Forward to "python ../curiosity/[same script name].py [same options]" + command = [sys.executable, "../curiosity/", sys.argv[0]] + sys.argv[1:] + + # Run the script. + subprocess.run(command, capture_output=True) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/flatten_observations_dict_space.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/flatten_observations_dict_space.py new file mode 100644 index 0000000000000000000000000000000000000000..564df75c6b9d76cd86e260556609140a6adf47d0 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/flatten_observations_dict_space.py @@ -0,0 +1,154 @@ +"""Example using a ConnectorV2 to flatten arbitrarily nested dict or tuple observations. + +An RLlib Algorithm has 3 distinct connector pipelines: +- An env-to-module pipeline in an EnvRunner accepting a list of episodes and producing +a batch for an RLModule to compute actions (`forward_inference()` or +`forward_exploration()`). 
+- A module-to-env pipeline in an EnvRunner taking the RLModule's output and converting +it into an action readable by the environment. +- A learner connector pipeline on a Learner taking a list of episodes and producing +a batch for an RLModule to perform the training forward pass (`forward_train()`). + +Each of these pipelines has a fixed set of default ConnectorV2 pieces that RLlib +adds/prepends to these pipelines in order to perform the most basic functionalities. +For example, RLlib adds the `AddObservationsFromEpisodesToBatch` ConnectorV2 into any +env-to-module pipeline to make sure the batch for computing actions contains - at the +minimum - the most recent observation. + +On top of these default ConnectorV2 pieces, users can define their own ConnectorV2 +pieces (or use the ones available already in RLlib) and add them to one of the 3 +different pipelines described above, as required. + +This example: + - shows how the `FlattenObservation` ConnectorV2 piece can be added to the + env-to-module pipeline. + - demonstrates that by using this connector, any arbitrarily nested dict or tuple + observations is properly flattened into a simple 1D tensor, for easier RLModule + processing. + - shows how - in a multi-agent setup - individual agents can be specified, whose + observations should be flattened (while other agents' observations will always + be left as-is). 
+ - uses a variant of the CartPole-v1 environment, in which the 4 observation items + (x-pos, x-veloc, angle, and angle-veloc) are taken apart and put into a nested dict + with the structure: + { + "x-pos": [x-pos], + "angular-pos": { + "value": [angle], + "some_random_stuff": [random Discrete(3)], # <- should be ignored by algo + }, + "velocs": Tuple([x-veloc], [angle-veloc]), + } + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack` + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- + ++---------------------+------------+----------------+--------+------------------+ +| Trial name | status | loc | iter | total time (s) | +| | | | | | +|---------------------+------------+----------------+--------+------------------+ +| PPO_env_a2fd6_00000 | TERMINATED | 127.0.0.1:7409 | 25 | 24.1426 | ++---------------------+------------+----------------+--------+------------------+ ++------------------------+------------------------+------------------------+ +| num_env_steps_sample | num_env_steps_traine | episode_return_mean | +| d_lifetime | d_lifetime | | ++------------------------+------------------------+------------------------| +| 100000 | 100000 | 421.42 | ++------------------------+------------------------+------------------------+ +""" +from ray.tune.registry import register_env +from ray.rllib.connectors.env_to_module import FlattenObservations +from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig +from ray.rllib.examples.envs.classes.cartpole_with_dict_observation_space import ( + 
CartPoleWithDictObservationSpace, +) +from ray.rllib.examples.envs.classes.multi_agent import ( + MultiAgentCartPoleWithDictObservationSpace, +) +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.tune.registry import get_trainable_cls + + +# Read in common example script command line arguments. +parser = add_rllib_example_script_args(default_timesteps=200000, default_reward=400.0) +parser.set_defaults(enable_new_api_stack=True) + + +if __name__ == "__main__": + args = parser.parse_args() + + # Define env-to-module-connector pipeline for the new stack. + def _env_to_module_pipeline(env): + return FlattenObservations(multi_agent=args.num_agents > 0) + + # Register our environment with tune. + if args.num_agents > 0: + register_env( + "env", + lambda _: MultiAgentCartPoleWithDictObservationSpace( + config={"num_agents": args.num_agents} + ), + ) + else: + register_env("env", lambda _: CartPoleWithDictObservationSpace()) + + # Define the AlgorithmConfig used. + base_config = ( + get_trainable_cls(args.algo) + .get_default_config() + .environment("env") + .env_runners(env_to_module_connector=_env_to_module_pipeline) + .training( + gamma=0.99, + lr=0.0003, + ) + .rl_module( + model_config=DefaultModelConfig( + fcnet_hiddens=[32], + fcnet_activation="linear", + vf_share_layers=True, + ), + ) + ) + + # Add a simple multi-agent setup. + if args.num_agents > 0: + base_config.multi_agent( + policies={f"p{i}" for i in range(args.num_agents)}, + policy_mapping_fn=lambda aid, *a, **kw: f"p{aid}", + ) + + # PPO-specific settings (for better learning behavior only). + if args.algo == "PPO": + base_config.training( + num_epochs=6, + vf_loss_coeff=0.01, + ) + # IMPALA-specific settings (for better learning behavior only). + elif args.algo == "IMPALA": + base_config.training( + lr=0.0005, + vf_loss_coeff=0.05, + entropy_coeff=0.0, + ) + + # Run everything as configured. 
+ run_rllib_example_script_experiment(base_config, args) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/frame_stacking.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/frame_stacking.py new file mode 100644 index 0000000000000000000000000000000000000000..a22868c374cfa6cca835c0efd636f38d21f85c87 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/frame_stacking.py @@ -0,0 +1,228 @@ +"""Example using 2 ConnectorV2 for observation frame-stacking in Atari environments. + +An RLlib Algorithm has 3 distinct connector pipelines: +- An env-to-module pipeline in an EnvRunner accepting a list of episodes and producing +a batch for an RLModule to compute actions (`forward_inference()` or +`forward_exploration()`). +- A module-to-env pipeline in an EnvRunner taking the RLModule's output and converting +it into an action readable by the environment. +- A learner connector pipeline on a Learner taking a list of episodes and producing +a batch for an RLModule to perform the training forward pass (`forward_train()`). + +Each of these pipelines has a fixed set of default ConnectorV2 pieces that RLlib +adds/prepends to these pipelines in order to perform the most basic functionalities. +For example, RLlib adds the `AddObservationsFromEpisodesToBatch` ConnectorV2 into any +env-to-module pipeline to make sure the batch for computing actions contains - at the +minimum - the most recent observation. + +On top of these default ConnectorV2 pieces, users can define their own ConnectorV2 +pieces (or use the ones available already in RLlib) and add them to one of the 3 +different pipelines described above, as required. + +This example: + - shows how the `FrameStackingEnvToModule` ConnectorV2 piece can be added to the + env-to-module pipeline. + - shows how the `FrameStackingLearner` ConnectorV2 piece can be added to the + learner connector pipeline. 
+ - demonstrates that using these two pieces (rather than performing framestacking + already inside the environment using a gymnasium wrapper) increases overall + performance by about 5%. + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack --num-frames=4 --env=ALE/Pong-v5` + +Use the `--num-frames` option to define the number of observations to framestack. +If you don't want to use Connectors to perform the framestacking, set the +`--use-gym-wrapper-framestacking` flag to perform framestacking already inside a +gymnasium observation wrapper. In this case though, be aware that the tensors being +sent through the network are `--num-frames` x larger than if you use the Connector +setup. + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- + +With `--num-frames=4` and using the two extra ConnectorV2 pieces (in the env-to-module +and learner connector pipelines), you should see something like this using: +`--env ALE/Pong-v5 --num-learners=4 --num-gpus-per-learner=1 --num-env-runners=95` ++---------------------------+------------+--------+------------------+... +| Trial name | status | iter | total time (s) | +| | | | | +|---------------------------+------------+--------+------------------+... +| PPO_atari-env_2fc4a_00000 | TERMINATED | 200 | 335.837 | ++---------------------------+------------+--------+------------------+... 
+ +Note that the time to run these 200 iterations is about ~5% faster than when +performing framestacking already inside the environment (using a +`gymnasium.wrappers.ObservationWrapper`), due to the additional network traffic +needed (sending back 4x[obs] batches instead of 1x[obs] to the learners). + +Thus, with the `--use-gym-wrapper-framestacking` option (all other options being equal), +the output looks like this: ++---------------------------+------------+--------+------------------+... +| Trial name | status | iter | total time (s) | +| | | | | +|---------------------------+------------+--------+------------------+... +| PPO_atari-env_2fc4a_00000 | TERMINATED | 200 | 351.505 | ++---------------------------+------------+--------+------------------+... +""" +import gymnasium as gym + +from ray.rllib.connectors.env_to_module.frame_stacking import FrameStackingEnvToModule +from ray.rllib.connectors.learner.frame_stacking import FrameStackingLearner +from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig +from ray.rllib.env.wrappers.atari_wrappers import wrap_atari_for_new_api_stack +from ray.rllib.examples.envs.classes.multi_agent import make_multi_agent +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.tune.registry import get_trainable_cls + +# Read in common example script command line arguments. +parser = add_rllib_example_script_args( + default_timesteps=5000000, default_reward=20.0, default_iters=200 +) +# Use Pong by default. 
+parser.set_defaults( + enable_new_api_stack=True, + env="ale_py:ALE/Pong-v5", +) +parser.add_argument( + "--num-frames", + type=int, + default=4, + help="The number of observation frames to stack.", +) +parser.add_argument( + "--use-gym-wrapper-framestacking", + action="store_true", + help="Whether to use RLlib's Atari wrapper's framestacking capabilities (as " + "opposed to doing it via a specific ConenctorV2 pipeline).", +) + + +if __name__ == "__main__": + from ray import tune + + args = parser.parse_args() + + # Define our custom connector pipelines. + def _make_env_to_module_connector(env): + # Create the env-to-module connector. We return an individual connector piece + # here, which RLlib automatically integrates into a pipeline (and + # add its default connector piece to the end of that pipeline). + # The default pipeline automatically fixes the input- and output spaces of the + # individual connector pieces in it. + # Note that since the frame stacking connector does NOT write information + # back to the episode (in order to save memory and network traffic), we + # also need to perform the same procedure on the Learner end (see below + # where we set up the Learner pipeline). + return FrameStackingEnvToModule( + num_frames=args.num_frames, + multi_agent=args.num_agents > 0, + ) + + def _make_learner_connector(input_observation_space, input_action_space): + # Create the learner connector. + return FrameStackingLearner( + num_frames=args.num_frames, + multi_agent=args.num_agents > 0, + ) + + # Create a custom Atari setup (w/o the usual RLlib-hard-coded framestacking in it). + # We would like our frame stacking connector to do this job. + def _env_creator(cfg): + return wrap_atari_for_new_api_stack( + gym.make(args.env, **cfg, **{"render_mode": "rgb_array"}), + # Perform framestacking either through ConnectorV2 or right here through + # the observation wrapper. 
+ framestack=( + args.num_frames if args.use_gym_wrapper_framestacking else None + ), + ) + + if args.num_agents > 0: + tune.register_env( + "atari-env", + lambda cfg: make_multi_agent(_env_creator)( + dict(cfg, **{"num_agents": args.num_agents}) + ), + ) + else: + tune.register_env("atari-env", _env_creator) + + base_config = ( + get_trainable_cls(args.algo) + .get_default_config() + .environment( + "atari-env", + env_config={ + # Make analogous to old v4 + NoFrameskip. + "frameskip": 1, + "full_action_space": False, + "repeat_action_probability": 0.0, + }, + clip_rewards=True, + ) + .env_runners( + # ... new EnvRunner and our frame stacking env-to-module connector. + env_to_module_connector=( + None + if args.use_gym_wrapper_framestacking + else _make_env_to_module_connector + ), + num_envs_per_env_runner=1 if args.num_agents > 0 else 2, + ) + .training( + # Use our frame stacking learner connector. + learner_connector=( + None if args.use_gym_wrapper_framestacking else _make_learner_connector + ), + entropy_coeff=0.01, + # Linearly adjust learning rate based on number of GPUs. + lr=0.00015 * (args.num_learners or 1), + grad_clip=100.0, + grad_clip_by="global_norm", + ) + .rl_module( + model_config=DefaultModelConfig( + vf_share_layers=True, + conv_filters=[(16, 4, 2), (32, 4, 2), (64, 4, 2), (128, 4, 2)], + conv_activation="relu", + head_fcnet_hiddens=[256], + ), + ) + ) + + # PPO specific settings. + if args.algo == "PPO": + base_config.training( + num_epochs=10, + minibatch_size=64, + lambda_=0.95, + kl_coeff=0.5, + clip_param=0.1, + vf_clip_param=10.0, + ) + + # Add a simple multi-agent setup. + if args.num_agents > 0: + base_config.multi_agent( + policies={f"p{i}" for i in range(args.num_agents)}, + policy_mapping_fn=lambda aid, *a, **kw: f"p{aid}", + ) + + # Run everything as configured. 
+ run_rllib_example_script_experiment(base_config, args) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/mean_std_filtering.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/mean_std_filtering.py new file mode 100644 index 0000000000000000000000000000000000000000..aaccbf02cddbbd88eb3b341edcfc3d0777744b09 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/mean_std_filtering.py @@ -0,0 +1,198 @@ +"""Example using a ConnectorV2 for processing observations with a mean/std filter. + +An RLlib Algorithm has 3 distinct connector pipelines: +- An env-to-module pipeline in an EnvRunner accepting a list of episodes and producing +a batch for an RLModule to compute actions (`forward_inference()` or +`forward_exploration()`). +- A module-to-env pipeline in an EnvRunner taking the RLModule's output and converting +it into an action readable by the environment. +- A learner connector pipeline on a Learner taking a list of episodes and producing +a batch for an RLModule to perform the training forward pass (`forward_train()`). + +Each of these pipelines has a fixed set of default ConnectorV2 pieces that RLlib +adds/prepends to these pipelines in order to perform the most basic functionalities. +For example, RLlib adds the `AddObservationsFromEpisodesToBatch` ConnectorV2 into any +env-to-module pipeline to make sure the batch for computing actions contains - at the +minimum - the most recent observation. + +On top of these default ConnectorV2 pieces, users can define their own ConnectorV2 +pieces (or use the ones available already in RLlib) and add them to one of the 3 +different pipelines described above, as required. + +This example: + - shows how the `MeanStdFilter` ConnectorV2 piece can be added to the env-to-module + pipeline. 
+ - demonstrates that using such a filter enhances learning behavior (or even makes + if possible to learn overall) in some environments, especially those with lopsided + observation spaces, for example `Box(-3000, -1000, ...)`. + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack` + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- +Running this example with the mean-std filter results in the normally expected Pendulum +learning behavior: ++-------------------------------+------------+-----------------+--------+ +| Trial name | status | loc | iter | +| | | | | +|-------------------------------+------------+-----------------+--------+ +| PPO_lopsided-pend_f9c96_00000 | TERMINATED | 127.0.0.1:43612 | 77 | ++-------------------------------+------------+-----------------+--------+ ++------------------+------------------------+-----------------------+ +| total time (s) | num_env_steps_sample | episode_return_mean | +| | d_lifetime | | +|------------------+------------------------+-----------------------| +| 30.7466 | 40040 | -276.3 | ++------------------+------------------------+-----------------------+ + +If you try using the `--disable-mean-std-filter` (all other things being equal), you +will either see no learning progress at all (or a very slow one), but more likely some +numerical instability related error will be thrown: + +ValueError: Expected parameter loc (Tensor of shape (64, 1)) of distribution + Normal(loc: torch.Size([64, 1]), scale: torch.Size([64, 1])) to satisfy the + constraint Real(), but 
found invalid values: +tensor([[nan], + [nan], + [nan], + ... +""" +import gymnasium as gym +import numpy as np + +from ray.rllib.connectors.env_to_module.mean_std_filter import MeanStdFilter +from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig +from ray.rllib.examples.envs.classes.multi_agent import MultiAgentPendulum +from ray.rllib.utils.framework import try_import_torch +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.tune.registry import get_trainable_cls, register_env + +torch, _ = try_import_torch() + +parser = add_rllib_example_script_args( + default_iters=500, + default_timesteps=500000, + default_reward=-300.0, +) +parser.add_argument( + "--disable-mean-std-filter", + action="store_true", + help="Run w/o a mean/std env-to-module connector piece (filter).", +) + + +class LopsidedObs(gym.ObservationWrapper): + def __init__(self, env): + super().__init__(env) + self.observation_space = gym.spaces.Box(-4000.0, -1456.0, (3,), np.float32) + + def observation(self, observation): + # Lopside [-1.0, 1.0] Pendulum observations + return ((observation + 1.0) / 2.0) * (4000.0 - 1456.0) - 4000.0 + + +if __name__ == "__main__": + args = parser.parse_args() + + assert ( + args.enable_new_api_stack + ), "Must set --enable-new-api-stack when running this script!" + + # Register our environment with tune. + if args.num_agents > 0: + register_env( + "lopsided-pend", + lambda _: MultiAgentPendulum(config={"num_agents": args.num_agents}), + ) + else: + register_env("lopsided-pend", lambda _: LopsidedObs(gym.make("Pendulum-v1"))) + + base_config = ( + get_trainable_cls(args.algo) + .get_default_config() + .environment("lopsided-pend") + .env_runners( + # TODO (sven): MAEnvRunner does not support vectorized envs yet + # due to gym's env checkers and non-compatability with RLlib's + # MultiAgentEnv API. 
+ num_envs_per_env_runner=1 if args.num_agents > 0 else 20, + # Define a single connector piece to be prepended to the env-to-module + # connector pipeline. + # Alternatively, return a list of n ConnectorV2 pieces (which will then be + # included in an automatically generated EnvToModulePipeline or return a + # EnvToModulePipeline directly. + env_to_module_connector=( + None + if args.disable_mean_std_filter + else lambda env: MeanStdFilter(multi_agent=args.num_agents > 0) + ), + ) + .training( + train_batch_size_per_learner=512, + gamma=0.95, + # Linearly adjust learning rate based on number of GPUs. + lr=0.0003 * (args.num_learners or 1), + vf_loss_coeff=0.01, + ) + .rl_module( + model_config=DefaultModelConfig( + fcnet_activation="relu", + fcnet_kernel_initializer=torch.nn.init.xavier_uniform_, + fcnet_bias_initializer=torch.nn.init.constant_, + fcnet_bias_initializer_kwargs={"val": 0.0}, + ), + ) + # In case you would like to run with a evaluation EnvRunners, make sure your + # `evaluation_config` key contains the `use_worker_filter_stats=False` setting + # (see below). This setting makes sure that the mean/std stats collected by the + # evaluation EnvRunners are NOT used for the training EnvRunners (unless you + # really want to mix these stats). It's normally a good idea to keep the stats + # collected during evaluation completely out of the training data (already for + # better reproducibility alone). + # .evaluation( + # evaluation_num_env_runners=1, + # evaluation_interval=1, + # evaluation_config={ + # "explore": False, + # # Do NOT use the eval EnvRunners' ConnectorV2 states. Instead, before + # # each round of evaluation, broadcast the latest training + # # EnvRunnerGroup's ConnectorV2 states (merged from all training remote + # # EnvRunners) to the eval EnvRunnerGroup (and discard the eval + # # EnvRunners' stats). + # "use_worker_filter_stats": False, + # }, + # ) + ) + + # PPO specific settings. 
+ if args.algo == "PPO": + base_config.training( + minibatch_size=64, + lambda_=0.1, + vf_clip_param=10.0, + ) + + # Add a simple multi-agent setup. + if args.num_agents > 0: + base_config.multi_agent( + policies={f"p{i}" for i in range(args.num_agents)}, + policy_mapping_fn=lambda aid, *a, **kw: f"p{aid}", + ) + + run_rllib_example_script_experiment(base_config, args) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/prev_actions_prev_rewards.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/prev_actions_prev_rewards.py new file mode 100644 index 0000000000000000000000000000000000000000..1fa1e6681b90dfcad83b3f765599d8dd4f724ace --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/prev_actions_prev_rewards.py @@ -0,0 +1,164 @@ +"""Example using a ConnectorV2 to add previous rewards/actions to an RLModule's input. + +An RLlib Algorithm has 3 distinct connector pipelines: +- An env-to-module pipeline in an EnvRunner accepting a list of episodes and producing +a batch for an RLModule to compute actions (`forward_inference()` or +`forward_exploration()`). +- A module-to-env pipeline in an EnvRunner taking the RLModule's output and converting +it into an action readable by the environment. +- A learner connector pipeline on a Learner taking a list of episodes and producing +a batch for an RLModule to perform the training forward pass (`forward_train()`). + +Each of these pipelines has a fixed set of default ConnectorV2 pieces that RLlib +adds/prepends to these pipelines in order to perform the most basic functionalities. +For example, RLlib adds the `AddObservationsFromEpisodesToBatch` ConnectorV2 into any +env-to-module pipeline to make sure the batch for computing actions contains - at the +minimum - the most recent observation. 
+ +On top of these default ConnectorV2 pieces, users can define their own ConnectorV2 +pieces (or use the ones available already in RLlib) and add them to one of the 3 +different pipelines described above, as required. + +This example: + - shows how the `PrevActionsPrevRewards` ConnectorV2 piece can be added to the + env-to-module pipeline to extract previous rewards and/or actions from the ongoing + episodes. + - shows how this connector creates and wraps this new information (rewards and + actions) together with the original observations into the RLModule's input dict + under a new `gym.spaces.Dict` structure (for example, if your observation space + is `O=Box(shape=(3,))` and you add the most recent 1 reward, the new observation + space will be `Dict({"_original_obs": O, "prev_n_rewards": Box(shape=())})`. + - demonstrates how to use RLlib's `FlattenObservations` right after the + `PrevActionsPrevRewards` to flatten that new dict observation structure again into + a single 1D tensor. + - uses the StatelessCartPole environment, a CartPole-v1 derivative that's missing + both x-veloc and angle-veloc observation components and is therefore non-Markovian + (only partially observable). An LSTM default model is used for training. Adding + the additional context to the observations (for example, prev. actions) helps the + LSTM to more quickly learn in this environment. + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack --num-frames=4 --env=ALE/Pong-v5` + +Use the `--num-frames` option to define the number of observations to framestack. +If you don't want to use Connectors to perform the framestacking, set the +`--use-gym-wrapper-framestacking` flag to perform framestacking already inside a +gymnasium observation wrapper. In this case though, be aware that the tensors being +sent through the network are `--num-frames` x larger than if you use the Connector +setup. 
+ +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- + +You should see something similar to this in your terminal output when running +ths script as described above: + ++---------------------+------------+-----------------+--------+------------------+ +| Trial name | status | loc | iter | total time (s) | +| | | | | | +|---------------------+------------+-----------------+--------+------------------+ +| PPO_env_0edd2_00000 | TERMINATED | 127.0.0.1:12632 | 17 | 42.6898 | ++---------------------+------------+-----------------+--------+------------------+ ++------------------------+------------------------+------------------------+ +| num_env_steps_sample | num_env_steps_traine | episode_return_mean | +| d_lifetime | d_lifetime | | +|------------------------+------------------------+------------------------| +| 68000 | 68000 | 205.22 | ++------------------------+------------------------+------------------------+ +""" +from ray.rllib.algorithms.ppo import PPOConfig +from ray.rllib.connectors.env_to_module import ( + FlattenObservations, + PrevActionsPrevRewards, +) +from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig +from ray.rllib.examples.envs.classes.stateless_cartpole import StatelessCartPole +from ray.rllib.examples.envs.classes.multi_agent import MultiAgentStatelessCartPole +from ray.rllib.utils.framework import try_import_torch +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.tune import register_env + +torch, nn = try_import_torch() + + +parser = 
add_rllib_example_script_args( + default_reward=200.0, default_timesteps=1000000, default_iters=2000 +) +parser.set_defaults(enable_new_api_stack=True) +parser.add_argument("--n-prev-rewards", type=int, default=1) +parser.add_argument("--n-prev-actions", type=int, default=1) + + +if __name__ == "__main__": + args = parser.parse_args() + + # Define our custom connector pipelines. + def _env_to_module(env): + # Create the env-to-module connector pipeline. + return [ + PrevActionsPrevRewards( + multi_agent=args.num_agents > 0, + n_prev_rewards=args.n_prev_rewards, + n_prev_actions=args.n_prev_actions, + ), + FlattenObservations(multi_agent=args.num_agents > 0), + ] + + # Register our environment with tune. + if args.num_agents > 0: + register_env( + "env", + lambda _: MultiAgentStatelessCartPole( + config={"num_agents": args.num_agents} + ), + ) + else: + register_env("env", lambda _: StatelessCartPole()) + + config = ( + PPOConfig() + .environment("env") + .env_runners(env_to_module_connector=_env_to_module) + .training( + num_epochs=6, + lr=0.0003, + train_batch_size=4000, + vf_loss_coeff=0.01, + ) + .rl_module( + model_config=DefaultModelConfig( + use_lstm=True, + max_seq_len=20, + fcnet_hiddens=[32], + fcnet_activation="linear", + fcnet_kernel_initializer=nn.init.xavier_uniform_, + fcnet_bias_initializer=nn.init.constant_, + fcnet_bias_initializer_kwargs={"val": 0.0}, + vf_share_layers=True, + ), + ) + ) + + # Add a simple multi-agent setup. 
+ if args.num_agents > 0: + config = config.multi_agent( + policies={f"p{i}" for i in range(args.num_agents)}, + policy_mapping_fn=lambda aid, *a, **kw: f"p{aid}", + ) + + run_rllib_example_script_experiment(config, args) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/evaluation/custom_evaluation.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/evaluation/custom_evaluation.py new file mode 100644 index 0000000000000000000000000000000000000000..9a2a796a08a3778d56c4b4dad7a9b08154abc896 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/evaluation/custom_evaluation.py @@ -0,0 +1,230 @@ +"""Example of customizing the evaluation procedure for an RLlib Algorithm. + +Note, that you should only choose to provide a custom eval function, in case the already +built-in eval options are not sufficient. Normally, though, RLlib's eval utilities +that come with each Algorithm are enough to properly evaluate the learning progress +of your Algorithm. + +This script uses the SimpleCorridor environment, a simple 1D gridworld, in which +the agent can only walk left (action=0) or right (action=1). The goal state is located +at the end of the (1D) corridor. The env exposes an API to change the length of the +corridor on-the-fly. We use this API here to extend the size of the corridor for the +evaluation runs. + +For demonstration purposes only, we define a simple custom evaluation method that does +the following: +- It changes the corridor length of all environments used on the evaluation EnvRunners. +- It runs a defined number of episodes for evaluation purposes. +- It collects the metrics from those runs, summarizes these metrics and returns them. + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack + +You can switch off custom evaluation (and use RLlib's default evaluation procedure) +with the `--no-custom-eval` flag. 
+ +You can switch on parallel evaluation to training using the +`--evaluation-parallel-to-training` flag. See this example script here: +https://github.com/ray-project/ray/blob/master/rllib/examples/evaluation/evaluation_parallel_to_training.py # noqa +for more details on running evaluation parallel to training. + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- +You should see the following (or very similar) console output when running this script. +Note that for each iteration, due to the definition of our custom evaluation function, +we run 3 evaluation rounds per single training round. + +... +Training iteration 1 -> evaluation round 0 +Training iteration 1 -> evaluation round 1 +Training iteration 1 -> evaluation round 2 +... +... 
++--------------------------------+------------+-----------------+--------+ +| Trial name | status | loc | iter | +|--------------------------------+------------+-----------------+--------+ +| PPO_SimpleCorridor_06582_00000 | TERMINATED | 127.0.0.1:69905 | 4 | ++--------------------------------+------------+-----------------+--------+ ++------------------+-------+----------+--------------------+ +| total time (s) | ts | reward | episode_len_mean | +|------------------+-------+----------+--------------------| +| 26.1973 | 16000 | 0.872034 | 13.7966 | ++------------------+-------+----------+--------------------+ +""" +from typing import Tuple + +from ray.air.constants import TRAINING_ITERATION +from ray.rllib.algorithms.algorithm import Algorithm +from ray.rllib.algorithms.algorithm_config import AlgorithmConfig +from ray.rllib.env.env_runner_group import EnvRunnerGroup +from ray.rllib.examples.envs.classes.simple_corridor import SimpleCorridor +from ray.rllib.utils.metrics import ( + ENV_RUNNER_RESULTS, + EVALUATION_RESULTS, + EPISODE_RETURN_MEAN, + NUM_ENV_STEPS_SAMPLED_LIFETIME, +) +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.rllib.utils.typing import ResultDict +from ray.tune.registry import get_trainable_cls + + +parser = add_rllib_example_script_args( + default_iters=50, default_reward=0.7, default_timesteps=50000 +) +parser.add_argument("--no-custom-eval", action="store_true") +parser.add_argument("--corridor-length-training", type=int, default=10) +parser.add_argument("--corridor-length-eval-worker-1", type=int, default=20) +parser.add_argument("--corridor-length-eval-worker-2", type=int, default=30) + + +def custom_eval_function( + algorithm: Algorithm, + eval_workers: EnvRunnerGroup, +) -> Tuple[ResultDict, int, int]: + """Example of a custom evaluation function. + + Args: + algorithm: Algorithm class to evaluate. + eval_workers: Evaluation EnvRunnerGroup. 
+ + Returns: + metrics: Evaluation metrics dict. + """ + # Set different env settings for each (eval) EnvRunner. Here we use the EnvRunner's + # `worker_index` property to figure out the actual length. + # Loop through all workers and all sub-envs (gym.Env) on each worker and call the + # `set_corridor_length` method on these. + eval_workers.foreach_env_runner( + func=lambda worker: ( + env.unwrapped.set_corridor_length( + args.corridor_length_eval_worker_1 + if worker.worker_index == 1 + else args.corridor_length_eval_worker_2 + ) + for env in worker.env.unwrapped.envs + ) + ) + + # Collect metrics results collected by eval workers in this list for later + # processing. + env_runner_metrics = [] + sampled_episodes = [] + # For demonstration purposes, run through some number of evaluation + # rounds within this one call. Note that this function is called once per + # training iteration (`Algorithm.train()` call) OR once per `Algorithm.evaluate()` + # (which can be called manually by the user). + for i in range(3): + print(f"Training iteration {algorithm.iteration} -> evaluation round {i}") + # Sample episodes from the EnvRunners AND have them return only the thus + # collected metrics. + episodes_and_metrics_all_env_runners = eval_workers.foreach_env_runner( + # Return only the metrics, NOT the sampled episodes (we don't need them + # anymore). + func=lambda worker: (worker.sample(), worker.get_metrics()), + local_env_runner=False, + ) + sampled_episodes.extend( + eps + for eps_and_mtrcs in episodes_and_metrics_all_env_runners + for eps in eps_and_mtrcs[0] + ) + env_runner_metrics.extend( + eps_and_mtrcs[1] for eps_and_mtrcs in episodes_and_metrics_all_env_runners + ) + + # You can compute metrics from the episodes manually, or use the Algorithm's + # convenient MetricsLogger to store all evaluation metrics inside the main + # algo. 
+ algorithm.metrics.merge_and_log_n_dicts( + env_runner_metrics, key=(EVALUATION_RESULTS, ENV_RUNNER_RESULTS) + ) + eval_results = algorithm.metrics.reduce( + key=(EVALUATION_RESULTS, ENV_RUNNER_RESULTS) + ) + # Alternatively, you could manually reduce over the n returned `env_runner_metrics` + # dicts, but this would be much harder as you might not know, which metrics + # to sum up, which ones to average over, etc.. + + # Compute env and agent steps from sampled episodes. + env_steps = sum(eps.env_steps() for eps in sampled_episodes) + agent_steps = sum(eps.agent_steps() for eps in sampled_episodes) + + return eval_results, env_steps, agent_steps + + +if __name__ == "__main__": + args = parser.parse_args() + args.local_mode = True + base_config = ( + get_trainable_cls(args.algo) + .get_default_config() + # For training, we use a corridor length of n. For evaluation, we use different + # values, depending on the eval worker index (1 or 2). + .environment( + SimpleCorridor, + env_config={"corridor_length": args.corridor_length_training}, + ) + .evaluation( + # Do we use the custom eval function defined above? + custom_evaluation_function=( + None if args.no_custom_eval else custom_eval_function + ), + # Number of eval EnvRunners to use. + evaluation_num_env_runners=2, + # Enable evaluation, once per training iteration. + evaluation_interval=1, + # Run 10 episodes each time evaluation runs (OR "auto" if parallel to + # training). + evaluation_duration="auto" if args.evaluation_parallel_to_training else 10, + # Evaluate parallelly to training? + evaluation_parallel_to_training=args.evaluation_parallel_to_training, + # Override the env settings for the eval workers. + # Note, though, that this setting here is only used in case --no-custom-eval + # is set, b/c in case the custom eval function IS used, we override the + # length of the eval environments in that custom function, so this setting + # here is simply ignored. 
+ evaluation_config=AlgorithmConfig.overrides( + env_config={"corridor_length": args.corridor_length_training * 2}, + # TODO (sven): Add support for window=float(inf) and reduce=mean for + # evaluation episode_return_mean reductions (identical to old stack + # behavior, which does NOT use a window (100 by default) to reduce + # eval episode returns. + metrics_num_episodes_for_smoothing=5, + ), + ) + ) + + stop = { + TRAINING_ITERATION: args.stop_iters, + f"{EVALUATION_RESULTS}/{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": ( + args.stop_reward + ), + NUM_ENV_STEPS_SAMPLED_LIFETIME: args.stop_timesteps, + } + + run_rllib_example_script_experiment( + base_config, + args, + stop=stop, + success_metric={ + f"{EVALUATION_RESULTS}/{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": ( + args.stop_reward + ), + }, + ) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a3ecd849d295ce6d521ffed1c6a34115db5482fa Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/__pycache__/float16_training_and_inference.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/__pycache__/float16_training_and_inference.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9ac5ef08b4170bba94ddbd71f82769db5f4d6eff Binary files /dev/null and 
b/.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/__pycache__/float16_training_and_inference.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/__pycache__/fractional_gpus_per_learner.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/__pycache__/fractional_gpus_per_learner.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5edfe9a837e94e261c7219abcb5fa7b6fdcde920 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/__pycache__/fractional_gpus_per_learner.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/__pycache__/gpus_on_env_runners.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/__pycache__/gpus_on_env_runners.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a16ea8af2cc4163611f43ad2901ee84f37fec229 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/__pycache__/gpus_on_env_runners.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/__pycache__/mixed_precision_training_float16_inference.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/__pycache__/mixed_precision_training_float16_inference.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ffb38152ec6aaae1a0a6955f68f10c95239f2513 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/__pycache__/mixed_precision_training_float16_inference.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/float16_training_and_inference.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/float16_training_and_inference.py new file mode 100644 index 0000000000000000000000000000000000000000..9a3b7a817aad215f2851619ed0157d2b794c16e2 --- /dev/null +++ 
b/.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/float16_training_and_inference.py @@ -0,0 +1,250 @@ +"""Example of using float16 precision for training and inference. + +This example: + - shows how to write a custom callback for RLlib to convert all RLModules + (on the EnvRunners and Learners) to float16 precision. + - shows how to write a custom env-to-module ConnectorV2 piece to convert all + observations and rewards in the collected trajectories to float16 (numpy) arrays. + - shows how to write a custom grad scaler for torch that is necessary to stabilize + learning with float16 weight matrices and gradients. This custom scaler behaves + exactly like the torch built-in `torch.amp.GradScaler` but also works for float16 + gradients (which the torch built-in one doesn't). + - shows how to write a custom TorchLearner to change the epsilon setting (to the + much larger 1e-4 to stabilize learning) on the default optimizer (Adam) registered + for each RLModule. + - demonstrates how to plug in all the above custom components into an + `AlgorithmConfig` instance and start training (and inference) with float16 + precision. + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + +You can visualize experiment results in ~/ray_results using TensorBoard. 
+ + +Results to expect +----------------- +You should see something similar to the following on your terminal, when running this +script with the above recommended options: + ++-----------------------------+------------+-----------------+--------+ +| Trial name | status | loc | iter | +| | | | | +|-----------------------------+------------+-----------------+--------+ +| PPO_CartPole-v1_437ee_00000 | TERMINATED | 127.0.0.1:81045 | 6 | ++-----------------------------+------------+-----------------+--------+ ++------------------+------------------------+------------------------+ +| total time (s) | episode_return_mean | num_episodes_lifetime | +| | | | +|------------------+------------------------+------------------------+ +| 71.3123 | 153.79 | 358 | ++------------------+------------------------+------------------------+ +""" +import gymnasium as gym +import numpy as np +import torch + +from ray.rllib.algorithms.algorithm import Algorithm +from ray.rllib.algorithms.ppo.torch.ppo_torch_learner import PPOTorchLearner +from ray.rllib.connectors.connector_v2 import ConnectorV2 +from ray.rllib.core.learner.torch.torch_learner import TorchLearner +from ray.rllib.utils.annotations import override +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.tune.registry import get_trainable_cls + +parser = add_rllib_example_script_args( + default_iters=50, default_reward=150.0, default_timesteps=100000 +) +parser.set_defaults( + enable_new_api_stack=True, +) + + +def on_algorithm_init( + algorithm: Algorithm, + **kwargs, +) -> None: + """Callback making sure that all RLModules in the algo are `half()`'ed.""" + + # Switch all Learner RLModules to float16. + algorithm.learner_group.foreach_learner( + lambda learner: learner.module.foreach_module(lambda mid, mod: mod.half()) + ) + # Switch all EnvRunner RLModules (assuming single RLModules) to float16. 
+ algorithm.env_runner_group.foreach_env_runner( + lambda env_runner: env_runner.module.half() + ) + if algorithm.eval_env_runner_group: + algorithm.eval_env_runner_group.foreach_env_runner( + lambda env_runner: env_runner.module.half() + ) + + +class WriteObsAndRewardsAsFloat16(ConnectorV2): + """ConnectorV2 piece preprocessing observations and rewards to be float16. + + Note that users can also write a gymnasium.Wrapper for observations and rewards + to achieve the same thing. + """ + + def recompute_output_observation_space( + self, + input_observation_space, + input_action_space, + ): + return gym.spaces.Box( + input_observation_space.low.astype(np.float16), + input_observation_space.high.astype(np.float16), + input_observation_space.shape, + np.float16, + ) + + def __call__(self, *, rl_module, batch, episodes, **kwargs): + for sa_episode in self.single_agent_episode_iterator(episodes): + obs = sa_episode.get_observations(-1) + float16_obs = obs.astype(np.float16) + sa_episode.set_observations(new_data=float16_obs, at_indices=-1) + if len(sa_episode) > 0: + rew = sa_episode.get_rewards(-1).astype(np.float16) + sa_episode.set_rewards(new_data=rew, at_indices=-1) + return batch + + +class Float16GradScaler: + """Custom grad scaler for `TorchLearner`. + + This class is utilizing the experimental support for the `TorchLearner`'s support + for loss/gradient scaling (analogous to how a `torch.amp.GradScaler` would work). 
+ + TorchLearner performs the following steps using this class (`scaler`): + - loss_per_module = TorchLearner.compute_losses() + - for L in loss_per_module: L = scaler.scale(L) + - grads = TorchLearner.compute_gradients() # L.backward() on scaled loss + - TorchLearner.apply_gradients(grads): + for optim in optimizers: + scaler.step(optim) # <- grads should get unscaled + scaler.update() # <- update scaling factor + """ + + def __init__( + self, + init_scale=1000.0, + growth_factor=2.0, + backoff_factor=0.5, + growth_interval=2000, + ): + self._scale = init_scale + self.growth_factor = growth_factor + self.backoff_factor = backoff_factor + self.growth_interval = growth_interval + self._found_inf_or_nan = False + self.steps_since_growth = 0 + + def scale(self, loss): + # Scale the loss by `self._scale`. + return loss * self._scale + + def get_scale(self): + return self._scale + + def step(self, optimizer): + # Unscale the gradients for all model parameters and apply. + for group in optimizer.param_groups: + for param in group["params"]: + if param.grad is not None: + param.grad.data.div_(self._scale) + if torch.isinf(param.grad).any() or torch.isnan(param.grad).any(): + self._found_inf_or_nan = True + break + if self._found_inf_or_nan: + break + # Only step if no inf/NaN grad found. + if not self._found_inf_or_nan: + optimizer.step() + + def update(self): + # If gradients are found to be inf/NaN, reduce the scale. + if self._found_inf_or_nan: + self._scale *= self.backoff_factor + self.steps_since_growth = 0 + # Increase the scale after a set number of steps without inf/NaN. + else: + self.steps_since_growth += 1 + if self.steps_since_growth >= self.growth_interval: + self._scale *= self.growth_factor + self.steps_since_growth = 0 + # Reset inf/NaN flag. 
+ self._found_inf_or_nan = False + + +class LargeEpsAdamTorchLearner(PPOTorchLearner): + """A TorchLearner overriding the default optimizer (Adam) to use non-default eps.""" + + @override(TorchLearner) + def configure_optimizers_for_module(self, module_id, config): + """Registers an Adam optimizer with a larg epsilon under the given module_id.""" + params = list(self._module[module_id].parameters()) + + # Register one Adam optimizer (under the default optimizer name: + # DEFAULT_OPTIMIZER) for the `module_id`. + self.register_optimizer( + module_id=module_id, + # Create an Adam optimizer with a different eps for better float16 + # stability. + optimizer=torch.optim.Adam(params, eps=1e-4), + params=params, + # Let RLlib handle the learning rate/learning rate schedule. + # You can leave `lr_or_lr_schedule` at None, but then you should + # pass a fixed learning rate into the Adam constructor above. + lr_or_lr_schedule=config.lr, + ) + + +if __name__ == "__main__": + args = parser.parse_args() + + base_config = ( + get_trainable_cls(args.algo) + .get_default_config() + .environment("CartPole-v1") + # Plug in our custom callback (on_algorithm_init) to make all RLModules + # float16 models. + .callbacks(on_algorithm_init=on_algorithm_init) + # Plug in our custom loss scaler class to stabilize gradient computations + # (by scaling the loss, then unscaling the gradients before applying them). + # This is using the built-in, experimental feature of TorchLearner. + .experimental(_torch_grad_scaler_class=Float16GradScaler) + # Plug in our custom env-to-module ConnectorV2 piece to convert all observations + # and reward in the episodes (permanently) to float16. + .env_runners(env_to_module_connector=lambda env: WriteObsAndRewardsAsFloat16()) + .training( + # Plug in our custom TorchLearner (using a much larger, stabilizing epsilon + # on the Adam optimizer). 
+ learner_class=LargeEpsAdamTorchLearner, + # Switch off grad clipping entirely b/c we use our custom grad scaler with + # built-in inf/nan detection (see `step` method of `Float16GradScaler`). + grad_clip=None, + # Typical CartPole-v1 hyperparams known to work well: + gamma=0.99, + lr=0.0003, + num_epochs=6, + vf_loss_coeff=0.01, + use_kl_loss=True, + ) + ) + + run_rllib_example_script_experiment(base_config, args) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/fractional_gpus_per_learner.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/fractional_gpus_per_learner.py new file mode 100644 index 0000000000000000000000000000000000000000..374a7ec139e966a44911a82909ad97bf87f617bc --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/fractional_gpus_per_learner.py @@ -0,0 +1,119 @@ +"""Example of using fractional GPUs (< 1.0) per Learner worker. + +The number of GPUs required, just for learning (excluding those maybe needed on your +EnvRunners, if applicable) can be computed by: +`num_gpus = config.num_learners * config.num_gpus_per_learner` + +This example: + - shows how to set up an Algorithm that uses one or more Learner workers ... + - ... and how to assign a fractional (< 1.0) number of GPUs to each of these Learners. + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack --num-learners= +[number of Learners, e.g. 1] --num-gpus-per-learner [some fraction <1.0]` + +The following command line combinations been tested on a 4 NVIDIA T4 GPUs (16 vCPU) +machine. +Note that for each run, 4 tune trials will be setup; see tune.grid_search over 4 +learning rates in the `base_config` below: +1) --num-learners=1 --num-gpus-per-learner=0.5 (2.0 GPUs used). +2) --num-learners=1 --num-gpus-per-learner=0.3 (1.2 GPUs used). +3) --num-learners=1 --num-gpus-per-learner=0.25 (1.0 GPU used). +4) --num-learners=2 --num-gpus-per-learner=1 (8 GPUs used). 
+5) non-sensical setting: --num-learners=2 --num-gpus-per-learner=0.5 (expect an +NCCL-related error due to the fact that torch will try to perform DDP sharding, +but notices that the shards sit on the same GPU). + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + +Note that the shown GPU settings in this script also work in case you are not +running via tune, but instead are using the `--no-tune` command line option. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + +You can visualize experiment results in ~/ray_results using TensorBoard. + + +Results to expect +----------------- +In the console output, you can see that only fractional GPUs are being used by RLlib: + +== Status == +... +Logical resource usage: 12.0/16 CPUs, 1.0/4 GPUs (...) +... 
+Number of trials: 4/4 (4 RUNNING) + +The final output should look something like this: ++-----------------------------+------------+-----------------+--------+--------+ +| Trial name | status | loc | lr | iter | +| | | | | | +|-----------------------------+------------+-----------------+--------+--------+ +| PPO_CartPole-v1_7104b_00000 | TERMINATED | 10.0.0.39:31197 | 0.005 | 10 | +| PPO_CartPole-v1_7104b_00001 | TERMINATED | 10.0.0.39:31202 | 0.003 | 11 | +| PPO_CartPole-v1_7104b_00002 | TERMINATED | 10.0.0.39:31203 | 0.001 | 10 | +| PPO_CartPole-v1_7104b_00003 | TERMINATED | 10.0.0.39:31204 | 0.0001 | 11 | ++-----------------------------+------------+-----------------+--------+--------+ + ++----------------+----------------------+----------------------+----------------------+ +| total time (s) | num_env_steps_sample | num_env_steps_traine | num_episodes_lifetim | +| | d_lifetime | d_lifetime | e | +|----------------+----------------------+----------------------+----------------------| +| 101.002 | 40000 | 40000 | 346 | +| 110.03 | 44000 | 44000 | 395 | +| 101.171 | 40000 | 40000 | 328 | +| 110.091 | 44000 | 44000 | 478 | ++----------------+----------------------+----------------------+----------------------+ +""" +from ray import tune +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.tune.registry import get_trainable_cls + +parser = add_rllib_example_script_args( + default_iters=50, default_reward=180, default_timesteps=100000 +) +parser.set_defaults( + enable_new_api_stack=True, + num_env_runners=2, +) + + +if __name__ == "__main__": + args = parser.parse_args() + + base_config = ( + get_trainable_cls(args.algo) + .get_default_config() + # This script only works on the new API stack. + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) + .environment("CartPole-v1") + # Define EnvRunner scaling. 
+ .env_runners(num_env_runners=args.num_env_runners) + # Define Learner scaling. + .learners( + # How many Learner workers do we need? If you have more than 1 GPU, + # set this parameter to the number of GPUs available. + num_learners=args.num_learners, + # How many GPUs does each Learner need? If you have more than 1 GPU or only + # one Learner, you should set this to 1, otherwise, set this to some + # fraction. + num_gpus_per_learner=args.num_gpus_per_learner, + ) + # 4 tune trials altogether. + .training(lr=tune.grid_search([0.005, 0.003, 0.001, 0.0001])) + ) + + run_rllib_example_script_experiment(base_config, args, keep_config=True) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/gpus_on_env_runners.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/gpus_on_env_runners.py new file mode 100644 index 0000000000000000000000000000000000000000..92a5bd1f53b335943777e320aaafee363068144f --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/gpus_on_env_runners.py @@ -0,0 +1,85 @@ +"""Example of using GPUs on the EnvRunners (b/c Env and/or RLModule require these). + +The number of GPUs required, just for your EnvRunners (excluding those needed for +training your RLModule) can be computed by: +`num_gpus = config.num_env_runners * config.num_gpus_per_env_runner` + +This example: + - shows how to write an Env that uses the GPU. + - shows how to configure your algorithm such that it allocates any number of GPUs + (including fractional < 1.0) to each (remote) EnvRunner worker. + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack --num-env_runners= +[number of EnvRunners, e.g. 2] --num-gpus-per-env-runner [int or some fraction <1.0]` + +The following command line combinations been tested on a 4 NVIDIA T4 GPUs (16 vCPU) +machine. 
+TODO (sven): Fix these +Note that for each run, 4 tune trials will be setup; see tune.grid_search over 4 +learning rates in the `base_config` below: +1) --num-learners=1 --num-gpus-per-learner=0.5 (2.0 GPUs used). +2) --num-learners=1 --num-gpus-per-learner=0.3 (1.2 GPUs used). +3) --num-learners=1 --num-gpus-per-learner=0.25 (1.0 GPU used). +4) non-sensical setting: --num-learners=2 --num-gpus-per-learner=0.5 (expect an +NCCL-related error due to the fact that torch will try to perform DDP sharding, +but notices that the shards sit on the same GPU). + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + +Note that the shown GPU settings in this script also work in case you are not +running via tune, but instead are using the `--no-tune` command line option. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + +You can visualize experiment results in ~/ray_results using TensorBoard. 
+ + +Results to expect +----------------- +In the console output, you can see that only fractional GPUs are being used by RLlib: + +""" +from ray.rllib.examples.envs.classes.gpu_requiring_env import GPURequiringEnv +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.tune.registry import get_trainable_cls + +parser = add_rllib_example_script_args( + default_iters=50, default_reward=0.9, default_timesteps=100000 +) +parser.set_defaults( + enable_new_api_stack=True, + num_env_runners=2, +) +parser.add_argument("--num-gpus-per-env-runner", type=float, default=0.5) + + +if __name__ == "__main__": + args = parser.parse_args() + + base_config = ( + get_trainable_cls(args.algo) + .get_default_config() + .environment(GPURequiringEnv) + # Define Learner scaling. + .env_runners( + # How many EnvRunner workers do we need? + num_env_runners=args.num_env_runners, + # How many GPUs does each EnvRunner require? Note that the memory on (a + # possibly fractional GPU) must be enough to accommodate the RLModule AND + # if applicable also the Env's GPU needs). + num_gpus_per_env_runner=args.num_gpus_per_env_runner, + ) + ) + + run_rllib_example_script_experiment(base_config, args) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/mixed_precision_training_float16_inference.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/mixed_precision_training_float16_inference.py new file mode 100644 index 0000000000000000000000000000000000000000..c32887146bfd5ca1a3fecc33d3de38cafac8d479 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/mixed_precision_training_float16_inference.py @@ -0,0 +1,170 @@ +"""Example of using automatic mixed precision training on a torch RLModule. + +This example: + - shows how to write a custom callback for RLlib to convert those RLModules + only(!) on the EnvRunners to float16 precision. 
+ - shows how to write a custom env-to-module ConnectorV2 piece to add float16 + observations to the action computing forward batch on the EnvRunners, but NOT + permanently write these changes into the episodes, such that on the + Learner side, the original float32 observations will be used (for the mixed + precision `forward_train` and `loss` computations). + - shows how to plugin torch's built-in `GradScaler` class to be used by the + TorchLearner to scale losses and unscale gradients in order to gain more stability + when training with mixed precision. + - shows how to write a custom TorchLearner to run the update step (overrides + `_update()`) within a `torch.amp.autocast()` context. This makes sure that . + - demonstrates how to plug in all the above custom components into an + `AlgorithmConfig` instance and start training with mixed-precision while + performing the inference on the EnvRunners with float16 precision. + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + +Note that the shown GPU settings in this script also work in case you are not +running via tune, but instead are using the `--no-tune` command line option. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + +You can visualize experiment results in ~/ray_results using TensorBoard. 
+ + +Results to expect +----------------- +In the console output, you should see something like this: + ++-----------------------------+------------+-----------------+--------+ +| Trial name | status | loc | iter | +| | | | | +|-----------------------------+------------+-----------------+--------+ +| PPO_CartPole-v1_485af_00000 | TERMINATED | 127.0.0.1:81045 | 22 | ++-----------------------------+------------+-----------------+--------+ ++------------------+------------------------+------------------------+ +| total time (s) | episode_return_mean | num_episodes_lifetime | +| | | | +|------------------+------------------------+------------------------+ +| 281.3231 | 455.81 | 1426 | ++------------------+------------------------+------------------------+ +""" +import gymnasium as gym +import numpy as np +import torch + +from ray.rllib.algorithms.algorithm import Algorithm +from ray.rllib.algorithms.ppo import PPOConfig +from ray.rllib.algorithms.ppo.torch.ppo_torch_learner import PPOTorchLearner +from ray.rllib.connectors.connector_v2 import ConnectorV2 +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) + + +parser = add_rllib_example_script_args( + default_iters=200, default_reward=450.0, default_timesteps=200000 +) +parser.set_defaults( + algo="PPO", + enable_new_api_stack=True, +) + + +def on_algorithm_init( + algorithm: Algorithm, + **kwargs, +) -> None: + """Callback making sure that all RLModules in the algo are `half()`'ed.""" + + # Switch all EnvRunner RLModules (assuming single RLModules) to float16. + algorithm.env_runner_group.foreach_env_runner( + lambda env_runner: env_runner.module.half() + ) + if algorithm.eval_env_runner_group: + algorithm.eval_env_runner_group.foreach_env_runner( + lambda env_runner: env_runner.module.half() + ) + + +class Float16Connector(ConnectorV2): + """ConnectorV2 piece preprocessing observations and rewards to be float16. 
+ + Note that users can also write a gymnasium.Wrapper for observations and rewards + to achieve the same thing. + """ + + def recompute_output_observation_space( + self, + input_observation_space, + input_action_space, + ): + return gym.spaces.Box( + input_observation_space.low.astype(np.float16), + input_observation_space.high.astype(np.float16), + input_observation_space.shape, + np.float16, + ) + + def __call__(self, *, rl_module, batch, episodes, **kwargs): + for sa_episode in self.single_agent_episode_iterator(episodes): + obs = sa_episode.get_observations(-1) + float16_obs = obs.astype(np.float16) + self.add_batch_item( + batch, + column="obs", + item_to_add=float16_obs, + single_agent_episode=sa_episode, + ) + return batch + + +class PPOTorchMixedPrecisionLearner(PPOTorchLearner): + def _update(self, *args, **kwargs): + with torch.cuda.amp.autocast(): + results = super()._update(*args, **kwargs) + return results + + +if __name__ == "__main__": + args = parser.parse_args() + + assert ( + args.enable_new_api_stack + ), "Must set --enable-new-api-stack when running this script!" + assert args.algo == "PPO", "Must set --algo=PPO when running this script!" + + base_config = ( + (PPOConfig().environment("CartPole-v1")) + .env_runners(env_to_module_connector=lambda env: Float16Connector()) + # Plug in our custom callback (on_algorithm_init) to make EnvRunner RLModules + # float16 models. + .callbacks(on_algorithm_init=on_algorithm_init) + # Plug in the torch built-int loss scaler class to stabilize gradient + # computations (by scaling the loss, then unscaling the gradients before + # applying them). This is using the built-in, experimental feature of + # TorchLearner. + .experimental(_torch_grad_scaler_class=torch.cuda.amp.GradScaler) + .training( + # Plug in the custom Learner class to activate mixed-precision training for + # our torch RLModule (uses `torch.amp.autocast()`). 
+ learner_class=PPOTorchMixedPrecisionLearner, + # Switch off grad clipping entirely b/c we use our custom grad scaler with + # built-in inf/nan detection (see `step` method of `Float16GradScaler`). + grad_clip=None, + # Typical CartPole-v1 hyperparams known to work well: + gamma=0.99, + lr=0.0003, + num_epochs=6, + vf_loss_coeff=0.01, + use_kl_loss=True, + ) + ) + + run_rllib_example_script_experiment(base_config, args) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/__pycache__/ppo_load_rl_modules.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/__pycache__/ppo_load_rl_modules.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ce29ca31728d25af476b57b877766726ccafec86 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/__pycache__/ppo_load_rl_modules.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/classes/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/classes/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/classes/__pycache__/custom_ppo_loss_fn_learner.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/classes/__pycache__/custom_ppo_loss_fn_learner.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c8464202293a7d3edc35d22b991d8bdbec1e59c5 Binary files /dev/null and 
b/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/classes/__pycache__/custom_ppo_loss_fn_learner.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/classes/__pycache__/separate_vf_lr_and_optimizer_learner.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/classes/__pycache__/separate_vf_lr_and_optimizer_learner.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0a161aa3acce08c59e054e0c7a510a7fbff73dcf Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/classes/__pycache__/separate_vf_lr_and_optimizer_learner.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/classes/__pycache__/vpg_torch_learner.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/classes/__pycache__/vpg_torch_learner.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fbc33a256bcdbf9eacd28c8f101bda37f68e1906 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/classes/__pycache__/vpg_torch_learner.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/classes/custom_ppo_loss_fn_learner.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/classes/custom_ppo_loss_fn_learner.py new file mode 100644 index 0000000000000000000000000000000000000000..e63cd3c563e0e8d8434d7a8f1dce3bdc9313a060 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/classes/custom_ppo_loss_fn_learner.py @@ -0,0 +1,54 @@ +from typing import Any, Dict + +from ray.rllib.algorithms.ppo import PPOConfig +from ray.rllib.algorithms.ppo.torch.ppo_torch_learner import PPOTorchLearner +from ray.rllib.utils.annotations import override +from ray.rllib.utils.framework import try_import_torch +from ray.rllib.utils.typing import ModuleID, 
TensorType + +torch, _ = try_import_torch() + + +class PPOTorchLearnerWithWeightRegularizerLoss(PPOTorchLearner): + """A custom PPO torch learner adding a weight regularizer term to the loss. + + We compute a naive regularizer term averaging over all parameters of the RLModule + and add this mean value (multiplied by the regularizer coefficient) to the base PPO + loss. + The experiment shows that even with a large learning rate, our custom Learner is + still able to learn properly as it's forced to keep the weights small. + """ + + @override(PPOTorchLearner) + def compute_loss_for_module( + self, + *, + module_id: ModuleID, + config: PPOConfig, + batch: Dict[str, Any], + fwd_out: Dict[str, TensorType], + ) -> TensorType: + + base_total_loss = super().compute_loss_for_module( + module_id=module_id, + config=config, + batch=batch, + fwd_out=fwd_out, + ) + + # Compute the mean of all the RLModule's weights. + parameters = self.get_parameters(self.module[module_id]) + mean_weight = torch.mean(torch.stack([w.mean() for w in parameters])) + + self.metrics.log_value( + key=(module_id, "mean_weight"), + value=mean_weight, + window=1, + ) + + total_loss = ( + base_total_loss + + config.learner_config_dict["regularizer_coeff"] * mean_weight + ) + + return total_loss diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/classes/intrinsic_curiosity_learners.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/classes/intrinsic_curiosity_learners.py new file mode 100644 index 0000000000000000000000000000000000000000..dd37dab0cb114d637a94d35897d11ef4082b629e --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/classes/intrinsic_curiosity_learners.py @@ -0,0 +1,162 @@ +from typing import Any, List, Optional + +import gymnasium as gym +import torch + +from ray.rllib.algorithms.dqn.torch.dqn_torch_learner import DQNTorchLearner +from ray.rllib.algorithms.ppo.torch.ppo_torch_learner import PPOTorchLearner +from 
ray.rllib.connectors.common.add_observations_from_episodes_to_batch import ( + AddObservationsFromEpisodesToBatch, +) +from ray.rllib.connectors.common.numpy_to_tensor import NumpyToTensor +from ray.rllib.connectors.learner.add_next_observations_from_episodes_to_train_batch import ( # noqa + AddNextObservationsFromEpisodesToTrainBatch, +) +from ray.rllib.connectors.connector_v2 import ConnectorV2 +from ray.rllib.core import Columns, DEFAULT_MODULE_ID +from ray.rllib.core.learner.torch.torch_learner import TorchLearner +from ray.rllib.core.rl_module.rl_module import RLModule +from ray.rllib.utils.typing import EpisodeType + +ICM_MODULE_ID = "_intrinsic_curiosity_model" + + +class DQNTorchLearnerWithCuriosity(DQNTorchLearner): + def build(self) -> None: + super().build() + add_intrinsic_curiosity_connectors(self) + + +class PPOTorchLearnerWithCuriosity(PPOTorchLearner): + def build(self) -> None: + super().build() + add_intrinsic_curiosity_connectors(self) + + +def add_intrinsic_curiosity_connectors(torch_learner: TorchLearner) -> None: + """Adds two connector pieces to the Learner pipeline, needed for ICM training. + + - The `AddNextObservationsFromEpisodesToTrainBatch` connector makes sure the train + batch contains the NEXT_OBS for ICM's forward- and inverse dynamics net training. + - The `IntrinsicCuriosityModelConnector` piece computes intrinsic rewards from the + ICM and adds the results to the extrinsic reward of the main module's train batch. + + Args: + torch_learner: The TorchLearner, to whose Learner pipeline the two ICM connector + pieces should be added. + """ + learner_config_dict = torch_learner.config.learner_config_dict + + # Assert, we are only training one policy (RLModule) and we have the ICM + # in our MultiRLModule. 
    # Exactly two sub-modules are expected: the single trainable policy
    # (DEFAULT_MODULE_ID) and the ICM (ICM_MODULE_ID) - nothing else.
    assert (
        len(torch_learner.module) == 2
        and DEFAULT_MODULE_ID in torch_learner.module
        and ICM_MODULE_ID in torch_learner.module
    )

    # Make sure both curiosity loss settings are explicitly set in the
    # `learner_config_dict`.
    if (
        "forward_loss_weight" not in learner_config_dict
        or "intrinsic_reward_coeff" not in learner_config_dict
    ):
        raise KeyError(
            "When using the IntrinsicCuriosityTorchLearner, both `forward_loss_weight` "
            " and `intrinsic_reward_coeff` must be part of your config's "
            "`learner_config_dict`! Add these values through: `config.training("
            "learner_config_dict={'forward_loss_weight': .., 'intrinsic_reward_coeff': "
            "..})`."
        )

    # Only extend the pipeline when RLlib added the default connector pieces;
    # both inserts below anchor on default pieces.
    if torch_learner.config.add_default_connectors_to_learner_pipeline:
        # Prepend a "add-NEXT_OBS-from-episodes-to-train-batch" connector piece
        # (right after the corresponding "add-OBS-..." default piece).
        torch_learner._learner_connector.insert_after(
            AddObservationsFromEpisodesToBatch,
            AddNextObservationsFromEpisodesToTrainBatch(),
        )
        # Append the ICM connector, computing intrinsic rewards and adding these to
        # the main model's extrinsic rewards. Placement after `NumpyToTensor` matches
        # the requirement stated in `IntrinsicCuriosityModelConnector`'s docstring.
        torch_learner._learner_connector.insert_after(
            NumpyToTensor,
            IntrinsicCuriosityModelConnector(
                intrinsic_reward_coeff=(
                    torch_learner.config.learner_config_dict["intrinsic_reward_coeff"]
                )
            ),
        )


class IntrinsicCuriosityModelConnector(ConnectorV2):
    """Learner ConnectorV2 piece to compute intrinsic rewards based on an ICM.

    For more details, see here:
    [1] Curiosity-driven Exploration by Self-supervised Prediction
    Pathak, Agrawal, Efros, and Darrell - UC Berkeley - ICML 2017.
    https://arxiv.org/pdf/1705.05363.pdf

    This connector piece:
    - requires two RLModules to be present in the MultiRLModule:
    DEFAULT_MODULE_ID (the policy model to be trained) and ICM_MODULE_ID (the intrinsic
    curiosity architecture).
    - must be located toward the end of your Learner pipeline (after the
    `NumpyToTensor` piece) in order to perform a forward pass on the ICM model with the
    readily compiled batch and a following forward-loss computation to get the intrinsic
    rewards.
    - these intrinsic rewards will then be added to the (extrinsic) rewards in the main
    model's train batch.
    """

    def __init__(
        self,
        input_observation_space: Optional[gym.Space] = None,
        input_action_space: Optional[gym.Space] = None,
        *,
        intrinsic_reward_coeff: float,
        **kwargs,
    ):
        """Initializes an IntrinsicCuriosityModelConnector instance.

        Args:
            input_observation_space: Optional input observation space, passed through
                to the `ConnectorV2` base class.
            input_action_space: Optional input action space, passed through to the
                `ConnectorV2` base class.
            intrinsic_reward_coeff: The weight with which to multiply the intrinsic
                reward before adding it to the extrinsic rewards of the main model.
        """
        super().__init__(input_observation_space, input_action_space)

        self.intrinsic_reward_coeff = intrinsic_reward_coeff

    def __call__(
        self,
        *,
        rl_module: RLModule,
        batch: Any,
        episodes: List[EpisodeType],
        explore: Optional[bool] = None,
        shared_data: Optional[dict] = None,
        **kwargs,
    ) -> Any:
        # Assert that the batch is ready: the main module's data is already compiled
        # and the ICM has not received its (duplicated) batch yet.
        assert DEFAULT_MODULE_ID in batch and ICM_MODULE_ID not in batch
        assert (
            Columns.OBS in batch[DEFAULT_MODULE_ID]
            and Columns.NEXT_OBS in batch[DEFAULT_MODULE_ID]
        )
        # TODO (sven): We are performing two forward passes per update right now.
        # Once here in the connector (w/o grad) to just get the intrinsic rewards
        # and once in the learner to actually compute the ICM loss and update the ICM.
        # Maybe we can save one of these, but this would currently harm the DDP-setup
        # for multi-GPU training.
        with torch.no_grad():
            # Perform ICM forward pass (no gradients needed; only the rewards are
            # used here - the ICM itself is updated in the learner).
            fwd_out = rl_module[ICM_MODULE_ID].forward_train(batch[DEFAULT_MODULE_ID])

        # Add the intrinsic rewards to the main module's extrinsic rewards.
+ batch[DEFAULT_MODULE_ID][Columns.REWARDS] += ( + self.intrinsic_reward_coeff * fwd_out[Columns.INTRINSIC_REWARDS] + ) + + # Duplicate the batch such that the ICM also has data to learn on. + batch[ICM_MODULE_ID] = batch[DEFAULT_MODULE_ID] + + return batch diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/classes/separate_vf_lr_and_optimizer_learner.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/classes/separate_vf_lr_and_optimizer_learner.py new file mode 100644 index 0000000000000000000000000000000000000000..7095503f5f620bb7d83ec50d4b2494bbf0a7ea3b --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/classes/separate_vf_lr_and_optimizer_learner.py @@ -0,0 +1,83 @@ +from ray.rllib.algorithms.algorithm_config import AlgorithmConfig +from ray.rllib.algorithms.ppo.torch.ppo_torch_learner import PPOTorchLearner +from ray.rllib.core.learner.torch.torch_learner import TorchLearner +from ray.rllib.utils.annotations import override +from ray.rllib.utils.framework import try_import_torch +from ray.rllib.utils.typing import ModuleID + +torch, _ = try_import_torch() + + +class PPOTorchLearnerWithSeparateVfOptimizer(PPOTorchLearner): + """A custom PPO torch learner with 2 optimizers, for policy and value function. + + Overrides the Learner's standard `configure_optimizers_for_module()` method to + register the additional vf optimizer. + + The standard PPOLearner only uses a single optimizer (and single learning rate) to + update the model, regardless of whether the value function network + is separate from the policy network or whether they have shared components. + + We may leave the loss function of PPO completely untouched. 
It already returns a
    sum of policy loss and vf loss (and entropy loss), and thus - given that the neural
    networks used to compute each of these terms are separate and don't share any
    components - gradients are computed separately per neural network (policy vs vf)
    and applied separately through the two optimizers.
    """

    @override(TorchLearner)
    def configure_optimizers_for_module(
        self,
        module_id: ModuleID,
        config: "AlgorithmConfig" = None,
    ) -> None:
        """Registers 2 optimizers for the given ModuleID with this Learner.

        Args:
            module_id: The ID of the (PPO) RLModule to register the two optimizers
                (policy and value function) for.
            config: The AlgorithmConfig providing the learning rate(s). `config.lr`
                is used for the policy optimizer. Note that despite the `None`
                default, a valid config is required (it is dereferenced below).
        """
        # Make sure the RLModule has the correct properties.
        module = self.module[module_id]
        # TODO (sven): We should move this into a new `ValueFunction` API, which
        # should have a `get_value_function_params` method. This way, any custom
        # RLModule that implements this API can be used here, not just the standard
        # PPO one.
        assert (
            hasattr(module, "pi")
            and hasattr(module, "vf")
            and hasattr(module, "encoder")
            and hasattr(module.encoder, "actor_encoder")
            and hasattr(module.encoder, "critic_encoder")
        )
        # This example requires a fully separate value function network (no layers
        # shared with the policy).
        assert config.model_config["vf_share_layers"] is False

        # Get all policy-related parameters from the RLModule.
        pi_params = (
            # Actor encoder and policy head.
            self.get_parameters(self.module[module_id].encoder.actor_encoder)
            + self.get_parameters(self.module[module_id].pi)
        )
        # Register the policy optimizer.
        self.register_optimizer(
            module_id=module_id,
            optimizer_name="optim_for_pi",
            optimizer=torch.optim.Adam(params=pi_params),
            params=pi_params,
            # For the policy learning rate, we use the "main" lr in the AlgorithmConfig.
            lr_or_lr_schedule=config.lr,
        )

        # Get all value function-related parameters from the RLModule.
        vf_params = (
            # Critic encoder and value head.
            self.get_parameters(self.module[module_id].encoder.critic_encoder)
            + self.get_parameters(self.module[module_id].vf)
        )
        # Register the value function optimizer.
+ self.register_optimizer( + module_id=module_id, + optimizer_name="optim_for_vf", + optimizer=torch.optim.Adam(params=vf_params), + params=vf_params, + # For the value function learning rate, we use a user-provided custom + # setting in the `learner_config_dict` in the AlgorithmConfig. If this + # is not provided, use the same lr as for the policy optimizer. + lr_or_lr_schedule=config.learner_config_dict.get("lr_vf", config.lr), + ) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/classes/vpg_torch_learner.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/classes/vpg_torch_learner.py new file mode 100644 index 0000000000000000000000000000000000000000..f5aca70e135429407be06d975adac878a4abe8e7 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/classes/vpg_torch_learner.py @@ -0,0 +1,73 @@ +import torch +from typing import Any, Dict, TYPE_CHECKING + +import numpy as np + +from ray.rllib.connectors.learner import ComputeReturnsToGo +from ray.rllib.core.columns import Columns +from ray.rllib.core.learner.learner import Learner +from ray.rllib.core.learner.torch.torch_learner import TorchLearner +from ray.rllib.utils.annotations import override +from ray.rllib.utils.numpy import convert_to_numpy +from ray.rllib.utils.typing import ModuleID, TensorType + +if TYPE_CHECKING: + from ray.rllib.algorithms.algorithm_config import AlgorithmConfig + + +class VPGTorchLearner(TorchLearner): + @override(TorchLearner) + def build(self) -> None: + super().build() + + # Prepend the returns-to-go connector piece to have that information + # available in the train batch. 
+ if self.config.add_default_connectors_to_learner_pipeline: + self._learner_connector.prepend(ComputeReturnsToGo(gamma=self.config.gamma)) + + @override(TorchLearner) + def compute_loss_for_module( + self, + *, + module_id: ModuleID, + config: "AlgorithmConfig", + batch: Dict[str, Any], + fwd_out: Dict[str, TensorType], + ) -> TensorType: + rl_module = self.module[module_id] + + # Create the action distribution from the parameters output by the RLModule. + action_dist_inputs = fwd_out[Columns.ACTION_DIST_INPUTS] + action_dist_class = rl_module.get_train_action_dist_cls() + action_dist = action_dist_class.from_logits(action_dist_inputs) + + # Compute log probabilities of the actions taken during sampling. + log_probs = action_dist.logp(batch[Columns.ACTIONS]) + + # Compute the policy gradient loss. + # Since we're not using a baseline, we use returns to go directly. + loss = -torch.mean(log_probs * batch[Columns.RETURNS_TO_GO]) + + # Just for exercise, log the average return to go per discrete action. + for act, ret_to_go in zip(batch[Columns.ACTIONS], batch[Columns.RETURNS_TO_GO]): + self.metrics.log_value( + key=(module_id, f"action_{act}_return_to_go_mean"), + value=ret_to_go, + # Mean over the batch size. + reduce="mean", + window=len(batch[Columns.RETURNS_TO_GO]), + ) + + return loss + + @override(Learner) + def after_gradient_based_update(self, *, timesteps): + # This is to check if in the multi-gpu case, the weights across workers are + # the same. Only for testing purposes. 
+ if self.config.report_mean_weights: + for module_id in self.module.keys(): + parameters = convert_to_numpy( + self.get_parameters(self.module[module_id]) + ) + mean_ws = np.mean([w.mean() for w in parameters]) + self.metrics.log_value((module_id, "mean_weight"), mean_ws, window=1) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/ppo_load_rl_modules.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/ppo_load_rl_modules.py new file mode 100644 index 0000000000000000000000000000000000000000..61cfe730a46523cabe9c62542a49b86356a4cdb2 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/ppo_load_rl_modules.py @@ -0,0 +1,78 @@ +import argparse +import gymnasium as gym +import shutil +import tempfile + +import ray +from ray import air, tune +from ray.air.constants import TRAINING_ITERATION +from ray.rllib.algorithms.ppo import PPOConfig +from ray.rllib.algorithms.ppo.ppo_catalog import PPOCatalog +from ray.rllib.algorithms.ppo.tf.ppo_tf_rl_module import PPOTfRLModule +from ray.rllib.algorithms.ppo.torch.ppo_torch_rl_module import PPOTorchRLModule +from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig +from ray.rllib.core.rl_module.rl_module import RLModuleSpec + + +def _parse_args(): + + parser = argparse.ArgumentParser() + + parser.add_argument( + "--framework", + choices=["tf2", "torch"], # tf will be deprecated with the new Learner stack + default="torch", + ) + + return parser.parse_args() + + +if __name__ == "__main__": + args = _parse_args() + + ray.init() + + # Create a module to load and save it to a checkpoint for testing purposes + # (this is not necessary in a real use case) + # In a real case you would just load the checkpoint from a rllib training run + # where you had enabled checkpointing, the learner api and the rl module api + module_class = PPOTfRLModule if args.framework == "tf2" else PPOTorchRLModule + env = gym.make("CartPole-v1") + module_to_load = 
RLModuleSpec( + module_class=module_class, + model_config=DefaultModelConfig(fcnet_hiddens=[32]), + catalog_class=PPOCatalog, + observation_space=env.observation_space, + action_space=env.action_space, + ).build() + + CHECKPOINT_DIR = tempfile.mkdtemp() + module_to_load.save_to_path(CHECKPOINT_DIR) + + # Create a module spec to load the checkpoint + module_to_load_spec = RLModuleSpec( + module_class=module_class, + model_config=DefaultModelConfig(fcnet_hiddens=[32]), + catalog_class=PPOCatalog, + load_state_path=CHECKPOINT_DIR, + ) + + # train a PPO algorithm with the loaded module + config = ( + PPOConfig() + .api_stack(enable_rl_module_and_learner=True) + .framework(args.framework) + .rl_module(rl_module_spec=module_to_load_spec) + .environment("CartPole-v1") + ) + + tuner = tune.Tuner( + "PPO", + param_space=config.to_dict(), + run_config=air.RunConfig( + stop={TRAINING_ITERATION: 1}, + failure_config=air.FailureConfig(fail_fast="raise"), + ), + ) + tuner.fit() + shutil.rmtree(CHECKPOINT_DIR) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/ppo_with_custom_loss_fn.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/ppo_with_custom_loss_fn.py new file mode 100644 index 0000000000000000000000000000000000000000..04cb17c6f8934201172df580a76cc9455e9847b2 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/ppo_with_custom_loss_fn.py @@ -0,0 +1,138 @@ +"""Example of how to write a custom loss function (based on the existing PPO loss). + +This example shows: + - how to subclass an existing (torch) Learner and override its + `compute_loss_for_module()` method. + - how you can add your own loss terms to the subclassed "base loss", in this + case here a weights regularizer term with the intention to keep the learnable + parameters of the RLModule reasonably small. 
  - how to add custom settings (here: the regularizer coefficient) to the
    `AlgorithmConfig` in order to not have to subclass it and write your own config
    class (you could still do that, but are not required to).
  - how to plug in the custom Learner into your config and then run the
    experiment.

See the :py:class:`~ray.rllib.examples.learners.classes.custom_ppo_loss_fn_learner.PPOTorchLearnerWithWeightRegularizerLoss` # noqa
class for details on how to override the main (PPO) loss function.

We compute a naive regularizer term averaging over all parameters of the RLModule and
add this mean value (multiplied by the regularizer coefficient) to the base PPO loss.
The experiment shows that even with a large learning rate, our custom Learner is still
able to learn properly as it's forced to keep the weights small.


How to run this script
----------------------
`python [script file name].py --enable-new-api-stack --regularizer-coeff=0.02
--lr=0.01`

Use the `--regularizer-coeff` option to set the value of the coefficient with which
the mean NN weight is being multiplied (inside the total loss) and the `--lr` option
to set the learning rate. Experiments using a large learning rate and no regularization
(`--regularizer-coeff=0.0`) should NOT learn a decently working policy.

For debugging, use the following additional command line options
`--no-tune --num-env-runners=0`
which should allow you to set breakpoints anywhere in the RLlib code and
have the execution stop there for inspection and debugging.
+ +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- +In the console output, you can see that - given a large learning rate - only with +weight regularization (`--regularizer-coeff` > 0.0), the algo has a chance to learn +a decent policy: + +With --regularizer-coeff=0.02 and --lr=0.01 +(trying to reach 250.0 return on CartPole in 100k env steps): ++-----------------------------+------------+-----------------+--------+ +| Trial name | status | loc | iter | +| | | | | +|-----------------------------+------------+-----------------+--------+ +| PPO_CartPole-v1_4a3a0_00000 | TERMINATED | 127.0.0.1:16845 | 18 | ++-----------------------------+------------+-----------------+--------+ ++------------------+------------------------+---------------------+ +| total time (s) | num_env_steps_sampled_ | episode_return_mean | +| | _lifetime | | +|------------------+------------------------+---------------------+ +| 16.8842 | 72000 | 256.35 | ++------------------+------------------------+---------------------+ + +With --regularizer-coeff=0.0 and --lr=0.01 +(trying to reach 250.0 return on CartPole in 100k env steps): + +[HAS SIGNIFICANT PROBLEMS REACHING THE DESIRED RETURN] +""" + +from ray.rllib.algorithms.ppo import PPOConfig +from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig +from ray.rllib.examples.learners.classes.custom_ppo_loss_fn_learner import ( + PPOTorchLearnerWithWeightRegularizerLoss, +) +from ray.rllib.utils.framework import try_import_torch +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) + +torch, _ = try_import_torch() + + +parser = add_rllib_example_script_args( + default_reward=250.0, + default_timesteps=200000, +) +parser.set_defaults(enable_new_api_stack=True) +parser.add_argument( + 
"--regularizer-coeff", + type=float, + default=0.02, + help="The coefficient with which to multiply the mean NN-weight by (and then add " + "the result of this operation to the main loss term).", +) +parser.add_argument( + "--lr", + type=float, + default=0.01, + help="The learning rate to use.", +) + + +if __name__ == "__main__": + args = parser.parse_args() + + assert ( + args.enable_new_api_stack + ), "Must set --enable-new-api-stack when running this script!" + assert args.algo == "PPO", "Must set --algo=PPO when running this script!" + + base_config = ( + PPOConfig() + .environment("CartPole-v1") + .training( + # This is the most important setting in this script: We point our PPO + # algorithm to use the custom Learner (instead of the default + # PPOTorchLearner). + learner_class=PPOTorchLearnerWithWeightRegularizerLoss, + # We use this simple method here to inject a new setting that our + # custom Learner class uses in its loss function. This is convenient + # and avoids having to subclass `PPOConfig` only to add a few new settings + # to it. Within our Learner, we can access this new setting through: + # `self.config.learner_config_dict['regularizer_coeff']` + learner_config_dict={"regularizer_coeff": args.regularizer_coeff}, + # Some settings to make this example learn better. + num_epochs=6, + vf_loss_coeff=0.01, + # The learning rate, settable through the command line `--lr` arg. 
+ lr=args.lr, + ) + .rl_module( + model_config=DefaultModelConfig(vf_share_layers=True), + ) + ) + + run_rllib_example_script_experiment(base_config, args) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/ppo_with_torch_lr_schedulers.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/ppo_with_torch_lr_schedulers.py new file mode 100644 index 0000000000000000000000000000000000000000..2051076613c3dbc85e1ba9590bd09ef3d0a30236 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/ppo_with_torch_lr_schedulers.py @@ -0,0 +1,209 @@ +"""Example of how to use PyTorch's learning rate schedulers to design a complex +learning rate schedule for training. + +Two learning rate schedules are applied in sequence to the learning rate of the +optimizer. In this way even more complex learning rate schedules can be assembled. + +This example shows: + - how to configure multiple learning rate schedulers, as a chained pipeline, in + PyTorch using partial initialization with `functools.partial`. + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack --lr-const-factor=0.9 +--lr-const-iters=10 --lr-exp-decay=0.9` + +Use the `--lr-const-factor` to define the facotr by which to multiply the +learning rate in the first `--lr-const-iters` iterations. Use the +`--lr-const-iters` to set the number of iterations in which the learning rate +should be adapted by the `--lr-const-factor`. Use `--lr-exp-decay` to define +the learning rate decay to be applied after the constant factor multiplication. + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. 
+ +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- +You should expect to observe decent learning behavior from your console output: + +With `--lr-const-factor=0.1`, `--lr-const-iters=10, and `--lr-exp_decay=0.3`. ++-----------------------------+------------+--------+------------------+ +| Trial name | status | iter | total time (s) | +| | | | | +|-----------------------------+------------+--------+------------------+ +| PPO_CartPole-v1_7fc44_00000 | TERMINATED | 50 | 59.6542 | ++-----------------------------+------------+--------+------------------+ ++------------------------+------------------------+------------------------+ +| episode_return_mean | num_episodes_lifetime | num_env_steps_traine | +| | | d_lifetime | ++------------------------+------------------------+------------------------| +| 451.2 | 9952 | 210047 | ++------------------------+------------------------+------------------------+ +""" +import functools +import numpy as np +from typing import Optional + +from ray.rllib.algorithms.algorithm import Algorithm +from ray.rllib.callbacks.callbacks import RLlibCallback +from ray.rllib.algorithms.ppo import PPOConfig +from ray.rllib.core import DEFAULT_MODULE_ID +from ray.rllib.core.learner.learner import DEFAULT_OPTIMIZER +from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig +from ray.rllib.utils.metrics import ( + ENV_RUNNER_RESULTS, + EPISODE_RETURN_MEAN, + EVALUATION_RESULTS, + NUM_ENV_STEPS_SAMPLED_LIFETIME, +) +from ray.rllib.utils.metrics.metrics_logger import MetricsLogger +from ray.rllib.utils.framework import try_import_torch +from ray.rllib.utils.test_utils import add_rllib_example_script_args + +torch, _ = try_import_torch() + + +class LRChecker(RLlibCallback): + def on_algorithm_init( + self, + *, + algorithm: "Algorithm", + metrics_logger: 
Optional[MetricsLogger] = None,
        **kwargs,
    ) -> None:
        # Store the expected learning rates for each iteration.
        self.lr = []
        # Retrieve the chosen configuration parameters from the config (the two
        # `functools.partial`-wrapped torch LR scheduler classes; `.keywords` holds
        # the kwargs they were partial'ed with).
        lr_factor = algorithm.config._torch_lr_scheduler_classes[0].keywords["factor"]
        lr_total_iters = algorithm.config._torch_lr_scheduler_classes[0].keywords[
            "total_iters"
        ]
        lr_gamma = algorithm.config._torch_lr_scheduler_classes[1].keywords["gamma"]
        # Precompute the expected learning rates for all iterations up to
        # `lr_total_iters`.
        for i in range(1, lr_total_iters + 1):
            # The initial learning rate.
            lr = algorithm.config.lr
            # In the iterations before `lr_total_iters`, we multiply by `lr_factor`
            # (mirrors torch's `ConstantLR`, which restores the base lr once
            # `total_iters` is reached).
            if i < lr_total_iters:
                lr *= lr_factor
            # Finally, we have an exponential decay of `lr_gamma` per iteration
            # (compounded: gamma**i after i iterations).
            lr *= lr_gamma**i
            self.lr.append(lr)

    def on_train_result(
        self,
        *,
        algorithm: "Algorithm",
        metrics_logger: Optional[MetricsLogger] = None,
        result: dict,
        **kwargs,
    ) -> None:

        # Check for the first `lr_total_iters + 1` iterations, if expected
        # and actual learning rates correspond.
        if (
            algorithm.training_iteration
            <= algorithm.config._torch_lr_scheduler_classes[0].keywords["total_iters"]
        ):
            # Read the actual lr straight from the Learner's default optimizer.
            actual_lr = algorithm.learner_group._learner.get_optimizer(
                DEFAULT_MODULE_ID, DEFAULT_OPTIMIZER
            ).param_groups[0]["lr"]
            # Assert the learning rates are close enough.
            assert np.isclose(
                actual_lr,
                self.lr[algorithm.training_iteration - 1],
                atol=1e-9,
                rtol=1e-9,
            )


parser = add_rllib_example_script_args(default_reward=450.0, default_timesteps=250000)
parser.set_defaults(enable_new_api_stack=True)
parser.add_argument(
    "--lr-const-factor",
    type=float,
    default=0.9,
    help="The factor by which the learning rate should be multiplied.",
)
parser.add_argument(
    "--lr-const-iters",
    type=int,
    default=10,
    help=(
        "The number of iterations by which the learning rate should be "
        "multiplied by the factor."
+ ), +) +parser.add_argument( + "--lr-exp-decay", + type=float, + default=0.99, + help="The rate by which the learning rate should exponentially decay.", +) + +if __name__ == "__main__": + # Use `parser` to add your own custom command line options to this script + # and (if needed) use their values to set up `config` below. + args = parser.parse_args() + + config = ( + PPOConfig() + .environment("CartPole-v1") + .training( + lr=0.03, + num_sgd_iter=6, + vf_loss_coeff=0.01, + ) + .rl_module( + model_config=DefaultModelConfig( + fcnet_hiddens=[32], + fcnet_activation="linear", + vf_share_layers=True, + ), + ) + .evaluation( + evaluation_num_env_runners=1, + evaluation_interval=1, + evaluation_parallel_to_training=True, + evaluation_config=PPOConfig.overrides(exploration=False), + ) + .experimental( + # Add two learning rate schedulers to be applied in sequence. + _torch_lr_scheduler_classes=[ + # Multiplies the learning rate by a factor of 0.1 for 10 iterations. + functools.partial( + torch.optim.lr_scheduler.ConstantLR, + factor=args.lr_const_factor, + total_iters=args.lr_const_iters, + ), + # Decays the learning rate after each gradients step by + # `args.lr_exp_decay`. 
+ functools.partial( + torch.optim.lr_scheduler.ExponentialLR, gamma=args.lr_exp_decay + ), + ] + ) + .callbacks( + LRChecker, + ) + ) + + stop = { + f"{NUM_ENV_STEPS_SAMPLED_LIFETIME}": args.stop_timesteps, + f"{EVALUATION_RESULTS}/{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": ( + args.stop_reward + ), + } + + if __name__ == "__main__": + from ray.rllib.utils.test_utils import run_rllib_example_script_experiment + + run_rllib_example_script_experiment(config, args, stop=stop) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/separate_vf_lr_and_optimizer.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/separate_vf_lr_and_optimizer.py new file mode 100644 index 0000000000000000000000000000000000000000..1e5359f1162b9b280e08273c81cb10284d1d8d19 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/separate_vf_lr_and_optimizer.py @@ -0,0 +1,135 @@ +"""Example of how to run any value function based algo (e.g. PPO) with 2 optimizers. + +One optimizer (with its own learning rate and other configurations) is responsible for +updating the policy network, the other (with its own learning rate and other +configurations) for updating the value function network. + +This example shows: + - how to subclass an existing (torch) Learner and override its + `configure_optimizers_for_module()` method. + - how to call `Learner.register_optimizer()` from within your custom + `configure_optimizers_for_module()` method in order to specify, which optimizer + (type, learning rate, other settings) is responsible for which neural network + parameters. + - how to add custom settings (here: the additional learning rate for the + vf-optimizer) to the `AlgorithmConfig` in order to not have to subclass and write + your own (you could still do that, but are not required to). + - how to plug in the custom Learner into your config and then run the + experiment. 
+ +See the :py:class:`~ray.rllib.examples.learners.classes.separate_vf_lr_and_optimizer_learner.PPOTorchLearnerWithSeparateVfOptimizer` # noqa +class for details on how to override the main (torch) `configure_optimizers_for_module` +function. + +We assume here that the users properly sets up their RLModule to have separate policy- +and value function networks. If any model pieces are shared between the two optimizers, +you should experience learning instability up to the point where your algorithm can't +learn any useful policy anymore. + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack --lr-vf=0.001 --lr-policy=0.0005` + +Use the `--lr-policy` option to set the policy learning rate (used by the policy +optimizer) and the `--lr-vf` option to set the value function learning rate (used by the +value function optimizer). + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. 
+ +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- +You should expect to observe decent learning behavior from your console output: + +With --lr-vf=0.0005 and --lr-policy=0.001 ++-----------------------------+------------+-----------------+--------+ +| Trial name | status | loc | iter | +| | | | | +|-----------------------------+------------+-----------------+--------+ +| PPO_CartPole-v1_7b404_00000 | TERMINATED | 127.0.0.1:16845 | 19 | ++-----------------------------+------------+-----------------+--------+ ++------------------+------------------------+---------------------+ +| total time (s) | num_env_steps_sampled_ | episode_return_mean | +| | _lifetime | | +|------------------+------------------------+---------------------+ +| 19.4179 | 76000 | 459.94 | ++------------------+------------------------+---------------------+ +""" + +from ray.rllib.algorithms.ppo import PPOConfig +from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig +from ray.rllib.examples.learners.classes.separate_vf_lr_and_optimizer_learner import ( + PPOTorchLearnerWithSeparateVfOptimizer, +) +from ray.rllib.utils.framework import try_import_torch +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) + +torch, _ = try_import_torch() + + +parser = add_rllib_example_script_args(default_reward=450.0) +parser.set_defaults(enable_new_api_stack=True) +parser.add_argument( + "--lr-vf", + type=float, + default=0.0005, + help="The learning rate used in the value function optimizer.", +) +parser.add_argument( + "--lr-policy", + type=float, + default=0.001, + help="The learning rate used in the policy optimizer.", +) + + +if __name__ == "__main__": + args = parser.parse_args() + + assert ( + args.enable_new_api_stack + ), "Must set 
--enable-new-api-stack when running this script!" + assert args.algo == "PPO", "Must set --algo=PPO when running this script!" + + base_config = ( + PPOConfig() + .environment("CartPole-v1") + .training( + # This is the most important setting in this script: We point our PPO + # algorithm to use the custom Learner (instead of the default + # PPOTorchLearner). + learner_class=PPOTorchLearnerWithSeparateVfOptimizer, + # We use this simple method here to inject a new setting that our + # custom Learner class uses in its `configure_optimizers_for_module` + # method. This is convenient and avoids having to subclass `PPOConfig` only + # to add a few new settings to it. Within our Learner, we can access this + # new setting through: + # `self.config.learner_config_dict['lr_vf']` + learner_config_dict={"lr_vf": args.lr_vf}, + # Some settings to make this example learn better. + num_epochs=6, + # Since we are using separate optimizers for the two NN components, the + # value of `vf_loss_coeff` does not matter anymore. We set this to 1.0 here. + vf_loss_coeff=1.0, + # The policy learning rate, settable through the command line `--lr` arg. + lr=args.lr_policy, + ) + .rl_module( + # Another very important setting is this here. Make sure you use + # completely separate NNs for policy and value-functions. 
+ model_config=DefaultModelConfig(vf_share_layers=False), + ) + ) + + run_rllib_example_script_experiment(base_config, args) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..338b7e115c41286dd2ddae59a801d57eb6f4b022 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/__pycache__/cartpole_recording.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/__pycache__/cartpole_recording.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..50cbb9d416c7f77693f1cabf7859804b36dec33b Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/__pycache__/cartpole_recording.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/__pycache__/custom_input_api.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/__pycache__/custom_input_api.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ff9c77c18542a42c5fd0e9c5686443df15015506 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/__pycache__/custom_input_api.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/__pycache__/offline_rl.cpython-311.pyc 
b/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/__pycache__/offline_rl.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9192020b4ab9f9e6534f7bbc6e24524c65b3ef96 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/__pycache__/offline_rl.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/__pycache__/offline_rl_with_image_data.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/__pycache__/offline_rl_with_image_data.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..91f967836a1553fc15426922b57a03884e4d2ef8 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/__pycache__/offline_rl_with_image_data.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/__pycache__/pretrain_bc_single_agent_evaluate_as_multi_agent.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/__pycache__/pretrain_bc_single_agent_evaluate_as_multi_agent.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c77868e595c8e5d126c1e7df9447b8d6f4480ef7 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/__pycache__/pretrain_bc_single_agent_evaluate_as_multi_agent.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/__pycache__/saving_experiences.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/__pycache__/saving_experiences.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..df24b9257002faf07a3f1a2e20bbd5b6171c2143 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/__pycache__/saving_experiences.cpython-311.pyc differ diff --git 
a/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/__pycache__/train_w_bc_finetune_w_ppo.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/__pycache__/train_w_bc_finetune_w_ppo.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..af6b9a73b4177be1d0918da1aeadd415b5a55529 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/__pycache__/train_w_bc_finetune_w_ppo.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/classes/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/classes/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/classes/__pycache__/image_offline_data.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/classes/__pycache__/image_offline_data.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..111ef7d48d2ee29b8bed3e6910beda10ad5e1a63 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/classes/__pycache__/image_offline_data.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/classes/__pycache__/image_offline_prelearner.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/classes/__pycache__/image_offline_prelearner.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5e606c31a156bd30ab51159d098e33ce8a5a0fe4 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/classes/__pycache__/image_offline_prelearner.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/classes/image_offline_data.py 
b/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/classes/image_offline_data.py new file mode 100644 index 0000000000000000000000000000000000000000..4f4ab5f5116fa3e625aa5440bb36ed28c4728e29 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/classes/image_offline_data.py @@ -0,0 +1,73 @@ +import io +import logging +import numpy as np + +from PIL import Image +from typing import Any, Dict + +from ray import data +from ray.rllib.algorithms.algorithm_config import AlgorithmConfig +from ray.rllib.offline.offline_data import OfflineData +from ray.rllib.offline.offline_prelearner import OfflinePreLearner +from ray.rllib.utils.annotations import override + +logger = logging.getLogger(__name__) + + +class ImageOfflineData(OfflineData): + """This class overrides `OfflineData` to read in raw image data. + + The image data is from Ray Data`s S3 example bucket, namely + `ray-example-data/batoidea/JPEGImages/`. + To read in this data the raw bytes have to be decoded and then + converted to `numpy` arrays. Each image array has a dimension + (32, 32, 3). + + To just read in the raw image data and convert it to arrays it + suffices to override the `OfflineData.__init__` method only. + Note, that further transformations of the data - specifically + into `SingleAgentEpisode` data - will be performed in a custom + `OfflinePreLearner` defined in the `image_offline_prelearner` + file. You could hard-code the usage of this prelearner here, + but you will use the `prelearner_class` attribute in the + `AlgorithmConfig` instead. + """ + + @override(OfflineData) + def __init__(self, config: AlgorithmConfig): + + # Set class attributes. + self.config = config + self.is_multi_agent = self.config.is_multi_agent + self.materialize_mapped_data = False + self.path = self.config.input_ + + self.data_read_batch_size = self.config.input_read_batch_size + self.data_is_mapped = False + + # Define your function to map images to numpy arrays. 
+ def map_to_numpy(row: Dict[str, Any]) -> Dict[str, Any]: + # Convert to byte stream. + bytes_stream = io.BytesIO(row["bytes"]) + # Convert to image. + image = Image.open(bytes_stream) + # Return an array of the image. + return {"array": np.array(image)} + + try: + # Load the dataset and transform to arrays on-the-fly. + self.data = data.read_binary_files(self.path).map(map_to_numpy) + except Exception as e: + logger.error(e) + + # Define further attributes needed in the `sample` method. + self.batch_iterator = None + self.map_batches_kwargs = self.config.map_batches_kwargs + self.iter_batches_kwargs = self.config.iter_batches_kwargs + # Use a custom OfflinePreLearner if needed. + self.prelearner_class = self.config.prelearner_class or OfflinePreLearner + + # For remote learner setups. + self.locality_hints = None + self.learner_handles = None + self.module_spec = None diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/classes/image_offline_prelearner.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/classes/image_offline_prelearner.py new file mode 100644 index 0000000000000000000000000000000000000000..001af304929ec2554fb6f3936ec788f42ab2f261 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/classes/image_offline_prelearner.py @@ -0,0 +1,101 @@ +import gymnasium as gym +import numpy as np +import random +import uuid + +from typing import Any, Dict, List, Optional, Tuple, Union + +from ray.actor import ActorHandle +from ray.rllib.algorithms.algorithm_config import AlgorithmConfig +from ray.rllib.core.learner.learner import Learner +from ray.rllib.core.rl_module.multi_rl_module import MultiRLModuleSpec +from ray.rllib.env.single_agent_episode import SingleAgentEpisode +from ray.rllib.offline.offline_prelearner import OfflinePreLearner, SCHEMA +from ray.rllib.utils.annotations import override +from ray.rllib.utils.typing import EpisodeType, ModuleID + + +class 
ImageOfflinePreLearner(OfflinePreLearner): + """This class transforms image data to `MultiAgentBatch`es. + + While the `ImageOfflineData` class transforms raw image + bytes to `numpy` arrays, this class maps these data in + `SingleAgentEpisode` instances through the learner connector + pipeline and finally outputs a >`MultiAgentBatch` ready for + training in RLlib's `Learner`s. + + Note, the basic transformation from images to `SingleAgentEpisode` + instances creates synthetic data that does not rely on any MDP + and therefore no agent can learn from it. However, this example + should show how to transform data into this form through + overriding the `OfflinePreLearner`. + """ + + def __init__( + self, + config: "AlgorithmConfig", + learner: Union[Learner, List[ActorHandle]], + spaces: Optional[Tuple[gym.Space, gym.Space]] = None, + module_spec: Optional[MultiRLModuleSpec] = None, + module_state: Optional[Dict[ModuleID, Any]] = None, + **kwargs: Dict[str, Any], + ): + # Set up necessary class attributes. + self.config = config + self.action_space = spaces[1] + self.observation_space = spaces[0] + self.input_read_episodes = self.config.input_read_episodes + self.input_read_sample_batches = self.config.input_read_sample_batches + self._policies_to_train = "default_policy" + self._is_multi_agent = False + + # Build the `MultiRLModule` needed for the learner connector. + self._module = module_spec.build() + + # Build the learner connector pipeline. 
+ self._learner_connector = self.config.build_learner_connector( + input_observation_space=self.observation_space, + input_action_space=self.action_space, + ) + + @override(OfflinePreLearner) + @staticmethod + def _map_to_episodes( + is_multi_agent: bool, + batch: Dict[str, Union[list, np.ndarray]], + schema: Dict[str, str] = SCHEMA, + to_numpy: bool = False, + input_compress_columns: Optional[List[str]] = None, + observation_space: gym.Space = None, + action_space: gym.Space = None, + **kwargs: Dict[str, Any], + ) -> Dict[str, List[EpisodeType]]: + + # Define a container for the episodes. + episodes = [] + + # Batches come in as numpy arrays. + for i, obs in enumerate(batch["array"]): + + # Construct your episode. + episode = SingleAgentEpisode( + id_=uuid.uuid4().hex, + observations=[obs, obs], + observation_space=observation_space, + actions=[action_space.sample()], + action_space=action_space, + rewards=[random.random()], + terminated=True, + truncated=False, + len_lookback_buffer=0, + t_started=0, + ) + + # Numpy'ize, if necessary. + if to_numpy: + episode.to_numpy() + + # Store the episode in the container. + episodes.append(episode) + + return {"episodes": episodes} diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/custom_input_api.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/custom_input_api.py new file mode 100644 index 0000000000000000000000000000000000000000..789e64a2a357ee64069081cdf2d0a69ac975d5c7 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/custom_input_api.py @@ -0,0 +1,134 @@ +# @OldAPIStack + +"""Example of creating a custom input API + +Custom input apis are useful when your data source is in a custom format or +when it is necessary to use an external data loading mechanism. +In this example, we train an rl agent on user specified input data. 
+Instead of using the built in JsonReader, we will create our own custom input +api, and show how to pass config arguments to it. + +To train CQL on the pendulum environment: +$ python custom_input_api.py --input-files=../tests/data/pendulum/enormous.zip +""" + +import argparse +import os + +import ray +from ray import air, tune +from ray.air.constants import TRAINING_ITERATION +from ray.rllib.offline import JsonReader, ShuffledInput, IOContext, InputReader +from ray.rllib.utils.metrics import ( + ENV_RUNNER_RESULTS, + EPISODE_RETURN_MEAN, + EVALUATION_RESULTS, +) +from ray.tune.registry import get_trainable_cls, register_input + +parser = argparse.ArgumentParser() +parser.add_argument( + "--run", type=str, default="CQL", help="The RLlib-registered algorithm to use." +) +parser.add_argument( + "--framework", + choices=["tf", "tf2", "torch"], + default="torch", + help="The DL framework specifier.", +) +parser.add_argument("--stop-iters", type=int, default=100) +parser.add_argument( + "--input-files", + type=str, + default=os.path.join( + os.path.dirname(os.path.abspath(__file__)), + "../../tests/data/pendulum/small.json", + ), +) + + +class CustomJsonReader(JsonReader): + """ + Example custom InputReader implementation (extended from JsonReader). + + This gets wrapped in ShuffledInput to comply with offline rl algorithms. + """ + + def __init__(self, ioctx: IOContext): + """ + The constructor must take an IOContext to be used in the input config. + Args: + ioctx: use this to access the `input_config` arguments. + """ + super().__init__(ioctx.input_config["input_files"], ioctx) + + +def input_creator(ioctx: IOContext) -> InputReader: + """ + The input creator method can be used in the input registry or set as the + config["input"] parameter. + + Args: + ioctx: use this to access the `input_config` arguments. 
+ + Returns: + instance of ShuffledInput to work with some offline rl algorithms + """ + return ShuffledInput(CustomJsonReader(ioctx)) + + +if __name__ == "__main__": + ray.init() + args = parser.parse_args() + + # make absolute path because relative path looks in result directory + args.input_files = os.path.abspath(args.input_files) + + # we register our custom input creator with this convenient function + register_input("custom_input", input_creator) + + # Config modified from rllib/tuned_examples/cql/pendulum-cql.yaml + default_config = get_trainable_cls(args.run).get_default_config() + config = ( + default_config.environment("Pendulum-v1", clip_actions=True) + .framework(args.framework) + .offline_data( + # We can either use the tune registry ... + input_="custom_input", + # ... full classpath + # input_: "ray.rllib.examples.offline_rl.custom_input_api.CustomJsonReader" + # ... or a direct function to connect our input api. + # input_: input_creator + input_config={"input_files": args.input_files}, # <- passed to IOContext + actions_in_input_normalized=True, + ) + .training(train_batch_size=2000) + .evaluation( + evaluation_interval=1, + evaluation_num_env_runners=2, + evaluation_duration=10, + evaluation_parallel_to_training=True, + evaluation_config=default_config.overrides( + input_="sampler", + explore=False, + ), + ) + .reporting(metrics_num_episodes_for_smoothing=5) + ) + + if args.run == "CQL": + config.training( + twin_q=True, + num_steps_sampled_before_learning_starts=0, + bc_iters=100, + ) + + stop = { + TRAINING_ITERATION: args.stop_iters, + f"{EVALUATION_RESULTS}/{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": -600, + } + + tuner = tune.Tuner( + args.run, param_space=config, run_config=air.RunConfig(stop=stop, verbose=1) + ) + tuner.fit() diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/offline_rl.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/offline_rl.py new file mode 100644 index 
0000000000000000000000000000000000000000..5679fc1ac63b3eb403864ef94d1245736623388b --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/offline_rl.py @@ -0,0 +1,167 @@ +# @OldAPIStack + +"""Example on how to use CQL to learn from an offline JSON file. + +Important node: Make sure that your offline data file contains only +a single timestep per line to mimic the way SAC pulls samples from +the buffer. + +Generate the offline json file by running an SAC algo until it reaches expert +level on your command line. For example: +$ cd ray +$ rllib train -f rllib/tuned_examples/sac/pendulum-sac.yaml --no-ray-ui + +Also make sure that in the above SAC yaml file (pendulum-sac.yaml), +you specify an additional "output" key with any path on your local +file system. In that path, the offline json files will be written to. + +Use the generated file(s) as "input" in the CQL config below +(`config["input"] = [list of your json files]`), then run this script. +""" + +import argparse +import numpy as np + +from ray.rllib.policy.sample_batch import convert_ma_batch_to_sample_batch +from ray.rllib.algorithms import cql as cql +from ray.rllib.execution.rollout_ops import ( + synchronous_parallel_sample, +) +from ray.rllib.utils.framework import try_import_torch +from ray.rllib.utils.metrics import ( + ENV_RUNNER_RESULTS, + EPISODE_RETURN_MEAN, + EVALUATION_RESULTS, +) + +torch, _ = try_import_torch() + +parser = argparse.ArgumentParser() +parser.add_argument( + "--as-test", + action="store_true", + help="Whether this script should be run as a test: --stop-reward must " + "be achieved within --stop-timesteps AND --stop-iters.", +) +parser.add_argument( + "--stop-iters", type=int, default=5, help="Number of iterations to train." +) +parser.add_argument( + "--stop-reward", type=float, default=50.0, help="Reward at which we stop training." 
+) + + +if __name__ == "__main__": + args = parser.parse_args() + + # See rllib/tuned_examples/cql/pendulum-cql.yaml for comparison. + config = ( + cql.CQLConfig() + .api_stack( + enable_env_runner_and_connector_v2=False, + enable_rl_module_and_learner=False, + ) + .framework(framework="torch") + .env_runners(num_env_runners=0) + .training( + n_step=3, + bc_iters=0, + clip_actions=False, + tau=0.005, + target_entropy="auto", + q_model_config={ + "fcnet_hiddens": [256, 256], + "fcnet_activation": "relu", + }, + policy_model_config={ + "fcnet_hiddens": [256, 256], + "fcnet_activation": "relu", + }, + optimization_config={ + "actor_learning_rate": 3e-4, + "critic_learning_rate": 3e-4, + "entropy_learning_rate": 3e-4, + }, + train_batch_size=256, + target_network_update_freq=1, + num_steps_sampled_before_learning_starts=256, + ) + .reporting(min_train_timesteps_per_iteration=1000) + .debugging(log_level="INFO") + .environment("Pendulum-v1", normalize_actions=True) + .offline_data( + input_config={ + "paths": ["tests/data/pendulum/enormous.zip"], + "format": "json", + } + ) + .evaluation( + evaluation_num_env_runners=1, + evaluation_interval=1, + evaluation_duration=10, + evaluation_parallel_to_training=False, + evaluation_config=cql.CQLConfig.overrides(input_="sampler"), + ) + ) + # evaluation_parallel_to_training should be False b/c iterations are very long + # and this would cause evaluation to lag one iter behind training. + + # Check, whether we can learn from the given file in `num_iterations` + # iterations, up to a reward of `min_reward`. + num_iterations = 5 + min_reward = -300 + + cql_algorithm = cql.CQL(config=config) + learnt = False + for i in range(num_iterations): + print(f"Iter {i}") + eval_results = cql_algorithm.train().get(EVALUATION_RESULTS) + if eval_results: + print( + "... R={}".format(eval_results[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN]) + ) + # Learn until some reward is reached on an actual live env. 
+ if eval_results[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN] >= min_reward: + # Test passed gracefully. + if args.as_test: + print("Test passed after {} iterations.".format(i)) + quit(0) + learnt = True + break + + # Get policy and model. + cql_policy = cql_algorithm.get_policy() + cql_model = cql_policy.model + + # If you would like to query CQL's learnt Q-function for arbitrary + # (cont.) actions, do the following: + obs_batch = torch.from_numpy(np.random.random(size=(5, 3))) + action_batch = torch.from_numpy(np.random.random(size=(5, 1))) + q_values = cql_model.get_q_values(obs_batch, action_batch)[0] + # If you are using the "twin_q", there'll be 2 Q-networks and + # we usually consider the min of the 2 outputs, like so: + twin_q_values = cql_model.get_twin_q_values(obs_batch, action_batch)[0] + final_q_values = torch.min(q_values, twin_q_values)[0] + print(f"final_q_values={final_q_values.detach().numpy()}") + + # Example on how to do evaluation on the trained Algorithm. + # using the data from our buffer. + # Get a sample (MultiAgentBatch). + + batch = synchronous_parallel_sample(worker_set=cql_algorithm.env_runner_group) + batch = convert_ma_batch_to_sample_batch(batch) + obs = torch.from_numpy(batch["obs"]) + # Pass the observations through our model to get the + # features, which then to pass through the Q-head. + model_out, _ = cql_model({"obs": obs}) + # The estimated Q-values from the (historic) actions in the batch. + q_values_old = cql_model.get_q_values( + model_out, torch.from_numpy(batch["actions"]) + )[0] + # The estimated Q-values for the new actions computed by our policy. 
+ actions_new = cql_policy.compute_actions_from_input_dict({"obs": obs})[0] + q_values_new = cql_model.get_q_values(model_out, torch.from_numpy(actions_new))[0] + print(f"Q-val batch={q_values_old.detach().numpy()}") + print(f"Q-val policy={q_values_new.detach().numpy()}") + + cql_algorithm.stop() diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/train_w_bc_finetune_w_ppo.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/train_w_bc_finetune_w_ppo.py new file mode 100644 index 0000000000000000000000000000000000000000..f3e52b35526dcd95abdc4efd39769b3d86a8a5b2 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/train_w_bc_finetune_w_ppo.py @@ -0,0 +1,300 @@ +"""Example of training a custom RLModule with BC first, then finetuning it with PPO. + +This example: + - demonstrates how to write a very simple custom BC RLModule. + - run a quick BC training experiment with the custom module and learn CartPole + until some episode return A, while checkpointing each iteration. + - shows how subclass the custom BC RLModule, add the ValueFunctionAPI to the + new class, and add a value-function branch and an implementation of + `compute_values` to the original model to make it work with a value-based algo + like PPO. + - shows how to plug this new PPO-capable RLModule (including its checkpointed state + from the BC run) into your algorithm's config. + - confirms that even after 1-2 training iterations with PPO, no catastrophic + forgetting occurs (due to the additional value function branch and the switched + optimizer). + - uses Tune and RLlib to continue training the model until a higher return of B + is reached. 
+ + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack` + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- +In the console output, you can first see BC's performance until return A is reached: ++----------------------------+------------+----------------+--------+ +| Trial name | status | loc | iter | +| | | | | +|----------------------------+------------+----------------+--------+ +| BC_CartPole-v1_95ba0_00000 | TERMINATED | 127.0.0.1:1515 | 51 | ++----------------------------+------------+----------------+--------+ ++------------------+------------------------+------------------------+ +| total time (s) | episode_return_mean | num_env_steps_traine | +| | | d_lifetime | +|------------------+------------------------|------------------------| +| 11.4828 | 250.5 | 42394 | ++------------------+------------------------+------------------------+ + +The script should confirm that no catastrophic forgetting has taken place: + +PPO return after initialization: 292.3 +PPO return after 2x training: 276.85 + +Then, after PPO training, you should see something like this (higher return): ++-----------------------------+------------+----------------+--------+ +| Trial name | status | loc | iter | +| | | | | +|-----------------------------+------------+----------------+--------+ +| PPO_CartPole-v1_e07ac_00000 | TERMINATED | 127.0.0.1:6032 | 37 | ++-----------------------------+------------+----------------+--------+ + ++------------------+------------------------+------------------------+ +| total time (s) | 
episode_return_mean    | num_episodes_lifetime  |
|                  |                        |                        |
+------------------+------------------------+------------------------+
|          32.7647 |                 450.76 |                    406 |
+------------------+------------------------+------------------------+
"""
from pathlib import Path

from torch import nn

from ray.rllib.algorithms.bc import BCConfig
from ray.rllib.algorithms.ppo import PPOConfig
from ray.rllib.core import (
    COMPONENT_LEARNER_GROUP,
    COMPONENT_LEARNER,
    COMPONENT_RL_MODULE,
)
from ray.rllib.core.columns import Columns
from ray.rllib.core.models.base import ENCODER_OUT
from ray.rllib.core.models.configs import MLPEncoderConfig, MLPHeadConfig
from ray.rllib.core.rl_module.apis.value_function_api import ValueFunctionAPI
from ray.rllib.core.rl_module.rl_module import RLModule, RLModuleSpec
from ray.rllib.core.rl_module.torch import TorchRLModule
from ray.rllib.utils.annotations import override
from ray.rllib.utils.metrics import (
    ENV_RUNNER_RESULTS,
    EPISODE_RETURN_MEAN,
    EVALUATION_RESULTS,
)
from ray.rllib.utils.test_utils import (
    add_rllib_example_script_args,
    run_rllib_example_script_experiment,
)

# Standard RLlib example-script CLI parser (provides --num-learners,
# --no-tune, --wandb-*, etc.); this example pins env and enables checkpointing
# every iteration so the best BC weights can be reloaded into PPO below.
parser = add_rllib_example_script_args()
parser.set_defaults(
    enable_new_api_stack=True,
    env="CartPole-v1",
    checkpoint_freq=1,
)


class MyBCModel(TorchRLModule):
    """A very simple BC-usable model that only computes action logits."""

    @override(TorchRLModule)
    def setup(self):
        """Builds the encoder trunk and the policy head.

        Called once by RLlib to construct the module's submodels; no value
        function exists here (BC does not need one).
        """
        # Create an encoder trunk.
        # Observations are directly passed through it and feature vectors are output.
        self._encoder = MLPEncoderConfig(
            input_dims=[4],  # CartPole
            hidden_layer_dims=[256, 256],
            hidden_layer_activation="relu",
            output_layer_dim=None,
        ).build(framework="torch")

        # The policy head sitting on top of the encoder. Feature vectors come in as
        # input and action logits are output.
        self._pi = MLPHeadConfig(
            input_dims=[256],  # from encoder
            hidden_layer_dims=[256],  # pi head
            hidden_layer_activation="relu",
            output_layer_dim=2,  # CartPole
            output_layer_activation="linear",
        ).build(framework="torch")

    @override(TorchRLModule)
    def _forward_inference(self, batch, **kwargs):
        """Returns a dict with action-distribution inputs (logits) for `batch`."""
        return {Columns.ACTION_DIST_INPUTS: self._pi(self._encoder(batch)[ENCODER_OUT])}

    @override(RLModule)
    def _forward_exploration(self, batch, **kwargs):
        # Exploration uses the exact same logits as inference.
        return self._forward_inference(batch)

    @override(RLModule)
    def _forward_train(self, batch, **kwargs):
        # BC trains on plain logits; same computation as inference.
        return self._forward_inference(batch)


class MyPPOModel(MyBCModel, ValueFunctionAPI):
    """Subclass of our simple BC model, but implementing the ValueFunctionAPI.

    Implementing the `compute_values` method makes this RLModule usable by algos
    like PPO.
    """

    @override(MyBCModel)
    def setup(self):
        """Builds encoder + pi head (via super) and adds a zero-initialized vf head."""
        # Call super setup to create encoder trunk and policy head.
        super().setup()
        # Create the new value function head and zero-initialize it to not cause too
        # much disruption.
        self._vf = MLPHeadConfig(
            input_dims=[256],  # from encoder
            hidden_layer_dims=[256],  # vf head
            hidden_layer_activation="relu",
            hidden_layer_weights_initializer=nn.init.zeros_,
            hidden_layer_bias_initializer=nn.init.zeros_,
            output_layer_dim=1,  # 1=value node
            output_layer_activation="linear",
            output_layer_weights_initializer=nn.init.zeros_,
            output_layer_bias_initializer=nn.init.zeros_,
        ).build(framework="torch")

    @override(MyBCModel)
    def _forward_train(self, batch, **kwargs):
        """Train forward pass: logits plus value predictions (both from one
        shared encoder pass)."""
        features = self._encoder(batch)[ENCODER_OUT]
        logits = self._pi(features)
        # Squeeze the trailing size-1 value dim so VF_PREDS is (batch,)-shaped.
        vf_out = self._vf(features).squeeze(-1)
        return {
            Columns.ACTION_DIST_INPUTS: logits,
            Columns.VF_PREDS: vf_out,
        }

    @override(ValueFunctionAPI)
    def compute_values(self, batch, embeddings=None):
        """Returns value estimates for `batch`; reuses `embeddings` if provided."""
        # Compute embeddings ...
        if embeddings is None:
            embeddings = self._encoder(batch)[ENCODER_OUT]
        # then values using our value head.
        return self._vf(embeddings).squeeze(-1)


if __name__ == "__main__":
    args = parser.parse_args()

    assert args.env == "CartPole-v1", "This example works only with --env=CartPole-v1!"

    # Define the data paths for our CartPole large dataset.
    # NOTE(review): assumes this script lives two directories below the rllib
    # root so that `tests/data/...` resolves — verify if the file is relocated.
    base_path = Path(__file__).parents[2]
    assert base_path.is_dir(), base_path
    data_path = base_path / "tests/data/cartpole/cartpole-v1_large"
    assert data_path.is_dir(), data_path
    print(f"data_path={data_path}")

    # Define the BC config.
    base_config = (
        BCConfig()
        # Note, the `input_` argument is the major argument for the
        # new offline API. Via the `input_read_method_kwargs` the
        # arguments for the `ray.data.Dataset` read method can be
        # configured. The read method needs at least as many blocks
        # as remote learners.
        .offline_data(
            input_=[data_path.as_posix()],
            # Define the number of reading blocks, these should be larger than 1
            # and aligned with the data size.
            input_read_method_kwargs={
                "override_num_blocks": max((args.num_learners or 1) * 2, 2)
            },
            # Concurrency defines the number of processes that run the
            # `map_batches` transformations. This should be aligned with the
            # 'prefetch_batches' argument in 'iter_batches_kwargs'.
            map_batches_kwargs={"concurrency": 2, "num_cpus": 2},
            # This data set is small so do not prefetch too many batches and use no
            # local shuffle.
            iter_batches_kwargs={
                "prefetch_batches": 1,
                "local_shuffle_buffer_size": None,
            },
            # The number of iterations to be run per learner when in multi-learner
            # mode in a single RLlib training iteration. Leave this to `None` to
            # run an entire epoch on the dataset during a single RLlib training
            # iteration. For single-learner mode, 1 is the only option.
            dataset_num_iters_per_learner=1 if not args.num_learners else None,
        ).training(
            train_batch_size_per_learner=1024,
            # To increase learning speed with multiple learners,
            # increase the learning rate correspondingly (sqrt scaling rule).
            lr=0.0008 * (args.num_learners or 1) ** 0.5,
        )
        # Plug in our simple custom BC model from above.
        .rl_module(rl_module_spec=RLModuleSpec(module_class=MyBCModel))
        # Run evaluation to observe how good our BC policy already is.
        .evaluation(
            evaluation_interval=3,
            evaluation_num_env_runners=1,
            evaluation_duration=5,
            evaluation_parallel_to_training=True,
        )
    )

    # Run the BC experiment and stop at R=250.0
    metric_key = f"{EVALUATION_RESULTS}/{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}"
    stop = {metric_key: 250.0}
    results = run_rllib_example_script_experiment(base_config, args, stop=stop)

    # Extract the RLModule checkpoint (subdirectory of the best trial's
    # algorithm checkpoint that holds only the default policy's module state).
    best_result = results.get_best_result(metric_key)
    rl_module_checkpoint = (
        Path(best_result.checkpoint.path)
        / COMPONENT_LEARNER_GROUP
        / COMPONENT_LEARNER
        / COMPONENT_RL_MODULE
        / "default_policy"
    )

    # Create a new PPO config.
    base_config = (
        PPOConfig()
        .environment(args.env)
        .training(
            # Keep lr relatively low at the beginning to avoid catastrophic forgetting.
            lr=0.00002,
            num_epochs=6,
            vf_loss_coeff=0.01,
        )
        # Plug in our simple custom PPO model from above. Note that the checkpoint
        # for the BC model is loadable into the PPO model, b/c the BC model is a subset
        # of the PPO model (all weights/biases in the BC model are also found in the PPO
        # model; the PPO model only has an additional value function branch).
        .rl_module(
            rl_module_spec=RLModuleSpec(
                module_class=MyPPOModel,
                load_state_path=rl_module_checkpoint,
            )
        )
    )

    # Quick test, whether initial performance in the loaded (now PPO) model is ok.
    ppo = base_config.build()
    eval_results = ppo.evaluate()
    R = eval_results[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN]
    assert R >= 200.0, f"Initial PPO performance bad! R={R} (expected 200.0+)."
    print(f"PPO return after initialization: {R}")
    # Check, whether training 2 times causes catastrophic forgetting.
    ppo.train()
    train_results = ppo.train()
    R = train_results[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN]
    assert R >= 250.0, f"PPO performance (training) bad! R={R} (expected 250.0+)."
    print(f"PPO return after 2x training: {R}")

    # Perform actual PPO training run (this time until 450.0 return).
    stop = {
        f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": 450.0,
    }
    run_rllib_example_script_experiment(base_config, args, stop=stop)