diff --git a/.gitattributes b/.gitattributes
index ec645e70edaa1c0663dd5b1921f3bf52b115fb6d..db23df24a51b016a716ec6e56057a782e5fe1690 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -177,3 +177,4 @@ tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/_
 .venv/lib/python3.11/site-packages/ray/_private/__pycache__/test_utils.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
 .venv/lib/python3.11/site-packages/ray/_private/thirdparty/pynvml/__pycache__/pynvml.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
 .venv/lib/python3.11/site-packages/ray/rllib/algorithms/__pycache__/algorithm.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
+.venv/lib/python3.11/site-packages/ray/rllib/algorithms/__pycache__/algorithm_config.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/__pycache__/algorithm_config.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/__pycache__/algorithm_config.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e74256f183ede0d05050c27d51e7043e30735fa0
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/__pycache__/algorithm_config.cpython-311.pyc
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e9c834df6257a1158af124427e411086f3fcc8eb2ea4c080f29143c4a418c67c
+size 250369
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/algorithms/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/algorithms/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/algorithms/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/algorithms/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..861b915146949dbe03a74f7abf04d8cd0a809b9f
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/algorithms/__pycache__/__init__.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/algorithms/__pycache__/vpg_custom_algorithm.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/algorithms/__pycache__/vpg_custom_algorithm.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ba41abdb5149b5306e94dd30eb87f92bc78326c0
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/algorithms/__pycache__/vpg_custom_algorithm.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/algorithms/classes/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/algorithms/classes/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/algorithms/classes/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/algorithms/classes/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d81106dd0142bf183a063e9e536248cb6f79d8dc
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/algorithms/classes/__pycache__/__init__.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/algorithms/classes/__pycache__/vpg.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/algorithms/classes/__pycache__/vpg.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1e0b09b41b4b9a05b1a5f1f2d824e3a6eb641769
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/algorithms/classes/__pycache__/vpg.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/algorithms/classes/vpg.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/algorithms/classes/vpg.py
new file mode 100644
index 0000000000000000000000000000000000000000..4a44122557f49ce900e13feb0161bbb065f6c655
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/algorithms/classes/vpg.py
@@ -0,0 +1,176 @@
+import tree  # pip install dm_tree
+
+from ray.rllib.algorithms import Algorithm
+from ray.rllib.algorithms.algorithm_config import AlgorithmConfig, NotProvided
+from ray.rllib.core.rl_module.rl_module import RLModuleSpec
+from ray.rllib.utils.annotations import override
+from ray.rllib.utils.metrics import (
+    ENV_RUNNER_RESULTS,
+    ENV_RUNNER_SAMPLING_TIMER,
+    LEARNER_RESULTS,
+    LEARNER_UPDATE_TIMER,
+    NUM_ENV_STEPS_SAMPLED_LIFETIME,
+    SYNCH_WORKER_WEIGHTS_TIMER,
+    TIMERS,
+)
+
+
+class VPGConfig(AlgorithmConfig):
+    """A simple VPG (vanilla policy gradient) algorithm w/o value function support.
+
+    Use for testing purposes only!
+
+    This Algorithm should use the VPGTorchLearner and VPGTorchRLModule
+    """
+
+    # A test setting to activate metrics on mean weights.
+    report_mean_weights: bool = True
+
+    def __init__(self, algo_class=None):
+        super().__init__(algo_class=algo_class or VPG)
+
+        # VPG specific settings.
+        self.num_episodes_per_train_batch = 10
+        # Note that we don't have to set this here, because we tell the EnvRunners
+        # explicitly to sample entire episodes. However, for good measure, we change
+        # this setting here either way.
+        self.batch_mode = "complete_episodes"
+
+        # VPG specific defaults (from AlgorithmConfig).
+        self.num_env_runners = 1
+
+    @override(AlgorithmConfig)
+    def training(
+        self, *, num_episodes_per_train_batch=NotProvided, **kwargs
+    ) -> "VPGConfig":
+        """Sets the training related configuration.
+
+        Args:
+            num_episodes_per_train_batch: The number of complete episodes per train
+                batch. VPG requires entire episodes to be sampled from the EnvRunners.
+                For environments with varying episode lengths, this leads to varying
+                batch sizes (in timesteps) as well possibly causing slight learning
+                instabilities. However, for simplicity reasons, we stick to collecting
+                always exactly n episodes per training update.
+
+        Returns:
+            This updated AlgorithmConfig object.
+        """
+        # Pass kwargs onto super's `training()` method.
+        super().training(**kwargs)
+
+        if num_episodes_per_train_batch is not NotProvided:
+            self.num_episodes_per_train_batch = num_episodes_per_train_batch
+
+        return self
+
+    @override(AlgorithmConfig)
+    def get_default_rl_module_spec(self):
+        if self.framework_str == "torch":
+            from ray.rllib.examples.rl_modules.classes.vpg_torch_rlm import (
+                VPGTorchRLModule,
+            )
+
+            spec = RLModuleSpec(
+                module_class=VPGTorchRLModule,
+                model_config={"hidden_dim": 64},
+            )
+        else:
+            raise ValueError(f"Unsupported framework: {self.framework_str}")
+
+        return spec
+
+    @override(AlgorithmConfig)
+    def get_default_learner_class(self):
+        if self.framework_str == "torch":
+            from ray.rllib.examples.learners.classes.vpg_torch_learner import (
+                VPGTorchLearner,
+            )
+
+            return VPGTorchLearner
+        else:
+            raise ValueError(f"Unsupported framework: {self.framework_str}")
+
+
+class VPG(Algorithm):
+    """Example vanilla policy gradient (VPG) Algorithm (see VPGConfig)."""
+    @classmethod
+    @override(Algorithm)
+    def get_default_config(cls) -> AlgorithmConfig:
+        return VPGConfig()
+
+    @override(Algorithm)
+    def training_step(self) -> None:
+        """Override of the training_step method of `Algorithm`.
+
+        Runs the following steps per call:
+        - Sample complete episodes: each EnvRunner is asked for
+        `num_episodes_per_train_batch // (num_env_runners or 1)` episodes (see
+        `_sample_episodes`).
+        - Send the collected episodes to the VPG LearnerGroup for model updating.
+        - Sync the weights from LearnerGroup to all EnvRunners.
+        """
+        # Sample.
+        with self.metrics.log_time((TIMERS, ENV_RUNNER_SAMPLING_TIMER)):
+            episodes, env_runner_results = self._sample_episodes()
+        # Merge results from n parallel sample calls into self's metrics logger.
+        self.metrics.merge_and_log_n_dicts(env_runner_results, key=ENV_RUNNER_RESULTS)
+
+        # Just for demonstration purposes, log the number of timesteps sampled in this
+        # `training_step` round as a mean over a window of 100:
+        self.metrics.log_value(
+            "episode_timesteps_sampled_mean_win100",
+            sum(map(len, episodes)),
+            reduce="mean",
+            window=100,
+        )
+        # Exponential Moving Average (EMA) with coeff=0.1:
+        self.metrics.log_value(
+            "episode_timesteps_sampled_ema",
+            sum(map(len, episodes)),
+            ema_coeff=0.1,  # <- weight of new value; weight of old avg=1.0-ema_coeff
+        )
+
+        # Update model.
+        with self.metrics.log_time((TIMERS, LEARNER_UPDATE_TIMER)):
+            learner_results = self.learner_group.update_from_episodes(
+                episodes=episodes,
+                timesteps={
+                    NUM_ENV_STEPS_SAMPLED_LIFETIME: (
+                        self.metrics.peek(
+                            (ENV_RUNNER_RESULTS, NUM_ENV_STEPS_SAMPLED_LIFETIME)
+                        )
+                    ),
+                },
+            )
+        # Merge results from m parallel update calls into self's metrics logger.
+        self.metrics.merge_and_log_n_dicts(learner_results, key=LEARNER_RESULTS)
+
+        # Sync weights.
+        with self.metrics.log_time((TIMERS, SYNCH_WORKER_WEIGHTS_TIMER)):
+            self.env_runner_group.sync_weights(
+                from_worker_or_learner_group=self.learner_group,
+                inference_only=True,
+            )
+
+    def _sample_episodes(self):
+        """Requests episodes from the EnvRunners; returns (episodes, metrics dicts)."""
+        # Episodes per EnvRunner. NOTE(review): `//` floors, dropping any remainder.
+        num_episodes_per_env_runner = self.config.num_episodes_per_train_batch // (
+            self.config.num_env_runners or 1
+        )
+        # Send parallel remote requests to sample and get the metrics.
+        sampled_data = self.env_runner_group.foreach_env_runner(
+            # Return tuple of [episodes], [metrics] from each EnvRunner.
+            lambda env_runner: (
+                env_runner.sample(num_episodes=num_episodes_per_env_runner),
+                env_runner.get_metrics(),
+            ),
+            # Use the local EnvRunner only if there aren't any remote ones.
+            local_env_runner=self.env_runner_group.num_remote_workers() <= 0,
+        )
+        # Return one list of episodes and a list of metrics dicts (one per EnvRunner).
+        episodes = tree.flatten([s[0] for s in sampled_data])
+        stats_dicts = [s[1] for s in sampled_data]
+
+        return episodes, stats_dicts
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/algorithms/vpg_custom_algorithm.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/algorithms/vpg_custom_algorithm.py
new file mode 100644
index 0000000000000000000000000000000000000000..9dbc259204b0a87c8a5bf818140f1c59c6995e1c
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/algorithms/vpg_custom_algorithm.py
@@ -0,0 +1,117 @@
+"""Example of how to write a custom Algorithm.
+
+This is an end-to-end example for how to implement a custom Algorithm, including
+a matching AlgorithmConfig class and Learner class. There is no particular RLModule API
+needed for this algorithm, which means that any TorchRLModule returning actions
+or action distribution parameters suffices.
+
+The RL algorithm implemented here is "vanilla policy gradient" (VPG) in its simplest
+form, without a value function baseline.
+
+See the actual VPG algorithm class here:
+https://github.com/ray-project/ray/blob/master/rllib/examples/algorithms/classes/vpg.py
+
+The Learner class the algorithm uses by default (if the user doesn't specify a custom
+Learner):
+https://github.com/ray-project/ray/blob/master/rllib/examples/learners/classes/vpg_torch_learner.py  # noqa
+
+And the RLModule class the algorithm uses by default (if the user doesn't specify a
+custom RLModule):
+https://github.com/ray-project/ray/blob/master/rllib/examples/rl_modules/classes/vpg_torch_rlm.py  # noqa
+
+This example shows:
+    - how to subclass the AlgorithmConfig base class to implement a custom algorithm's
+    config class.
+    - how to subclass the Algorithm base class to implement a custom Algorithm,
+    including its `training_step` method.
+    - how to subclass the TorchLearner base class to implement a custom Learner with
+    loss function, overriding `compute_loss_for_module` and
+    `after_gradient_based_update`.
+    - how to define a default RLModule used by the algorithm in case the user
+    doesn't bring their own custom RLModule. The VPG algorithm doesn't require any
+    specific RLModule APIs, so any RLModule returning actions or action distribution
+    inputs suffices.
+
+We compute a plain policy gradient loss without value function baseline.
+The experiment shows that even with such a simple setup, our custom algorithm is still
+able to successfully learn CartPole-v1.
+
+
+How to run this script
+----------------------
+`python [script file name].py --enable-new-api-stack`
+
+For debugging, use the following additional command line options
+`--no-tune --num-env-runners=0`
+which should allow you to set breakpoints anywhere in the RLlib code and
+have the execution stop there for inspection and debugging.
+
+For logging to your WandB account, use:
+`--wandb-key=[your WandB API key] --wandb-project=[some project name]
+--wandb-run-name=[optional: WandB run name (within the defined project)]`
+
+
+Results to expect
+-----------------
+With some fine-tuning of the learning rate, the batch size, and maybe the
+number of env runners and number of envs per env runner, you should see decent
+learning behavior on the CartPole-v1 environment:
+
++-----------------------------+------------+--------+------------------+
+| Trial name                  | status     |   iter |   total time (s) |
+|                             |            |        |                  |
+|-----------------------------+------------+--------+------------------+
+| VPG_CartPole-v1_2973e_00000 | TERMINATED |    451 |          59.5184 |
++-----------------------------+------------+--------+------------------+
++-----------------------+------------------------+------------------------+
+|   episode_return_mean |   num_env_steps_sample |   ...env_steps_sampled |
+|                       |             d_lifetime |   _lifetime_throughput |
+|-----------------------+------------------------+------------------------|
+|                250.52 |                 415787 |                7428.98 |
++-----------------------+------------------------+------------------------+
+"""
+
+from ray.rllib.examples.algorithms.classes.vpg import VPGConfig
+from ray.rllib.utils.test_utils import (
+    add_rllib_example_script_args,
+    run_rllib_example_script_experiment,
+)
+
+
+# Example-script CLI parser with this example's reward/iters/timesteps defaults.
+parser = add_rllib_example_script_args(
+    default_reward=250.0,
+    default_iters=1000,
+    default_timesteps=750000,
+)
+parser.set_defaults(enable_new_api_stack=True)
+
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+
+    base_config = (
+        VPGConfig()
+        .environment("CartPole-v1")
+        .training(
+            # The only VPG-specific setting. How many episodes per train batch?
+            num_episodes_per_train_batch=10,
+            # Set other config parameters.
+            lr=0.0005,
+            # Note that you don't have to set any specific Learner class, because
+            # `VPGConfig.get_default_learner_class` already returns the default
+            # Learner class to use (`VPGTorchLearner`).
+            # learner_class=VPGTorchLearner,
+        )
+        # Increase the number of EnvRunners (default is 1 for VPG)
+        # or the number of envs per EnvRunner.
+        .env_runners(num_env_runners=2, num_envs_per_env_runner=1)
+        # Plug in your own RLModule class. VPG doesn't require any specific
+        # RLModule APIs, so any RLModule returning `actions` or `action_dist_inputs`
+        # from the forward methods works ok.
+        # .rl_module(
+        #    rl_module_spec=RLModuleSpec(module_class=...),
+        # )
+    )
+
+    run_rllib_example_script_experiment(base_config, args)
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/__pycache__/continue_training_from_checkpoint.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/__pycache__/continue_training_from_checkpoint.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e728590f4a21211ddbb8e239f728ee7225330025
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/__pycache__/continue_training_from_checkpoint.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/__pycache__/onnx_tf.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/__pycache__/onnx_tf.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e2e65bee88094283bec310249659ca46d183ce56
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/__pycache__/onnx_tf.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/__pycache__/onnx_torch.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/__pycache__/onnx_torch.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fbccae42965609919847c0d7ae95f0ae2fb04f2b
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/__pycache__/onnx_torch.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/__pycache__/onnx_torch_lstm.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/__pycache__/onnx_torch_lstm.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c24d241918419ce991493879e7f8992a7320d6c0
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/__pycache__/onnx_torch_lstm.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/cartpole_dqn_export.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/cartpole_dqn_export.py
new file mode 100644
index 0000000000000000000000000000000000000000..86a623d012d9a1493d9cdd5481ad91e1932eae9d
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/cartpole_dqn_export.py
@@ -0,0 +1,77 @@
+#!/usr/bin/env python
+
+# @OldAPIStack
+
+import numpy as np
+import os
+import ray
+
+from ray.rllib.policy.policy import Policy
+from ray.rllib.utils.framework import try_import_tf
+from ray.tune.registry import get_trainable_cls
+
+tf1, tf, tfv = try_import_tf()
+
+ray.init()
+
+
+def train_and_export_policy_and_model(algo_name, num_steps, model_dir, ckpt_dir):
+    cls = get_trainable_cls(algo_name)
+    config = cls.get_default_config()
+    config.api_stack(
+        enable_rl_module_and_learner=False,
+        enable_env_runner_and_connector_v2=False,
+    )
+    # This Example is only for tf.
+    config.framework("tf")
+    # Set exporting native (DL-framework) model files to True.
+    config.export_native_model_files = True
+    config.env = "CartPole-v1"
+    alg = config.build()
+    for _ in range(num_steps):
+        alg.train()
+
+    # Export Policy checkpoint.
+    alg.export_policy_checkpoint(ckpt_dir)
+    # Export tensorflow keras Model for online serving
+    alg.export_policy_model(model_dir)
+
+
+def restore_saved_model(export_dir):
+    signature_key = (
+        tf1.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
+    )
+    g = tf1.Graph()
+    with g.as_default():
+        with tf1.Session(graph=g) as sess:
+            meta_graph_def = tf1.saved_model.load(
+                sess, [tf1.saved_model.tag_constants.SERVING], export_dir
+            )
+            print("Model restored!")
+            print("Signature Def Information:")
+            print(meta_graph_def.signature_def[signature_key])
+            print("You can inspect the model using TensorFlow SavedModel CLI.")
+            print("https://www.tensorflow.org/guide/saved_model")
+
+
+def restore_policy_from_checkpoint(export_dir):
+    # Load the model from the checkpoint.
+    policy = Policy.from_checkpoint(export_dir)
+    # Perform a dummy (CartPole) forward pass.
+    test_obs = np.array([0.1, 0.2, 0.3, 0.4])
+    results = policy.compute_single_action(test_obs)
+    # Check results for correctness.
+    assert len(results) == 3
+    assert results[0].shape == ()  # pure single action (int)
+    assert results[1] == []  # RNN states
+    assert results[2]["action_dist_inputs"].shape == (2,)  # categorical inputs
+
+
+if __name__ == "__main__":
+    algo = "PPO"
+    model_dir = os.path.join(ray._private.utils.get_user_temp_dir(), "model_export_dir")
+    ckpt_dir = os.path.join(ray._private.utils.get_user_temp_dir(), "ckpt_export_dir")
+    num_steps = 1
+    train_and_export_policy_and_model(algo, num_steps, model_dir, ckpt_dir)
+    restore_saved_model(model_dir)
+    restore_policy_from_checkpoint(ckpt_dir)
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/change_config_during_training.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/change_config_during_training.py
new file mode 100644
index 0000000000000000000000000000000000000000..f42ad6e79afbbf02336586687529d17fa1fa083b
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/change_config_during_training.py
@@ -0,0 +1,246 @@
+"""Example showing how to continue training an Algorithm with a changed config.
+
+Use the setup shown in this script if you want to continue a prior experiment, but
+would also like to change some of the config values you originally used.
+
+This example:
+    - runs a single- or multi-agent CartPole experiment (for multi-agent, we use
+    different learning rates) thereby checkpointing the state of the Algorithm every n
+    iterations. The config used is hereafter called "1st config".
+    - stops the experiment due to some episode return being achieved.
+    - just for testing purposes, restores the entire algorithm from the latest
+    checkpoint and checks whether the state of the restored algo exactly matches the
+    state of the previously saved one.
+    - then changes the original config used (learning rate and other settings) and
+    continues training with the restored algorithm and the changed config until a
+    final episode return is reached. The new config is hereafter called "2nd config".
+
+
+How to run this script
+----------------------
+`python [script file name].py --enable-new-api-stack --num-agents=[0 or 2]
+--stop-reward-first-config=[return at which the algo on 1st config should stop training]
+--stop-reward=[the final return to achieve after restoration from the checkpoint with
+the 2nd config]
+`
+
+For debugging, use the following additional command line options
+`--no-tune --num-env-runners=0`
+which should allow you to set breakpoints anywhere in the RLlib code and
+have the execution stop there for inspection and debugging.
+
+For logging to your WandB account, use:
+`--wandb-key=[your WandB API key] --wandb-project=[some project name]
+--wandb-run-name=[optional: WandB run name (within the defined project)]`
+
+
+Results to expect
+-----------------
+First, you should see the initial tune.Tuner do its thing:
+
+Trial status: 1 RUNNING
+Current time: 2024-06-03 12:03:39. Total running time: 30s
+Logical resource usage: 3.0/12 CPUs, 0/0 GPUs
+╭────────────────────────────────────────────────────────────────────────
+│ Trial name                    status       iter     total time (s)
+├────────────────────────────────────────────────────────────────────────
+│ PPO_CartPole-v1_7b1eb_00000   RUNNING         6             16.265
+╰────────────────────────────────────────────────────────────────────────
+───────────────────────────────────────────────────────────────────────╮
+..._sampled_lifetime     ..._trained_lifetime     ...episodes_lifetime │
+───────────────────────────────────────────────────────────────────────┤
+               24000                    24000                      340 │
+───────────────────────────────────────────────────────────────────────╯
+...
+
+The experiment stops at an average episode return of `--stop-reward-first-config`.
+
+After the validation of the last checkpoint, a new experiment is started from
+scratch, but with the RLlib callback restoring the Algorithm right after
+initialization using the previous checkpoint. This new experiment then runs
+until `--stop-reward` is reached.
+
+Trial status: 1 RUNNING
+Current time: 2024-06-03 12:05:00. Total running time: 1min 0s
+Logical resource usage: 3.0/12 CPUs, 0/0 GPUs
+╭────────────────────────────────────────────────────────────────────────
+│ Trial name                    status       iter     total time (s)
+├────────────────────────────────────────────────────────────────────────
+│ PPO_CartPole-v1_7b1eb_00000   RUNNING        23            14.8372
+╰────────────────────────────────────────────────────────────────────────
+───────────────────────────────────────────────────────────────────────╮
+..._sampled_lifetime     ..._trained_lifetime     ...episodes_lifetime │
+───────────────────────────────────────────────────────────────────────┤
+              109078                   109078                      531 │
+───────────────────────────────────────────────────────────────────────╯
+
+And if you are using the `--as-test` option, you should see a final message:
+
+```
+`env_runners/episode_return_mean` of 450.0 reached! ok
+```
+"""
+from ray.rllib.algorithms.ppo import PPOConfig
+from ray.rllib.algorithms.algorithm_config import AlgorithmConfig
+from ray.rllib.core import DEFAULT_MODULE_ID
+from ray.rllib.examples.envs.classes.multi_agent import MultiAgentCartPole
+from ray.rllib.policy.policy import PolicySpec
+from ray.rllib.utils.metrics import (
+    ENV_RUNNER_RESULTS,
+    EPISODE_RETURN_MEAN,
+    LEARNER_RESULTS,
+)
+from ray.rllib.utils.numpy import convert_to_numpy
+from ray.rllib.utils.test_utils import (
+    add_rllib_example_script_args,
+    check,
+    run_rllib_example_script_experiment,
+)
+from ray.tune.registry import register_env
+
+
+parser = add_rllib_example_script_args(
+    default_reward=450.0, default_timesteps=10000000, default_iters=2000
+)
+parser.add_argument(
+    "--stop-reward-first-config",
+    type=float,
+    default=150.0,
+    help="Mean episode return after which the Algorithm on the first config should "
+    "stop training.",
+)
+# Defaults: new API stack, `args.checkpoint_freq=1`, `args.checkpoint_at_end=True`.
+parser.set_defaults(
+    enable_new_api_stack=True,
+    checkpoint_freq=1,
+    checkpoint_at_end=True,
+)
+
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+
+    register_env(
+        "ma_cart", lambda cfg: MultiAgentCartPole({"num_agents": args.num_agents})
+    )
+
+    # Simple generic config.
+    base_config = (
+        PPOConfig()
+        .environment("CartPole-v1" if args.num_agents == 0 else "ma_cart")
+        .training(lr=0.0001)
+        # TODO (sven): Tune throws a weird error inside the "log json" callback
+        #  when running with this option. The `perf` key in the result dict contains
+        #  binary data (instead of just 2 float values for mem and cpu usage).
+        # .experimental(_use_msgpack_checkpoints=True)
+    )
+
+    # Setup multi-agent, if required.
+    if args.num_agents > 0:
+        base_config.multi_agent(
+            policies={
+                f"p{aid}": PolicySpec(
+                    config=AlgorithmConfig.overrides(
+                        lr=5e-5
+                        * (aid + 1),  # agent 1 has double the learning rate of 0.
+                    )
+                )
+                for aid in range(args.num_agents)
+            },
+            policy_mapping_fn=lambda aid, *a, **kw: f"p{aid}",
+        )
+
+    # Define the stopping criterion: an avg episode return to be reached.
+    metric = f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}"
+    stop = {metric: args.stop_reward_first_config}
+
+    tuner_results = run_rllib_example_script_experiment(
+        base_config,
+        args,
+        stop=stop,
+        keep_ray_up=True,
+    )
+
+    # Perform a very quick test to make sure our algo (upon restoration) did not lose
+    # its ability to perform well in the env.
+    # - Extract the best checkpoint.
+    best_result = tuner_results.get_best_result(metric=metric, mode="max")
+    assert (
+        best_result.metrics[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN]
+        >= args.stop_reward_first_config
+    )
+    best_checkpoint_path = best_result.checkpoint.path
+
+    # Rebuild the algorithm (just for testing purposes).
+    test_algo = base_config.build()
+    # Load algo's state from the best checkpoint.
+    test_algo.restore_from_path(best_checkpoint_path)
+    # Perform some checks on the restored state.
+    assert test_algo.training_iteration > 0
+    # Evaluate on the restored algorithm.
+    test_eval_results = test_algo.evaluate()
+    assert (
+        test_eval_results[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN]
+        >= args.stop_reward_first_config
+    ), test_eval_results[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN]
+    # Train one iteration to make sure the performance does not collapse (e.g. due
+    # to the optimizer weights not having been restored properly).
+    test_results = test_algo.train()
+    assert (
+        test_results[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN]
+        >= args.stop_reward_first_config
+    ), test_results[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN]
+    # Stop the test algorithm again.
+    test_algo.stop()
+
+    # Make sure the algorithm gets restored from a checkpoint right after
+    # initialization. Note that this includes all subcomponents of the algorithm,
+    # including the optimizer states in the LearnerGroup/Learner actors.
+    def on_algorithm_init(algorithm, **kwargs):
+        # NOTE(review): "p0" is only set up as a policy when args.num_agents > 0.
+        module_p0 = algorithm.get_module("p0")
+        weight_before = convert_to_numpy(next(iter(module_p0.parameters())))
+
+        algorithm.restore_from_path(best_checkpoint_path)
+
+        # Make sure weights were restored (changed).
+        weight_after = convert_to_numpy(next(iter(module_p0.parameters())))
+        check(weight_before, weight_after, false=True)
+
+    # Change the config.
+    (
+        base_config
+        # Make sure the algorithm gets restored upon initialization.
+        .callbacks(on_algorithm_init=on_algorithm_init)
+        # Change training parameters considerably.
+        .training(
+            lr=0.0003,
+            train_batch_size=5000,
+            grad_clip=100.0,
+            gamma=0.996,
+            num_epochs=6,
+            vf_loss_coeff=0.01,
+        )
+        # Make multi-CPU/GPU.
+        .learners(num_learners=2)
+        # Use more env runners and more envs per env runner.
+        .env_runners(num_env_runners=3, num_envs_per_env_runner=5)
+    )
+
+    # Update the stopping criterion to the final target return per episode.
+    stop = {metric: args.stop_reward}
+
+    # Run a new experiment in which the RLlib callback `on_algorithm_init` restores
+    # from the best checkpoint. The new experiment starts again from iteration=0
+    # (unlike `tune.Tuner.restore()` after a crash or an interrupted trial).
+    tuner_results = run_rllib_example_script_experiment(base_config, args, stop=stop)
+
+    # Assert that we have continued training with a different learning rate.
+    # NOTE(review): reads DEFAULT_MODULE_ID results; confirm for the multi-agent case.
+    assert (
+        tuner_results[0].metrics[LEARNER_RESULTS][DEFAULT_MODULE_ID][
+            "default_optimizer_learning_rate"
+        ]
+        == base_config.lr
+        == 0.0003
+    )
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/checkpoint_by_custom_criteria.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/checkpoint_by_custom_criteria.py
new file mode 100644
index 0000000000000000000000000000000000000000..33204e52d5e94c601e27972b3c3c0ce2f5d5cb3a
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/checkpoint_by_custom_criteria.py
@@ -0,0 +1,146 @@
+"""Example extracting a checkpoint from n trials using one or more custom criteria.
+
+This example:
+    - runs a CartPole experiment with three different learning rates (three tune
+    "trials"). During the experiment, for each trial, we create a checkpoint at each
+    iteration.
+    - at the end of the experiment, we compare the trials and pick the one that
+    performed best, based on the criterion: Lowest episode count per single iteration
+    (for CartPole, a low episode count means the episodes are very long and thus the
+    reward is also very high).
+    - from that best trial (with the lowest episode count), we then pick those
+    checkpoints that a) have the lowest policy loss (good) and b) have the highest value
+    function loss (bad).
+
+
+How to run this script
+----------------------
+`python [script file name].py --enable-new-api-stack`
+
+For debugging, use the following additional command line options
+`--no-tune --num-env-runners=0`
+which should allow you to set breakpoints anywhere in the RLlib code and
+have the execution stop there for inspection and debugging.
+
+For logging to your WandB account, use:
+`--wandb-key=[your WandB API key] --wandb-project=[some project name]
+--wandb-run-name=[optional: WandB run name (within the defined project)]`
+
+
+Results to expect
+-----------------
+In the console output, you can see the performance of the three different learning
+rates used here:
+
++-----------------------------+------------+-----------------+--------+--------+
+| Trial name                  | status     | loc             |     lr |   iter |
+|-----------------------------+------------+-----------------+--------+--------+
+| PPO_CartPole-v1_d7dbe_00000 | TERMINATED | 127.0.0.1:98487 | 0.01   |     17 |
+| PPO_CartPole-v1_d7dbe_00001 | TERMINATED | 127.0.0.1:98488 | 0.001  |      8 |
+| PPO_CartPole-v1_d7dbe_00002 | TERMINATED | 127.0.0.1:98489 | 0.0001 |      9 |
++-----------------------------+------------+-----------------+--------+--------+
+
++------------------+-------+----------+----------------------+----------------------+
+|   total time (s) |    ts |   reward |   episode_reward_max |   episode_reward_min |
+|------------------+-------+----------+----------------------+----------------------+
+|          28.1068 | 39797 |   151.11 |                  500 |                   12 |
+|          13.304  | 18728 |   158.91 |                  500 |                   15 |
+|          14.8848 | 21069 |   167.36 |                  500 |                   13 |
++------------------+-------+----------+----------------------+----------------------+
+
++--------------------+
+|   episode_len_mean |
+|--------------------|
+|             151.11 |
+|             158.91 |
+|             167.36 |
++--------------------+
+"""
+
+from ray import tune
+from ray.rllib.core import DEFAULT_MODULE_ID
+from ray.rllib.utils.metrics import (
+    ENV_RUNNER_RESULTS,
+    EPISODE_RETURN_MEAN,
+    LEARNER_RESULTS,
+)
+from ray.rllib.utils.test_utils import (
+    add_rllib_example_script_args,
+    run_rllib_example_script_experiment,
+)
+from ray.tune.registry import get_trainable_cls
+
+parser = add_rllib_example_script_args(
+    default_reward=450.0, default_timesteps=100000, default_iters=200
+)
+
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+
+    # Force-set `args.checkpoint_freq` to 1.
+    args.checkpoint_freq = 1
+
+    # Simple generic config.
+    base_config = (
+        get_trainable_cls(args.algo)
+        .get_default_config()
+        .environment("CartPole-v1")
+        # Run 3 trials, each w/ a different learning rate.
+        .training(lr=tune.grid_search([0.01, 0.001, 0.0001]), train_batch_size=2341)
+    )
+    # Run tune for some iterations and generate checkpoints.
+    results = run_rllib_example_script_experiment(base_config, args)
+
+    # Get the best of the 3 trials by using some metric.
+    # NOTE: Choosing the min `episodes_this_iter` automatically picks the trial
+    # with the best performance (over the entire run (scope="all")):
+    # The fewer episodes, the longer each episode lasted, the more reward we
+    # got each episode.
+    # Setting scope to "last", "last-5-avg", or "last-10-avg" will only compare
+    # (using `mode=min|max`) the average values of the last 1, 5, or 10
+    # iterations with each other, respectively.
+    # Setting scope to "avg" will compare (using `mode`=min|max) the average
+    # values over the entire run.
+    metric = "env_runners/num_episodes"
+    # notice here `scope` is `all`, meaning for each trial,
+    # all results (not just the last one) will be examined.
+    best_result = results.get_best_result(metric=metric, mode="min", scope="all")
+    value_best_metric = best_result.metrics_dataframe[metric].min()
+    best_return_best = best_result.metrics_dataframe[
+        f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}"
+    ].max()
+    print(
+        f"Best trial was the one with lr={best_result.metrics['config']['lr']}. "
+        f"Reached lowest episode count ({value_best_metric}) in a single iteration and "
+        f"an average return of {best_return_best}."
+    )
+
+    # Confirm, we picked the right trial.
+
+    assert (
+        value_best_metric
+        == results.get_dataframe(filter_metric=metric, filter_mode="min")[metric].min()
+    )
+
+    # Get the best checkpoints from the trial, based on different metrics.
+    # Checkpoint with the lowest policy loss value:
+    if args.enable_new_api_stack:
+        policy_loss_key = f"{LEARNER_RESULTS}/{DEFAULT_MODULE_ID}/policy_loss"
+    else:
+        policy_loss_key = "info/learner/default_policy/learner_stats/policy_loss"
+    best_result = results.get_best_result(metric=policy_loss_key, mode="min")
+    ckpt = best_result.checkpoint
+    lowest_policy_loss = best_result.metrics_dataframe[policy_loss_key].min()
+    print(f"Checkpoint w/ lowest policy loss ({lowest_policy_loss}): {ckpt}")
+
+    # Checkpoint with the highest value-function loss:
+    if args.enable_new_api_stack:
+        vf_loss_key = f"{LEARNER_RESULTS}/{DEFAULT_MODULE_ID}/vf_loss"
+    else:
+        vf_loss_key = "info/learner/default_policy/learner_stats/vf_loss"
+    best_result = results.get_best_result(metric=vf_loss_key, mode="max")
+    ckpt = best_result.checkpoint
+    highest_value_fn_loss = best_result.metrics_dataframe[vf_loss_key].max()
+    print(f"Checkpoint w/ highest value function loss: {ckpt}")
+    print(f"Highest value function loss: {highest_value_fn_loss}")
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/continue_training_from_checkpoint.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/continue_training_from_checkpoint.py
new file mode 100644
index 0000000000000000000000000000000000000000..449489340de0db6c62d68601ec896f5560cc82be
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/continue_training_from_checkpoint.py
@@ -0,0 +1,268 @@
+"""Example showing how to restore an Algorithm from a checkpoint and resume training.
+
+Use the setup shown in this script if your experiments tend to crash after some time,
+and you would therefore like to make your setup more robust and fault-tolerant.
+
+This example:
+    - runs a single- or multi-agent CartPole experiment (for multi-agent, we use
+    different learning rates) thereby checkpointing the state of the Algorithm every n
+    iterations.
+    - stops the experiment due to an expected crash in the algorithm's main process
+    after a certain number of iterations.
+    - just for testing purposes, restores the entire algorithm from the latest
+    checkpoint and checks whether the state of the restored algo exactly matches the
+    state of the crashed one.
+    - then continues training with the restored algorithm until the desired final
+    episode return is reached.
+
+
+How to run this script
+----------------------
+`python [script file name].py --enable-new-api-stack --num-agents=[0 or 2]
+--stop-reward-crash=[the episode return after which the algo should crash]
+--stop-reward=[the final episode return to achieve after(!) restoration from the
+checkpoint]
+`
+
+For debugging, use the following additional command line options
+`--no-tune --num-env-runners=0`
+which should allow you to set breakpoints anywhere in the RLlib code and
+have the execution stop there for inspection and debugging.
+
+For logging to your WandB account, use:
+`--wandb-key=[your WandB API key] --wandb-project=[some project name]
+--wandb-run-name=[optional: WandB run name (within the defined project)]`
+
+
+Results to expect
+-----------------
+First, you should see the initial tune.Tuner do its thing:
+
+Trial status: 1 RUNNING
+Current time: 2024-06-03 12:03:39. Total running time: 30s
+Logical resource usage: 3.0/12 CPUs, 0/0 GPUs
+╭────────────────────────────────────────────────────────────────────────
+│ Trial name                    status       iter     total time (s)
+├────────────────────────────────────────────────────────────────────────
+│ PPO_CartPole-v1_7b1eb_00000   RUNNING         6             15.362
+╰────────────────────────────────────────────────────────────────────────
+───────────────────────────────────────────────────────────────────────╮
+..._sampled_lifetime     ..._trained_lifetime     ...episodes_lifetime │
+───────────────────────────────────────────────────────────────────────┤
+               24000                    24000                      340 │
+───────────────────────────────────────────────────────────────────────╯
+...
+
+then, you should see the experiment crashing as soon as the `--stop-reward-crash`
+has been reached:
+
+```RuntimeError: Intended crash after reaching trigger return.```
+
+At some point, the experiment should resume exactly where it left off (using
+the checkpoint and restored Tuner):
+
+Trial status: 1 RUNNING
+Current time: 2024-06-03 12:05:00. Total running time: 1min 0s
+Logical resource usage: 3.0/12 CPUs, 0/0 GPUs
+╭────────────────────────────────────────────────────────────────────────
+│ Trial name                    status       iter     total time (s)
+├────────────────────────────────────────────────────────────────────────
+│ PPO_CartPole-v1_7b1eb_00000   RUNNING        27            66.1451
+╰────────────────────────────────────────────────────────────────────────
+───────────────────────────────────────────────────────────────────────╮
+..._sampled_lifetime     ..._trained_lifetime     ...episodes_lifetime │
+───────────────────────────────────────────────────────────────────────┤
+              108000                   108000                      531 │
+───────────────────────────────────────────────────────────────────────╯
+
+And if you are using the `--as-test` option, you should see a final message:
+
+```
+`env_runners/episode_return_mean` of 500.0 reached! ok
+```
+"""
+import re
+import time
+
+from ray import train, tune
+from ray.rllib.algorithms.algorithm_config import AlgorithmConfig
+from ray.rllib.callbacks.callbacks import RLlibCallback
+from ray.rllib.examples.envs.classes.multi_agent import MultiAgentCartPole
+from ray.rllib.policy.policy import PolicySpec
+from ray.rllib.utils.metrics import (
+    ENV_RUNNER_RESULTS,
+    EPISODE_RETURN_MEAN,
+)
+from ray.rllib.utils.test_utils import (
+    add_rllib_example_script_args,
+    check_learning_achieved,
+)
+from ray.tune.registry import get_trainable_cls, register_env
+from ray.air.integrations.wandb import WandbLoggerCallback
+
+
+parser = add_rllib_example_script_args(
+    default_reward=500.0, default_timesteps=10000000, default_iters=2000
+)
+parser.add_argument(
+    "--stop-reward-crash",
+    type=float,
+    default=200.0,
+    help="Mean episode return after which the Algorithm should crash.",
+)
+# By default, set `args.checkpoint_freq` to 1 and `args.checkpoint_at_end` to True.
+parser.set_defaults(checkpoint_freq=1, checkpoint_at_end=True)
+
+
+class CrashAfterNIters(RLlibCallback):
+    """Raises on the iteration after the mean return reaches `--stop-reward-crash`."""
+
+    def __init__(self):
+        super().__init__()
+        # Armed one iteration before the actual crash, so that Tune still gets to
+        # write a checkpoint for the iteration that hit the trigger return.
+        self._should_crash = False
+
+    def on_train_result(self, *, algorithm, metrics_logger, result, **kwargs):
+        # Armed during the previous iteration: The checkpoint holding the trigger
+        # return has been written by now, so it is safe to crash.
+        if self._should_crash:
+            raise RuntimeError("Intended crash after reaching trigger return.")
+        # Not armed yet: Check whether this iteration reached the trigger return.
+        mean_return = result[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN]
+        if mean_return >= args.stop_reward_crash:
+            # Trigger reached: Report it now, crash on the next iteration.
+            message = "Reached trigger return of " f"{mean_return}"
+            print(message)
+            self._should_crash = True
+
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+
+    register_env(
+        "ma_cart", lambda cfg: MultiAgentCartPole({"num_agents": args.num_agents})
+    )
+
+    # Simple generic config.
+    config = (
+        get_trainable_cls(args.algo)
+        .get_default_config()
+        .api_stack(
+            enable_rl_module_and_learner=args.enable_new_api_stack,
+            enable_env_runner_and_connector_v2=args.enable_new_api_stack,
+        )
+        .environment("CartPole-v1" if args.num_agents == 0 else "ma_cart")
+        .env_runners(create_env_on_local_worker=True)
+        .training(lr=0.0001)
+        .callbacks(CrashAfterNIters)
+    )
+
+    # Tune config: Only log to WandB if an API key was provided.
+    tune_callbacks = []
+    if args.wandb_key:
+        # Fall back to "[algo]-[env]" if no `--wandb-project` was given.
+        project = args.wandb_project or (
+            args.algo.lower() + "-" + re.sub("\\W+", "-", str(config.env).lower())
+        )
+        tune_callbacks.append(
+            WandbLoggerCallback(
+                api_key=args.wandb_key,
+                project=project,
+                upload_checkpoints=False,
+                **({"name": args.wandb_run_name} if args.wandb_run_name else {}),
+            )
+        )
+
+    # Setup multi-agent, if required.
+    if args.num_agents > 0:
+        config.multi_agent(
+            policies={
+                f"p{aid}": PolicySpec(
+                    config=AlgorithmConfig.overrides(
+                        lr=5e-5
+                        * (aid + 1),  # agent 1 has double the learning rate as 0.
+                    )
+                )
+                for aid in range(args.num_agents)
+            },
+            policy_mapping_fn=lambda aid, *a, **kw: f"p{aid}",
+        )
+
+    # Define some stopping criterion. Note that this criterion is an avg episode return
+    # to be reached. The stop criterion does not consider the built-in crash we are
+    # triggering through our callback.
+    stop = {
+        f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": args.stop_reward,
+    }
+
+    # Run tune for some iterations and generate checkpoints.
+    tuner = tune.Tuner(
+        trainable=config.algo_class,
+        param_space=config,
+        run_config=train.RunConfig(
+            callbacks=tune_callbacks,
+            checkpoint_config=train.CheckpointConfig(
+                checkpoint_frequency=args.checkpoint_freq,
+                checkpoint_at_end=args.checkpoint_at_end,
+            ),
+            stop=stop,
+        ),
+    )
+    tuner_results = tuner.fit()
+
+    # Perform a very quick test to make sure our algo (upon restoration) did not lose
+    # its ability to perform well in the env.
+    # - Extract the best checkpoint.
+    metric = f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}"
+    best_result = tuner_results.get_best_result(metric=metric, mode="max")
+    assert (
+        best_result.metrics[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN]
+        >= args.stop_reward_crash
+    )
+    # - Change our config, such that the restored algo will have an env on the local
+    # EnvRunner (to perform evaluation) and won't crash anymore (remove the crashing
+    # callback).
+    config.callbacks(None)
+    # Rebuild the algorithm (just for testing purposes).
+    test_algo = config.build()
+    # Load algo's state from best checkpoint.
+    test_algo.restore(best_result.checkpoint)
+    # Perform some checks on the restored state.
+    assert test_algo.training_iteration > 0
+    # Evaluate on the restored algorithm.
+    test_eval_results = test_algo.evaluate()
+    assert (
+        test_eval_results[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN]
+        >= args.stop_reward_crash
+    ), test_eval_results[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN]
+    # Train one iteration to make sure, the performance does not collapse (e.g. due
+    # to the optimizer weights not having been restored properly).
+    test_results = test_algo.train()
+    assert (
+        test_results[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN] >= args.stop_reward_crash
+    ), test_results[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN]
+    # Stop the test algorithm again.
+    test_algo.stop()
+
+    # Create a new Tuner from the existing experiment path (which contains the tuner's
+    # own checkpoint file). Note that even the WandB logging will be continued without
+    # creating a new WandB run name.
+    restored_tuner = tune.Tuner.restore(
+        path=tuner_results.experiment_path,
+        trainable=config.algo_class,
+        param_space=config,
+        # Important to set this to True b/c the previous trial had failed (due to our
+        # `CrashAfterNIters` callback).
+        resume_errored=True,
+    )
+    # Continue the experiment exactly where we left off.
+    tuner_results = restored_tuner.fit()
+
+    # The WandB logger has been observed to sometimes miss some of the last
+    # iterations. Give it extra time to flush them, but only if it is in use.
+    if args.wandb_key:
+        time.sleep(20)
+
+    if args.as_test:
+        check_learning_achieved(tuner_results, args.stop_reward, metric=metric)
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/onnx_tf.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/onnx_tf.py
new file mode 100644
index 0000000000000000000000000000000000000000..19fb7f3760328985de89edba7aee837cd569895f
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/onnx_tf.py
@@ -0,0 +1,91 @@
+# @OldAPIStack
+import argparse
+import numpy as np
+import onnxruntime
+import os
+import shutil
+
+import ray
+import ray.rllib.algorithms.ppo as ppo
+
+parser = argparse.ArgumentParser()
+
+parser.add_argument(
+    "--framework",
+    choices=["tf", "tf2"],
+    default="tf2",
+    help="The TF framework specifier (either 'tf' or 'tf2').",
+)
+
+
+if __name__ == "__main__":
+
+    args = parser.parse_args()
+
+    # Configure our PPO Algorithm.
+    config = (
+        ppo.PPOConfig()
+        .api_stack(
+            enable_env_runner_and_connector_v2=False,
+            enable_rl_module_and_learner=False,
+        )
+        .env_runners(num_env_runners=1)
+        .framework(args.framework)
+    )
+
+    outdir = "export_tf"
+    if os.path.exists(outdir):
+        shutil.rmtree(outdir)
+
+    np.random.seed(1234)
+
+    # We will run inference with this test batch
+    test_data = {
+        "obs": np.random.uniform(0, 1.0, size=(10, 4)).astype(np.float32),
+    }
+
+    # Start Ray and initialize a PPO Algorithm
+    ray.init()
+    algo = config.build(env="CartPole-v1")
+
+    # You could train the model here via:
+    # algo.train()
+
+    # Let's run inference on the tensorflow model
+    policy = algo.get_policy()
+    result_tf, _ = policy.model(test_data)
+
+    # Evaluate tensor to fetch numpy array.
+    if args.framework == "tf":
+        with policy.get_session().as_default():
+            result_tf = result_tf.eval()
+
+    # This line will export the model to ONNX.
+    policy.export_model(outdir, onnx=11)
+    # Equivalent to:
+    # algo.export_policy_model(outdir, onnx=11)
+
+    # Import ONNX model.
+    exported_model_file = os.path.join(outdir, "model.onnx")
+
+    # Start an inference session for the ONNX model
+    session = onnxruntime.InferenceSession(exported_model_file, None)
+
+    # Pass the same test batch to the ONNX model (rename to match tensor names)
+    onnx_test_data = {f"default_policy/{k}:0": v for k, v in test_data.items()}
+
+    # Tf2 model stored differently from tf (static graph) model.
+    if args.framework == "tf2":
+        result_onnx = session.run(["fc_out"], {"observations": test_data["obs"]})
+    else:
+        result_onnx = session.run(
+            ["default_policy/model/fc_out/BiasAdd:0"],
+            onnx_test_data,
+        )
+
+    # These results should be equal!
+    print("TENSORFLOW", result_tf)
+    print("ONNX", result_onnx)
+
+    assert np.allclose(result_tf, result_onnx), "Model outputs are NOT equal. FAILED"
+    print("Model outputs are equal. PASSED")
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/onnx_torch.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/onnx_torch.py
new file mode 100644
index 0000000000000000000000000000000000000000..b7d39cc9225a79f73e9077c9207ed2f39237068d
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/onnx_torch.py
@@ -0,0 +1,79 @@
+# @OldAPIStack
+
+from packaging.version import Version
+import numpy as np
+import ray
+import ray.rllib.algorithms.ppo as ppo
+import onnxruntime
+import os
+import shutil
+import torch
+
+if __name__ == "__main__":
+    # Configure our PPO Algorithm.
+    config = (
+        ppo.PPOConfig()
+        .api_stack(
+            enable_env_runner_and_connector_v2=False,
+            enable_rl_module_and_learner=False,
+        )
+        .env_runners(num_env_runners=1)
+        .framework("torch")
+    )
+
+    outdir = "export_torch"
+    if os.path.exists(outdir):
+        shutil.rmtree(outdir)
+
+    np.random.seed(1234)
+
+    # We will run inference with this test batch
+    test_data = {
+        "obs": np.random.uniform(0, 1.0, size=(10, 4)).astype(np.float32),
+        "state_ins": np.array([0.0], dtype=np.float32),
+    }
+
+    # Start Ray and initialize a PPO Algorithm.
+    ray.init()
+    algo = config.build(env="CartPole-v1")
+
+    # You could train the model here
+    # algo.train()
+
+    # Let's run inference on the torch model
+    policy = algo.get_policy()
+    result_pytorch, _ = policy.model(
+        {
+            "obs": torch.tensor(test_data["obs"]),
+        }
+    )
+
+    # Evaluate tensor to fetch numpy array
+    result_pytorch = result_pytorch.detach().numpy()
+
+    # This line will export the model to ONNX.
+    policy.export_model(outdir, onnx=11)
+    # Equivalent to:
+    # algo.export_policy_model(outdir, onnx=11)
+
+    # Import ONNX model.
+    exported_model_file = os.path.join(outdir, "model.onnx")
+
+    # Start an inference session for the ONNX model
+    session = onnxruntime.InferenceSession(exported_model_file, None)
+
+    # Pass the same test batch to the ONNX model
+    if Version(torch.__version__) < Version("1.9.0"):
+        # In torch < 1.9.0 the second input/output name gets mixed up
+        test_data["state_outs"] = test_data.pop("state_ins")
+
+    result_onnx = session.run(["output"], test_data)
+
+    # These results should be equal!
+    print("PYTORCH", result_pytorch)
+    print("ONNX", result_onnx)
+
+    assert np.allclose(
+        result_pytorch, result_onnx
+    ), "Model outputs are NOT equal. FAILED"
+    print("Model outputs are equal. PASSED")
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/onnx_torch_lstm.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/onnx_torch_lstm.py
new file mode 100644
index 0000000000000000000000000000000000000000..d95a282a3a30d036afe39573850661c860df2740
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/onnx_torch_lstm.py
@@ -0,0 +1,136 @@
+# @OldAPIStack
+
+import numpy as np
+import onnxruntime
+
+import ray
+import ray.rllib.algorithms.ppo as ppo
+from ray.rllib.utils.framework import try_import_torch
+from ray.rllib.utils.test_utils import add_rllib_example_script_args, check
+from ray.rllib.utils.torch_utils import convert_to_torch_tensor
+
+torch, _ = try_import_torch()
+
+parser = add_rllib_example_script_args()
+parser.set_defaults(num_env_runners=1)
+
+
+class ONNXCompatibleWrapper(torch.nn.Module):
+    """Adapts a model called as `model({"obs": ...}, [s0, s1], c)` to flat tensors."""
+
+    def __init__(self, original_model):
+        super().__init__()
+        self.original_model = original_model
+
+    def forward(self, a, b0, b1, c):
+        # Re-pack the flat inputs into the (dict, list, extra arg) call format.
+        out = self.original_model({"obs": a}, [b0, b1], c)
+        # Flatten the nested result: (out[0], out[1][0], out[1][1]).
+        return out[0], out[1][0], out[1][1]
+
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+
+    assert (
+        not args.enable_new_api_stack
+    ), "Must NOT set --enable-new-api-stack when running this script!"
+
+    ray.init(local_mode=args.local_mode)
+
+    # Configure our PPO Algorithm.
+    config = (
+        ppo.PPOConfig()
+        # ONNX is not supported by RLModule API yet.
+        .api_stack(
+            enable_rl_module_and_learner=args.enable_new_api_stack,
+            enable_env_runner_and_connector_v2=args.enable_new_api_stack,
+        )
+        .environment("CartPole-v1")
+        .env_runners(num_env_runners=args.num_env_runners)
+        .training(model={"use_lstm": True})
+    )
+
+    B = 3
+    T = 5
+    LSTM_CELL = 256
+
+    # Input data for a python inference forward call.
+    test_data_python = {
+        "obs": np.random.uniform(0, 1.0, size=(B * T, 4)).astype(np.float32),
+        "state_ins": [
+            np.random.uniform(0, 1.0, size=(B, LSTM_CELL)).astype(np.float32),
+            np.random.uniform(0, 1.0, size=(B, LSTM_CELL)).astype(np.float32),
+        ],
+        "seq_lens": np.array([T] * B, np.float32),
+    }
+    # Input data for the ONNX session.
+    test_data_onnx = {
+        "obs": test_data_python["obs"],
+        "state_in_0": test_data_python["state_ins"][0],
+        "state_in_1": test_data_python["state_ins"][1],
+        "seq_lens": test_data_python["seq_lens"],
+    }
+
+    # Input data for compiling the ONNX model.
+    test_data_onnx_input = convert_to_torch_tensor(test_data_onnx)
+
+    # Initialize a PPO Algorithm.
+    algo = config.build()
+
+    # You could train the model here
+    # algo.train()
+
+    # Let's run inference on the torch model
+    policy = algo.get_policy()
+    result_pytorch, _ = policy.model(
+        {
+            "obs": torch.tensor(test_data_python["obs"]),
+        },
+        [
+            torch.tensor(test_data_python["state_ins"][0]),
+            torch.tensor(test_data_python["state_ins"][1]),
+        ],
+        torch.tensor(test_data_python["seq_lens"]),
+    )
+
+    # Evaluate tensor to fetch numpy array
+    result_pytorch = result_pytorch.detach().numpy()
+
+    # Wrap the actual ModelV2 with the torch wrapper above to make this all work with
+    # LSTMs (extra `state` in- and outputs and `seq_lens` inputs).
+    onnx_compatible = ONNXCompatibleWrapper(policy.model)
+    exported_model_file = "model.onnx"
+    input_names = [
+        "obs",
+        "state_in_0",
+        "state_in_1",
+        "seq_lens",
+    ]
+
+    # This line will export the model to ONNX.
+    torch.onnx.export(
+        onnx_compatible,
+        tuple(test_data_onnx_input[n] for n in input_names),
+        exported_model_file,
+        export_params=True,
+        opset_version=11,
+        do_constant_folding=True,
+        input_names=input_names,
+        output_names=[
+            "output",
+            "state_out_0",
+            "state_out_1",
+        ],
+        dynamic_axes={k: {0: "batch_size"} for k in input_names},
+    )
+    # Start an inference session for the ONNX model.
+    session = onnxruntime.InferenceSession(exported_model_file, None)
+    result_onnx = session.run(["output"], test_data_onnx)
+
+    # These results should be equal!
+    print("PYTORCH", result_pytorch)
+    print("ONNX", result_onnx[0])
+
+    check(result_pytorch, result_onnx[0])
+    print("Model outputs are equal. PASSED")
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/restore_1_of_n_agents_from_checkpoint.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/restore_1_of_n_agents_from_checkpoint.py
new file mode 100644
index 0000000000000000000000000000000000000000..acbbb83118943eecd1e897a572fbe5b8d9267c68
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/restore_1_of_n_agents_from_checkpoint.py
@@ -0,0 +1,171 @@
+"""Example demonstrating how to load module weights for 1 of n agents from a checkpoint.
+
+This example:
+    - Runs a multi-agent `Pendulum-v1` experiment with >= 2 policies, p0, p1, etc..
+    - Saves a checkpoint of the `MultiRLModule` every `--checkpoint-freq`
+    iterations.
+    - Stops the experiments after the agents reach a combined return of -800.
+    - Picks the best checkpoint by combined return and restores p0 from it.
+    - Runs a second experiment with the restored `RLModule` for p0 and
+    a fresh `RLModule` for the other policies.
+    - Stops the second experiment after the agents reach a combined return of -800.
+
+
+How to run this script
+----------------------
+`python [script file name].py --enable-new-api-stack --num-agents=2
+--checkpoint-freq=20 --checkpoint-at-end`
+
+Control the number of agents and policies (RLModules) via --num-agents and
+--num-policies.
+
+Control the number of checkpoints by setting `--checkpoint-freq` to a value > 0.
+Note that the checkpoint frequency is per iteration and this example needs at
+least a single checkpoint to load the RLModule weights for policy 0.
+If `--checkpoint-at-end` is set, a checkpoint will be saved at the end of the
+experiment.
+
+For debugging, use the following additional command line options
+`--no-tune --num-env-runners=0`
+which should allow you to set breakpoints anywhere in the RLlib code and
+have the execution stop there for inspection and debugging.
+
+For logging to your WandB account, use:
+`--wandb-key=[your WandB API key] --wandb-project=[some project name]
+--wandb-run-name=[optional: WandB run name (within the defined project)]`
+
+
+Results to expect
+-----------------
+You should expect a reward of -400.0 eventually being achieved by a simple
+single PPO policy. In the second run of the experiment, the MultiRLModule weights
+for policy 0 are restored from the checkpoint of the first run. The reward for a
+single agent should be -400.0 again, but the training time should be shorter
+(around 30 iterations instead of 190) due to the fact that one policy is already
+an expert from the get go.
+"""
+
+from pathlib import Path
+
+from ray.air.constants import TRAINING_ITERATION
+from ray.rllib.algorithms.callbacks import DefaultCallbacks
+from ray.rllib.core import (
+    COMPONENT_LEARNER,
+    COMPONENT_LEARNER_GROUP,
+    COMPONENT_RL_MODULE,
+)
+from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig
+from ray.rllib.examples.envs.classes.multi_agent import MultiAgentPendulum
+from ray.rllib.utils.metrics import (
+    ENV_RUNNER_RESULTS,
+    EPISODE_RETURN_MEAN,
+    NUM_ENV_STEPS_SAMPLED_LIFETIME,
+)
+from ray.rllib.utils.numpy import convert_to_numpy
+from ray.rllib.utils.test_utils import (
+    add_rllib_example_script_args,
+    check,
+    run_rllib_example_script_experiment,
+)
+from ray.tune.registry import get_trainable_cls, register_env
+
+parser = add_rllib_example_script_args(
+    # Pendulum-v1 sum of 2 agents (each agent reaches -250).
+    default_reward=-500.0,
+)
+parser.set_defaults(
+    enable_new_api_stack=True,
+    checkpoint_freq=1,
+    num_agents=2,
+)
+# TODO (sven): This arg is currently ignored (hard-set to 2).
+parser.add_argument("--num-policies", type=int, default=2)
+
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+
+    # Register our environment with tune.
+    if args.num_agents > 1:
+        register_env(
+            "env",
+            lambda _: MultiAgentPendulum(config={"num_agents": args.num_agents}),
+        )
+    else:
+        raise ValueError(
+            f"`num_agents` must be > 1, but is {args.num_agents}. "
+            "Read the script docstring for more information."
+        )
+
+    # Without any checkpoint, there is nothing to restore `p0` from further below.
+    if args.checkpoint_freq <= 0:
+        raise ValueError(
+            "This example requires at least one checkpoint to load the RLModule "
+            "weights for policy 0."
+        )
+
+    base_config = (
+        get_trainable_cls(args.algo)
+        .get_default_config()
+        .environment("env")
+        .training(
+            train_batch_size_per_learner=512,
+            minibatch_size=64,
+            lambda_=0.1,
+            gamma=0.95,
+            lr=0.0003,
+            vf_clip_param=10.0,
+        )
+        .rl_module(
+            model_config=DefaultModelConfig(fcnet_activation="relu"),
+        )
+    )
+
+    # Add a simple multi-agent setup: one policy `p{i}` per agent id `i`.
+    base_config.multi_agent(
+        policies={f"p{i}" for i in range(args.num_agents)},
+        policy_mapping_fn=lambda aid, *a, **kw: f"p{aid}",
+    )
+
+    # Augment the base config with further settings and train the agents.
+    results = run_rllib_example_script_experiment(base_config, args, keep_ray_up=True)
+
+    # Now swap in the RLModule weights for policy 0.
+    chkpt_path = results.get_best_result().checkpoint.path
+    p_0_module_state_path = (
+        Path(chkpt_path)  # <- algorithm's checkpoint dir
+        / COMPONENT_LEARNER_GROUP  # <- learner group
+        / COMPONENT_LEARNER  # <- learner
+        / COMPONENT_RL_MODULE  # <- MultiRLModule
+        / "p0"  # <- (single) RLModule
+    )
+
+    class LoadP0OnAlgoInitCallback(DefaultCallbacks):
+        """Restores the state of RLModule `p0` from the checkpoint on algo init."""
+
+        def on_algorithm_init(self, *, algorithm, **kwargs):
+            module_p0 = algorithm.get_module("p0")
+            # Snapshot the first parameter tensor to verify the restore below.
+            weight_before = convert_to_numpy(next(iter(module_p0.parameters())))
+            algorithm.restore_from_path(
+                p_0_module_state_path,
+                component=(
+                    f"{COMPONENT_LEARNER_GROUP}/{COMPONENT_LEARNER}/"
+                    f"{COMPONENT_RL_MODULE}/p0"
+                ),
+            )
+            # Make sure weights were updated.
+            weight_after = convert_to_numpy(next(iter(module_p0.parameters())))
+            check(weight_before, weight_after, false=True)
+
+    base_config.callbacks(LoadP0OnAlgoInitCallback)
+
+    # Define stopping criteria.
+    stop = {
+        f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": -800.0,
+        f"{ENV_RUNNER_RESULTS}/{NUM_ENV_STEPS_SAMPLED_LIFETIME}": 100000,
+        TRAINING_ITERATION: 100,
+    }
+
+    # Run the experiment again with the restored MultiRLModule.
+    run_rllib_example_script_experiment(base_config, args, stop=stop)
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/curiosity/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/curiosity/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/curiosity/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/curiosity/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7f741fec764ac0a65189274a67110b97e3c7f0eb
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/curiosity/__pycache__/__init__.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/curiosity/__pycache__/count_based_curiosity.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/curiosity/__pycache__/count_based_curiosity.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1ff2c5d036f2d23219a61e2689ef3ff72bf7e642
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/curiosity/__pycache__/count_based_curiosity.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/curiosity/__pycache__/euclidian_distance_based_curiosity.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/curiosity/__pycache__/euclidian_distance_based_curiosity.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..08b5f8e5066e1b0bdbe5646796137a8af101128c
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/curiosity/__pycache__/euclidian_distance_based_curiosity.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/curiosity/__pycache__/intrinsic_curiosity_model_based_curiosity.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/curiosity/__pycache__/intrinsic_curiosity_model_based_curiosity.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f0307f631429891c01a1a357689fdb190fafb5e2
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/curiosity/__pycache__/intrinsic_curiosity_model_based_curiosity.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/curiosity/count_based_curiosity.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/curiosity/count_based_curiosity.py
new file mode 100644
index 0000000000000000000000000000000000000000..f7d1aa6e15530b221dfe8b53448ed611178faf90
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/curiosity/count_based_curiosity.py
@@ -0,0 +1,137 @@
+"""Example of using a count-based curiosity mechanism to learn in sparse-rewards envs.
+
+This example:
+    - demonstrates how to define your own count-based curiosity ConnectorV2 piece
+    that computes intrinsic rewards based on simple observation counts and adds these
+    intrinsic rewards to the "main" (extrinsic) rewards.
+    - shows how this connector piece overrides the main (extrinsic) rewards in the
+    episode and thus demonstrates how to do reward shaping in general with RLlib.
+    - shows how to plug this connector piece into your algorithm's config.
+    - uses Tune and RLlib to learn the env described above and compares 2
+    algorithms, one that does use curiosity vs one that does not.
+
+We use a FrozenLake (sparse reward) environment with a map size of 8x8 and a time step
+limit of 14 to make it almost impossible for a non-curiosity based policy to learn.
+
+
+How to run this script
+----------------------
+`python [script file name].py --enable-new-api-stack`
+
+Use the `--no-curiosity` flag to disable curiosity learning and force your policy
+to be trained on the task w/o the use of intrinsic rewards. With this option, the
+algorithm should NOT succeed.
+
+For debugging, use the following additional command line options
+`--no-tune --num-env-runners=0`
+which should allow you to set breakpoints anywhere in the RLlib code and
+have the execution stop there for inspection and debugging.
+
+For logging to your WandB account, use:
+`--wandb-key=[your WandB API key] --wandb-project=[some project name]
+--wandb-run-name=[optional: WandB run name (within the defined project)]`
+
+
+Results to expect
+-----------------
+In the console output, you can see that only a PPO policy that uses curiosity can
+actually learn.
+
+Policy using count-based curiosity:
++-------------------------------+------------+--------+------------------+
+| Trial name                    | status     |   iter |   total time (s) |
+|                               |            |        |                  |
+|-------------------------------+------------+--------+------------------+
+| PPO_FrozenLake-v1_109de_00000 | TERMINATED |     48 |            44.46 |
++-------------------------------+------------+--------+------------------+
++------------------------+-------------------------+------------------------+
+|    episode_return_mean |   num_episodes_lifetime |   num_env_steps_traine |
+|                        |                         |             d_lifetime |
+|------------------------+-------------------------+------------------------|
+|                   0.99 |                   12960 |                 194000 |
++------------------------+-------------------------+------------------------+
+
+Policy NOT using curiosity:
+[DOES NOT LEARN AT ALL]
+"""
+from ray.rllib.connectors.env_to_module import FlattenObservations
+from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig
+from ray.rllib.examples.connectors.classes.count_based_curiosity import (
+    CountBasedCuriosity,
+)
+from ray.rllib.utils.test_utils import (
+    add_rllib_example_script_args,
+    run_rllib_example_script_experiment,
+)
+from ray.tune.registry import get_trainable_cls
+
+parser = add_rllib_example_script_args(
+    default_reward=0.99, default_iters=200, default_timesteps=1000000
+)
+parser.set_defaults(enable_new_api_stack=True)
+parser.add_argument(
+    "--intrinsic-reward-coeff",
+    type=float,
+    default=1.0,
+    help="The weight with which to multiply intrinsic rewards before adding them to "
+    "the extrinsic ones (default is 1.0).",
+)
+parser.add_argument(
+    "--no-curiosity",
+    action="store_true",
+    help="Whether to NOT use count-based curiosity.",
+)
+
+ENV_OPTIONS = {
+    "is_slippery": False,
+    # Use this hard-to-solve 8x8 map with lots of holes (H) to fall into and only very
+    # few valid paths from the starting state (S) to the goal state (G).
+    "desc": [
+        "SFFHFFFH",
+        "FFFHFFFF",
+        "FFFHHFFF",
+        "FFFFFFFH",
+        "HFFHFFFF",
+        "HHFHFFHF",
+        "FFFHFHHF",
+        "FHFFFFFG",
+    ],
+    # Limit the number of steps the agent is allowed to make in the env to
+    # make it almost impossible to learn without (count-based) curiosity.
+    "max_episode_steps": 14,
+}
+
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+
+    base_config = (
+        get_trainable_cls(args.algo)
+        .get_default_config()
+        .environment(
+            "FrozenLake-v1",
+            env_config=ENV_OPTIONS,
+        )
+        .env_runners(
+            num_envs_per_env_runner=5,
+            # Flatten discrete observations (into one-hot vectors).
+            env_to_module_connector=lambda env: FlattenObservations(),
+        )
+        .training(
+            # The main code in this example: We add the `CountBasedCuriosity` connector
+            # piece to our Learner connector pipeline, which converts collected episodes
+            # (either directly from the EnvRunners or from a replay buffer) into the
+            # final train batch. The added piece computes intrinsic rewards based on
+            # simple observation counts and adds them to the "main" (extrinsic) rewards.
+            # NOTE(review): `--intrinsic-reward-coeff` is parsed but not passed to
+            # `CountBasedCuriosity()` here; confirm whether this is intended.
+            learner_connector=(
+                None if args.no_curiosity else lambda *ags, **kw: CountBasedCuriosity()
+            ),
+            num_epochs=10,
+            vf_loss_coeff=0.01,
+        )
+        .rl_module(model_config=DefaultModelConfig(vf_share_layers=True))
+    )
+
+    run_rllib_example_script_experiment(base_config, args)
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/curiosity/euclidian_distance_based_curiosity.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/curiosity/euclidian_distance_based_curiosity.py
new file mode 100644
index 0000000000000000000000000000000000000000..d471c17f18587c5a49f6c20518b0c1bfe9a05797
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/curiosity/euclidian_distance_based_curiosity.py
@@ -0,0 +1,127 @@
+"""Example of a euclidian-distance curiosity mechanism to learn in sparse-rewards envs.
+
+This example:
+    - demonstrates how to define your own euclidian-distance-based curiosity ConnectorV2
+    piece that computes intrinsic rewards based on the delta between incoming
+    observations and some set of already stored (prior) observations. Thereby, the
+    further away the incoming observation is from the already stored ones, the higher
+    its corresponding intrinsic reward.
+    - shows how this connector piece adds the intrinsic reward to the corresponding
+    "main" (extrinsic) reward and overrides the value in the "rewards" key in the
+    episode. It thus demonstrates how to do reward shaping in general with RLlib.
+    - shows how to plug this connector piece into your algorithm's config.
+    - uses Tune and RLlib to learn the env described above and compares 2
+    algorithms, one that does use curiosity vs one that does not.
+
+We use the MountainCar-v0 environment, a sparse-reward env that is very hard to learn
+for a regular PPO algorithm.
+
+
+How to run this script
+----------------------
+`python [script file name].py --enable-new-api-stack`
+
+Use the `--no-curiosity` flag to disable curiosity learning and force your policy
+to be trained on the task w/o the use of intrinsic rewards. With this option, the
+algorithm should NOT succeed.
+
+For debugging, use the following additional command line options
+`--no-tune --num-env-runners=0`
+which should allow you to set breakpoints anywhere in the RLlib code and
+have the execution stop there for inspection and debugging.
+
+For logging to your WandB account, use:
+`--wandb-key=[your WandB API key] --wandb-project=[some project name]
+--wandb-run-name=[optional: WandB run name (within the defined project)]`
+
+
+Results to expect
+-----------------
+In the console output, you can see that only a PPO policy that uses curiosity can
+actually learn.
+
+Policy using curiosity (NOTE: the sample output below shows a FrozenLake-v1 trial):
++-------------------------------+------------+--------+------------------+
+| Trial name                    | status     |   iter |   total time (s) |
+|                               |            |        |                  |
+|-------------------------------+------------+--------+------------------+
+| PPO_FrozenLake-v1_109de_00000 | TERMINATED |     48 |            44.46 |
++-------------------------------+------------+--------+------------------+
++------------------------+-------------------------+------------------------+
+|    episode_return_mean |   num_episodes_lifetime |   num_env_steps_traine |
+|                        |                         |             d_lifetime |
+|------------------------+-------------------------+------------------------|
+|                   0.99 |                   12960 |                 194000 |
++------------------------+-------------------------+------------------------+
+
+Policy NOT using curiosity:
+[DOES NOT LEARN AT ALL]
+"""
+from ray.rllib.connectors.env_to_module import MeanStdFilter
+from ray.rllib.examples.connectors.classes.euclidian_distance_based_curiosity import (
+    EuclidianDistanceBasedCuriosity,
+)
+from ray.rllib.utils.test_utils import (
+    add_rllib_example_script_args,
+    run_rllib_example_script_experiment,
+)
+from ray.tune.registry import get_trainable_cls
+
+# TODO (sven): SB3's PPO learns MountainCar-v0 until a reward of ~-110.
+#  We might have to play around some more with different initializations, etc..
+#  to get to these results as well.
+parser = add_rllib_example_script_args(
+    default_reward=-140.0, default_iters=2000, default_timesteps=1000000
+)
+parser.set_defaults(
+    enable_new_api_stack=True,
+    num_env_runners=4,
+)
+parser.add_argument(
+    "--intrinsic-reward-coeff",
+    type=float,
+    default=0.0001,
+    help="The weight with which to multiply intrinsic rewards before adding them to "
+    "the extrinsic ones (default is 0.0001).",
+)
+parser.add_argument(
+    "--no-curiosity",
+    action="store_true",
+    help="Whether to NOT use euclidian-distance-based curiosity.",
+)
+
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+
+    base_config = (
+        get_trainable_cls(args.algo)
+        .get_default_config()
+        .environment("MountainCar-v0")
+        .env_runners(
+            env_to_module_connector=lambda env: MeanStdFilter(),
+            num_envs_per_env_runner=5,
+        )
+        .training(
+            # The main code in this example: We add the
+            # `EuclidianDistanceBasedCuriosity` connector piece to our Learner connector
+            # pipeline, which converts collected episodes (from the EnvRunners or a
+            # replay buffer) into the final train batch. Per the module docstring, the
+            # piece computes intrinsic rewards from the distance to stored observations
+            # and adds them to the "main" (extrinsic) rewards. NOTE(review): the parsed
+            # `--intrinsic-reward-coeff` is not passed to the connector here; confirm.
+            learner_connector=(
+                None
+                if args.no_curiosity
+                else lambda *ags, **kw: EuclidianDistanceBasedCuriosity()
+            ),
+            # train_batch_size_per_learner=512,
+            grad_clip=20.0,
+            entropy_coeff=0.003,
+            gamma=0.99,
+            lr=0.0002,
+            lambda_=0.98,
+        )
+    )
+
+    run_rllib_example_script_experiment(base_config, args)
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/curiosity/intrinsic_curiosity_model_based_curiosity.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/curiosity/intrinsic_curiosity_model_based_curiosity.py
new file mode 100644
index 0000000000000000000000000000000000000000..e482147b9dcf17a8a6142e3106ec1684bf08590c
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/curiosity/intrinsic_curiosity_model_based_curiosity.py
@@ -0,0 +1,313 @@
+"""Example of implementing and training with an intrinsic curiosity model (ICM).
+
+This type of curiosity-based learning trains a simplified model of the environment
+dynamics based on three networks:
+1) Embedding observations into latent space ("feature" network).
+2) Predicting the action, given two consecutive embedded observations
+("inverse" network).
+3) Predicting the next embedded obs, given an obs and action
+("forward" network).
+
+The less the ICM is able to predict the actually observed next feature vector,
+given obs and action (through the forward network), the larger the
+"intrinsic reward", which will be added to the extrinsic reward of the agent.
+
+Therefore, if a state transition was unexpected, the agent becomes
+"curious" and will further explore this transition leading to better
+exploration in sparse rewards environments.
+
+For more details, see here:
+[1] Curiosity-driven Exploration by Self-supervised Prediction
+Pathak, Agrawal, Efros, and Darrell - UC Berkeley - ICML 2017.
+https://arxiv.org/pdf/1705.05363.pdf
+
+This example:
+    - demonstrates how to write a custom RLModule, representing the ICM from the paper
+    above. Note that this custom RLModule does not belong to any individual agent.
+    - demonstrates how to write a custom (PPO) TorchLearner that a) adds the ICM to its
+    MultiRLModule, b) trains the regular PPO Policy plus the ICM module, using the
+    PPO parent loss and the ICM's RLModule's own loss function.
+
+We use a FrozenLake (sparse reward) environment with a custom map size of 12x12 and a
+hard time step limit of 22 to make it almost impossible for non-curiosity-based
+learners to learn a good policy.
+
+
+How to run this script
+----------------------
+`python [script file name].py --enable-new-api-stack`
+
+Use the `--no-curiosity` flag to disable curiosity learning and force your policy
+to be trained on the task w/o the use of intrinsic rewards. With this option, the
+algorithm should NOT succeed.
+
+For debugging, use the following additional command line options
+`--no-tune --num-env-runners=0`
+which should allow you to set breakpoints anywhere in the RLlib code and
+have the execution stop there for inspection and debugging.
+
+For logging to your WandB account, use:
+`--wandb-key=[your WandB API key] --wandb-project=[some project name]
+--wandb-run-name=[optional: WandB run name (within the defined project)]`
+
+
+Results to expect
+-----------------
+In the console output, you can see that only a PPO policy that uses curiosity can
+actually learn.
+
+Policy using ICM-based curiosity:
++-------------------------------+------------+-----------------+--------+
+| Trial name                    | status     | loc             |   iter |
+|-------------------------------+------------+-----------------+--------+
+| PPO_FrozenLake-v1_52ab2_00000 | TERMINATED | 127.0.0.1:73318 |    392 |
++-------------------------------+------------+-----------------+--------+
++------------------+--------+----------+--------------------+
+|   total time (s) |     ts |   reward |   episode_len_mean |
+|------------------+--------+----------+--------------------|
+|          236.652 | 786000 |      1.0 |               22.0 |
++------------------+--------+----------+--------------------+
+
+Policy NOT using curiosity:
+[DOES NOT LEARN AT ALL]
+"""
+from collections import defaultdict
+
+import numpy as np
+
+from ray import tune
+from ray.rllib.algorithms.algorithm_config import AlgorithmConfig
+from ray.rllib.callbacks.callbacks import RLlibCallback
+from ray.rllib.connectors.env_to_module import FlattenObservations
+from ray.rllib.examples.learners.classes.intrinsic_curiosity_learners import (
+    DQNTorchLearnerWithCuriosity,
+    PPOTorchLearnerWithCuriosity,
+)
+from ray.rllib.core import DEFAULT_MODULE_ID
+from ray.rllib.core.rl_module.multi_rl_module import MultiRLModuleSpec
+from ray.rllib.core.rl_module.rl_module import RLModuleSpec
+from ray.rllib.examples.learners.classes.intrinsic_curiosity_learners import (
+    ICM_MODULE_ID,
+)
+from ray.rllib.examples.rl_modules.classes.intrinsic_curiosity_model_rlm import (
+    IntrinsicCuriosityModel,
+)
+from ray.rllib.utils.metrics import (
+    ENV_RUNNER_RESULTS,
+    EPISODE_RETURN_MEAN,
+    NUM_ENV_STEPS_SAMPLED_LIFETIME,
+)
+from ray.rllib.utils.test_utils import (
+    add_rllib_example_script_args,
+    run_rllib_example_script_experiment,
+)
+
+parser = add_rllib_example_script_args(
+    default_iters=2000,
+    default_timesteps=10000000,
+    default_reward=0.9,
+)
+parser.set_defaults(enable_new_api_stack=True)
+
+
+class MeasureMaxDistanceToStart(RLlibCallback):
+    """Callback measuring the dist of the agent to its start position in FrozenLake-v1.
+
+    Makes the naive assumption that the start position ("S") is in the upper left
+    corner of the used map.
+    Uses the MetricsLogger to record the (euclidian) distance value.
+    """
+
+    def __init__(self):
+        super().__init__()
+        # Largest distance reached so far, keyed by the id of each running episode.
+        self.max_dists = defaultdict(float)
+        # Largest distance observed so far (updated whenever an episode ends).
+        self.max_dists_lifetime = 0.0
+
+    def on_episode_step(
+        self,
+        *,
+        episode,
+        env_runner,
+        metrics_logger,
+        env,
+        env_index,
+        rl_module,
+        **kwargs,
+    ):
+        """Update the episode's max (euclidian) distance from the upper left cell."""
+        # NOTE(review): reads the grid width from sub-env 0 only; presumably all
+        # sub-envs of the vector env share the same map - confirm.
+        num_cols = env.envs[0].unwrapped.ncol
+        # argmax recovers the cell index, assuming a one-hot encoded observation.
+        obs = np.argmax(episode.get_observations(-1))
+        # Row-major cell index: `obs = row * num_cols + col`.
+        row = obs // num_cols
+        col = obs % num_cols
+        curr_dist = (row**2 + col**2) ** 0.5
+        self.max_dists[episode.id_] = max(self.max_dists[episode.id_], curr_dist)
+
+    def on_episode_end(
+        self,
+        *,
+        episode,
+        env_runner,
+        metrics_logger,
+        env,
+        env_index,
+        rl_module,
+        **kwargs,
+    ):
+        """Log the max distance over running episodes and drop the ended episode."""
+        # Compute current maximum distance across all running episodes
+        # (including the just ended one).
+        max_dist = max(self.max_dists.values(), default=0.0)
+        metrics_logger.log_value(
+            key="max_dist_travelled_across_running_episodes",
+            value=max_dist,
+            window=10,
+        )
+        self.max_dists_lifetime = max(self.max_dists_lifetime, max_dist)
+        # `pop()` (vs `del`) tolerates an episode that ended w/o a recorded step.
+        self.max_dists.pop(episode.id_, None)
+
+    def on_sample_end(self, *, env_runner, metrics_logger, samples, **kwargs):
+        """Log the lifetime maximum distance under `max_dist_travelled_lifetime`."""
+        metrics_logger.log_value(
+            key="max_dist_travelled_lifetime",
+            value=self.max_dists_lifetime,
+            window=1,
+        )
+
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+
+    if args.algo not in ["DQN", "PPO"]:
+        raise ValueError(
+            f"Curiosity example only implemented for DQN or PPO, got {args.algo!r}!"
+        )
+
+    base_config = (
+        tune.registry.get_trainable_cls(args.algo)
+        .get_default_config()
+        .environment(
+            "FrozenLake-v1",
+            env_config={
+                # Use a 12x12 map.
+                "desc": [
+                    "SFFFFFFFFFFF",
+                    "FFFFFFFFFFFF",
+                    "FFFFFFFFFFFF",
+                    "FFFFFFFFFFFF",
+                    "FFFFFFFFFFFF",
+                    "FFFFFFFFFFFF",
+                    "FFFFFFFFFFFF",
+                    "FFFFFFFFFFFF",
+                    "FFFFFFFFFFFF",
+                    "FFFFFFFFFFFF",
+                    "FFFFFFFFFFFF",
+                    "FFFFFFFFFFFG",
+                ],
+                "is_slippery": False,
+                # Limit the number of steps the agent is allowed to make in the env to
+                # make it almost impossible to learn without curiosity.
+                "max_episode_steps": 22,
+            },
+        )
+        .callbacks(MeasureMaxDistanceToStart)
+        .env_runners(
+            num_envs_per_env_runner=5 if args.algo == "PPO" else 1,
+            env_to_module_connector=lambda env: FlattenObservations(),
+        )
+        .training(
+            learner_config_dict={
+                # Intrinsic reward coefficient.
+                "intrinsic_reward_coeff": 0.05,
+                # Forward loss weight (vs inverse dynamics loss). Total ICM loss is:
+                # L(total ICM) = (
+                #     `forward_loss_weight` * L(forward)
+                #     + (1.0 - `forward_loss_weight`) * L(inverse_dyn)
+                # )
+                "forward_loss_weight": 0.2,
+            }
+        )
+        .rl_module(
+            rl_module_spec=MultiRLModuleSpec(
+                rl_module_specs={
+                    # The "main" RLModule (policy) to be trained by our algo.
+                    DEFAULT_MODULE_ID: RLModuleSpec(
+                        **(
+                            {"model_config": {"vf_share_layers": True}}
+                            if args.algo == "PPO"
+                            else {}
+                        ),
+                    ),
+                    # The intrinsic curiosity model.
+                    ICM_MODULE_ID: RLModuleSpec(
+                        module_class=IntrinsicCuriosityModel,
+                        # Only create the ICM on the Learner workers, NOT on the
+                        # EnvRunners.
+                        learner_only=True,
+                        # Configure the architecture of the ICM here.
+                        model_config={
+                            "feature_dim": 288,
+                            "feature_net_hiddens": (256, 256),
+                            "feature_net_activation": "relu",
+                            "inverse_net_hiddens": (256, 256),
+                            "inverse_net_activation": "relu",
+                            "forward_net_hiddens": (256, 256),
+                            "forward_net_activation": "relu",
+                        },
+                    ),
+                }
+            ),
+            # Use a different learning rate for training the ICM.
+            algorithm_config_overrides_per_module={
+                ICM_MODULE_ID: AlgorithmConfig.overrides(lr=0.0005)
+            },
+        )
+    )
+
+    # Set PPO-specific hyper-parameters.
+    if args.algo == "PPO":
+        base_config.training(
+            num_epochs=6,
+            # Plug in the correct Learner class.
+            learner_class=PPOTorchLearnerWithCuriosity,
+            train_batch_size_per_learner=2000,
+            lr=0.0003,
+        )
+    elif args.algo == "DQN":
+        base_config.training(
+            # Plug in the correct Learner class.
+            learner_class=DQNTorchLearnerWithCuriosity,
+            train_batch_size_per_learner=128,
+            lr=0.00075,
+            replay_buffer_config={
+                "type": "PrioritizedEpisodeReplayBuffer",
+                "capacity": 500000,
+                "alpha": 0.6,
+                "beta": 0.4,
+            },
+            # Epsilon exploration schedule for DQN.
+            epsilon=[[0, 1.0], [500000, 0.05]],
+            n_step=(3, 5),
+            double_q=True,
+            dueling=True,
+        )
+
+    success_key = f"{ENV_RUNNER_RESULTS}/max_dist_travelled_across_running_episodes"
+    stop = {
+        success_key: 12.0,
+        f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": args.stop_reward,
+        NUM_ENV_STEPS_SAMPLED_LIFETIME: args.stop_timesteps,
+    }
+
+    run_rllib_example_script_experiment(
+        base_config,
+        args,
+        stop=stop,
+        success_metric={success_key: stop[success_key]},
+    )
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/ray_tune/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/ray_tune/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/ray_tune/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/ray_tune/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dd884835254d5e621d2f58598667eb642a11c00f
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/ray_tune/__pycache__/__init__.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/ray_tune/__pycache__/custom_experiment.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/ray_tune/__pycache__/custom_experiment.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f022146932a898ded253196a1236cbb1ce2b1f85
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/ray_tune/__pycache__/custom_experiment.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/ray_tune/__pycache__/custom_logger.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/ray_tune/__pycache__/custom_logger.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c8696bd7ea00b7c0932bb35d0d69fbac1b7dbdfc
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/ray_tune/__pycache__/custom_logger.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/ray_tune/__pycache__/custom_progress_reporter.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/ray_tune/__pycache__/custom_progress_reporter.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..34f6d53379b109aa70c84432466aafcc389c9799
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/ray_tune/__pycache__/custom_progress_reporter.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/ray_tune/custom_experiment.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/ray_tune/custom_experiment.py
new file mode 100644
index 0000000000000000000000000000000000000000..66ce75c11eb62f2d79813b73bb80fdd8954cbc53
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/ray_tune/custom_experiment.py
@@ -0,0 +1,183 @@
+"""Example of a custom Ray Tune experiment wrapping an RLlib Algorithm.
+
+You should only use such a customized workflow if the following conditions apply:
+- You know exactly what you are doing :)
+- Configuring an existing RLlib Algorithm (e.g. PPO) via its AlgorithmConfig
+is not sufficient and doesn't allow you to shape the Algorithm into behaving the way
+you'd like. Note that for complex, custom evaluation procedures there are many
+AlgorithmConfig options one can use (for more details, see:
+https://github.com/ray-project/ray/blob/master/rllib/examples/evaluation/custom_evaluation.py).  # noqa
+- Subclassing an RLlib Algorithm class and overriding the new class' `training_step`
+method is not sufficient and doesn't allow you to define the algorithm's execution
+logic the way you'd like. See an example here on how to customize the algorithm's
+`training_step()` method:
+https://github.com/ray-project/ray/blob/master/rllib/examples/algorithm/custom_training_step_on_and_off_policy_combined.py  # noqa
+
+
+How to run this script
+----------------------
+`python [script file name].py`
+
+
+Results to expect
+-----------------
+You should see the following output (at the end of the experiment) in your console:
+
+╭───────────────────────────────────────────────────────────────────────────────────────
+│ Trial name                              status         iter     total time (s)      ts
+├───────────────────────────────────────────────────────────────────────────────────────
+│ my_experiment_CartPole-v1_77083_00000   TERMINATED       10            36.7799   60000
+╰───────────────────────────────────────────────────────────────────────────────────────
+╭───────────────────────────────────────────────────────╮
+│     reward    episode_len_mean     episodes_this_iter │
+├───────────────────────────────────────────────────────┤
+│    254.821             254.821                     12 │
+╰───────────────────────────────────────────────────────╯
+evaluation episode returns=[500.0, 500.0, 500.0]
+
+Note that evaluation results (on the CartPole-v1 env) should be close to perfect
+(episode return of ~500.0) as we are acting greedily inside the evaluation procedure.
+"""
+from typing import Dict
+
+import numpy as np
+from ray import train, tune
+from ray.rllib.algorithms.ppo import PPOConfig
+from ray.rllib.utils.framework import try_import_torch
+from ray.rllib.utils.metrics import NUM_ENV_STEPS_SAMPLED_LIFETIME
+
+torch, _ = try_import_torch()
+
+
+def my_experiment(config: Dict):
+
+    # Extract the number of iterations to run from the config.
+    train_iterations = config.pop("train-iterations", 2)
+    eval_episodes_to_do = config.pop("eval-episodes", 1)
+
+    config = (
+        PPOConfig()
+        .update_from_dict(config)
+        .api_stack(enable_rl_module_and_learner=True)
+        .environment("CartPole-v1")
+    )
+
+    # Train for n iterations with high LR.
+    config.training(lr=0.001)
+    algo_high_lr = config.build()
+    for _ in range(train_iterations):
+        train_results = algo_high_lr.train()
+        # Add the phase to the result dict.
+        train_results["phase"] = 1
+        train.report(train_results)
+        phase_high_lr_time = train_results[NUM_ENV_STEPS_SAMPLED_LIFETIME]
+    checkpoint_training_high_lr = algo_high_lr.save()
+    algo_high_lr.stop()
+
+    # Train for n iterations with low LR.
+    config.training(lr=0.00001)
+    algo_low_lr = config.build()
+    # Load state from the high-lr algo into this one.
+    algo_low_lr.restore(checkpoint_training_high_lr)
+    for _ in range(train_iterations):
+        train_results = algo_low_lr.train()
+        # Add the phase to the result dict.
+        train_results["phase"] = 2
+        # Keep the timestep counter moving forward by adding phase 1's env steps.
+        train_results[NUM_ENV_STEPS_SAMPLED_LIFETIME] += phase_high_lr_time
+        train.report(train_results)
+
+    checkpoint_training_low_lr = algo_low_lr.save()
+    algo_low_lr.stop()
+
+    # After training, run a manual evaluation procedure.
+
+    # Set the number of EnvRunners for collecting training data to 0 (local
+    # worker only).
+    config.env_runners(num_env_runners=0)
+
+    eval_algo = config.build()
+    # Load state from the low-lr algo into this one.
+    eval_algo.restore(checkpoint_training_low_lr)
+    # The algo's local worker (SingleAgentEnvRunner) that holds a
+    # gym.vector.Env object and an RLModule for computing actions.
+    local_env_runner = eval_algo.env_runner
+    # Extract the gymnasium env object from the created algo (its local
+    # SingleAgentEnvRunner worker). Note that the env in this single-agent
+    # case is a gymnasium vector env and that we get its first sub-env here.
+    env = local_env_runner.env.unwrapped.envs[0]
+
+    # The RLModule held by the local worker (SingleAgentEnvRunner).
+    rl_module = local_env_runner.module
+
+    # Run a very simple env loop and add up the rewards of each evaluation episode.
+    obs, infos = env.reset()
+    episode_returns = []
+    episode_lengths = []
+    sum_rewards = length = 0
+    num_episodes = 0
+    while num_episodes < eval_episodes_to_do:
+        # Call the RLModule's `forward_inference()` method to compute an
+        # action.
+        rl_module_out = rl_module.forward_inference(
+            {
+                "obs": torch.from_numpy(np.expand_dims(obs, 0)),  # <- add B=1
+            }
+        )
+        action_logits = rl_module_out["action_dist_inputs"][0]  # <- remove B=1
+        action = np.argmax(action_logits.detach().cpu().numpy())  # act greedily
+
+        # Step the env.
+        obs, reward, terminated, truncated, info = env.step(action)
+
+        # Accumulate stats and reset the env, if necessary.
+        sum_rewards += reward
+        length += 1
+        if terminated or truncated:
+            num_episodes += 1
+            episode_returns.append(sum_rewards)
+            episode_lengths.append(length)
+            sum_rewards = length = 0
+            obs, infos = env.reset()
+
+    # Compile evaluation results.
+    eval_results = {
+        "eval_returns": episode_returns,
+        "eval_episode_lengths": episode_lengths,
+    }
+    # Combine the most recent training results with the just collected
+    # evaluation results.
+    results = {**train_results, **eval_results}
+    # Report everything.
+    train.report(results)
+
+
+if __name__ == "__main__":
+    base_config = PPOConfig().environment("CartPole-v1").env_runners(num_env_runners=0)
+    # Convert to a plain dict for Tune. Note that this is usually not needed, you can
+    # pass into the below Tune Tuner any instantiated RLlib AlgorithmConfig object.
+    # However, for demonstration purposes, we show here how you can add other, arbitrary
+    # keys to the plain config dict and then pass these keys to your custom experiment
+    # function.
+    config_dict = base_config.to_dict()
+
+    # Set a special flag signalling `my_experiment` how many training iterations to
+    # perform in each of the two phases (high learning rate, then low learning rate).
+    config_dict["train-iterations"] = 5
+    # Set a special flag signalling `my_experiment` how many episodes to evaluate for.
+    config_dict["eval-episodes"] = 3
+
+    training_function = tune.with_resources(
+        my_experiment,
+        resources=base_config.algo_class.default_resource_request(base_config),
+    )
+
+    tuner = tune.Tuner(
+        training_function,
+        # Pass in your config dict.
+        param_space=config_dict,
+    )
+    results = tuner.fit()
+    best_results = results.get_best_result()
+
+    print(f"evaluation episode returns={best_results.metrics['eval_returns']}")
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/ray_tune/custom_logger.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/ray_tune/custom_logger.py
new file mode 100644
index 0000000000000000000000000000000000000000..9823e47daaec56082a45d8a021af33e52514542e
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/ray_tune/custom_logger.py
@@ -0,0 +1,137 @@
+"""Example showing how to define a custom Logger class for an RLlib Algorithm.
+
+The script uses the AlgorithmConfig's `debugging` API to setup the custom Logger:
+
+```
+config.debugging(logger_config={
+    "type": [some Logger subclass],
+    "ctor_arg1": ...,
+    "ctor_arg2": ...,
+})
+```
+
+All keys other than "type" in the logger_config dict will be passed into the Logger
+class's constructor.
+By default (logger_config=None), RLlib will construct a Ray Tune UnifiedLogger object,
+which logs results to JSON, CSV, and TBX.
+
+NOTE that a custom Logger is different from a custom `ProgressReporter`, which defines
+how the (frequent) outputs to your console will be formatted. To see an example on how
+to write your own progress reporter, see:
+https://github.com/ray-project/ray/tree/master/rllib/examples/ray_tune/custom_progress_reporter.py  # noqa
+
+Below examples include:
+- Disable logging entirely.
+- Using only one of tune's Json, CSV, or TBX loggers.
+- Defining a custom logger (by sub-classing tune.logger.py::Logger).
+
+
+How to run this script
+----------------------
+`python [script file name].py`
+
+
+Results to expect
+-----------------
+You should see log lines similar to the following in your console output. Note that
+these logged lines will mix with the ones produced by Tune's default ProgressReporter.
+See above link on how to setup a custom one.
+
+ABC Avg-return: 20.609375 pi-loss: -0.02921550187703246
+ABC Avg-return: 32.28688524590164 pi-loss: -0.023369029412534572
+ABC Avg-return: 51.92 pi-loss: -0.017113141975661456
+ABC Avg-return: 76.16 pi-loss: -0.01305474770361625
+ABC Avg-return: 100.54 pi-loss: -0.007665307738129169
+ABC Avg-return: 132.33 pi-loss: -0.005010405003325517
+ABC Avg-return: 169.65 pi-loss: -0.008397869592997183
+ABC Avg-return: 203.17 pi-loss: -0.005611495616764371
+Flushing
+Closing
+
+"""
+
+from ray import air, tune
+from ray.rllib.algorithms.ppo import PPOConfig
+from ray.rllib.core import DEFAULT_MODULE_ID
+from ray.rllib.utils.metrics import (
+    ENV_RUNNER_RESULTS,
+    EPISODE_RETURN_MEAN,
+    LEARNER_RESULTS,
+)
+from ray.tune.logger import Logger, LegacyLoggerCallback
+
+
+class MyPrintLogger(Logger):
+    """Logs results by simply printing out everything."""
+
+    def _init(self):
+        # Custom init function.
+        print("Initializing ...")
+        # Setting up our log-line prefix.
+        self.prefix = self.config.get("logger_config").get("prefix")
+
+    def on_result(self, result: dict):
+        # Define what should happen on receiving a `result` (dict).
+        mean_return = result[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN]
+        pi_loss = result[LEARNER_RESULTS][DEFAULT_MODULE_ID]["policy_loss"]
+        print(f"{self.prefix} " f"Avg-return: {mean_return} " f"pi-loss: {pi_loss}")
+
+    def close(self):
+        # Releases all resources used by this logger.
+        print("Closing")
+
+    def flush(self):
+        # Flushing all possible disk writes to permanent storage.
+        print("Flushing", flush=True)
+
+
+if __name__ == "__main__":
+    config = (
+        PPOConfig().environment("CartPole-v1")
+        # Setting up a custom logger config.
+        # ----------------------------------
+        # The following are different examples of custom logging setups:
+        # 1) Disable logging entirely.
+        # "logger_config": {
+        #     # Use the tune.logger.NoopLogger class for no logging.
+        #     "type": "ray.tune.logger.NoopLogger",
+        # },
+        # 2) Use tune's JsonLogger only.
+        # Alternatively, use `CSVLogger` or `TBXLogger` instead of
+        # `JsonLogger` in the "type" key below.
+        # "logger_config": {
+        #     "type": "ray.tune.logger.JsonLogger",
+        #     # Optional: Custom logdir (do not define this here
+        #     # for using ~/ray_results/...).
+        #     "logdir": "/tmp",
+        # },
+        # 3) Custom logger (see `MyPrintLogger` class above).
+        .debugging(
+            logger_config={
+                # Provide the class directly or via fully qualified class
+                # path.
+                "type": MyPrintLogger,
+                # `config` keys:
+                "prefix": "ABC",
+                # Optional: Custom logdir (do not define this here
+                # for using ~/ray_results/...).
+                # "logdir": "/somewhere/on/my/file/system/"
+            }
+        )
+    )
+
+    stop = {f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": 200.0}
+
+    # Run the actual experiment (using Tune).
+    results = tune.Tuner(
+        config.algo_class,
+        param_space=config,
+        run_config=air.RunConfig(
+            stop=stop,
+            verbose=2,
+            # Plugin our own logger.
+            callbacks=[
+                LegacyLoggerCallback([MyPrintLogger]),
+            ],
+        ),
+    ).fit()
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/ray_tune/custom_progress_reporter.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/ray_tune/custom_progress_reporter.py
new file mode 100644
index 0000000000000000000000000000000000000000..092b0710db5746912b1d348d04a6780ee5f108ec
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/ray_tune/custom_progress_reporter.py
@@ -0,0 +1,119 @@
+"""Example showing how to set up a custom progress reporter for an RLlib Algorithm.
+
+The script sets the `progress_reporter` arg in the air.RunConfig and passes that to
+Tune's Tuner:
+
+```
+tune.Tuner(
+    param_space=...,  # <- your RLlib config
+    run_config=air.RunConfig(
+        progress_reporter=[some already instantiated TuneReporterBase object],
+    ),
+)
+```
+
+By default (progress_reporter=None), Tune will construct a default `CLIReporter` object,
+which reports the episode mean return, number of env steps sampled and -trained, and
+the total number of episodes run thus far.
+
+NOTE that a custom progress reporter is different from a custom `Logger`, which defines
+how the (frequent) results are being formatted and written to e.g. a logfile.
+To see an example on how to write your own Logger, see:
+https://github.com/ray-project/ray/tree/master/rllib/examples/ray_tune/custom_logger.py
+
+
+How to run this script
+----------------------
+`python [script file name].py`
+
+
+Results to expect
+-----------------
+You should see something similar to the following in your console output:
+
++---------------------+------------+-----------------+--------+------------------+
+| Trial name          | status     | loc             |   iter |   total time (s) |
+|---------------------+------------+-----------------+--------+------------------+
+| PPO_env_bb503_00000 | TERMINATED | 127.0.0.1:26303 |      5 |          30.3823 |
++---------------------+------------+-----------------+--------+------------------+
++-------+-------------------+------------------+------------------+------------------+
+|    ts |   combined return |   return policy1 |   return policy2 |   return policy3 |
+|-------+-------------------+------------------+------------------+------------------|
+| 20000 |             258.7 |            103.4 |            88.84 |            87.86 |
++-------+-------------------+------------------+------------------+------------------+
+
+"""
+from ray import air, tune
+from ray.air.constants import TRAINING_ITERATION
+from ray.rllib.algorithms.ppo import PPOConfig
+from ray.rllib.examples.envs.classes.multi_agent import MultiAgentCartPole
+from ray.rllib.utils.metrics import (
+    ENV_RUNNER_RESULTS,
+    EPISODE_RETURN_MEAN,
+    NUM_ENV_STEPS_SAMPLED_LIFETIME,
+)
+
+
+my_multi_agent_progress_reporter = tune.CLIReporter(
+    # In the following dict, the keys are the (possibly nested) keys that can be found
+    # in RLlib's (PPO's) result dict, produced at every training iteration, and the
+    # values are the column names you would like to see in your console reports.
+    # Note that for nested result dict keys, you need to use slashes "/" to define the
+    # exact path.
+    metric_columns={
+        **{
+            TRAINING_ITERATION: "iter",
+            "time_total_s": "total time (s)",
+            NUM_ENV_STEPS_SAMPLED_LIFETIME: "ts",
+            # RLlib always sums up all agents' rewards and reports it under:
+            # result_dict[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN].
+            f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": "combined return",
+        },
+        # Because RLlib sums up all returns of all agents, we would like to also
+        # see the individual agents' returns. We can find these under the result dict's
+        # 'env_runners/module_episode_returns_mean/' key (then the policy ID):
+        **{
+            f"{ENV_RUNNER_RESULTS}/module_episode_returns_mean/{pid}": f"return {pid}"
+            for pid in ["policy1", "policy2", "policy3"]
+        },
+    },
+)
+
+
+if __name__ == "__main__":
+    # Force Tuner to use old progress output as the new one silently ignores our custom
+    # `CLIReporter`.
+    # TODO (sven): Find out why we require this hack.
+    import os
+
+    os.environ["RAY_AIR_NEW_OUTPUT"] = "0"
+
+    # Register our multi-agent env with a fixed number of agents.
+    # The agents' IDs are 0, 1, and 2.
+    tune.register_env("env", lambda _: MultiAgentCartPole({"num_agents": 3}))
+
+    config = (
+        PPOConfig()
+        .environment("env")
+        .multi_agent(
+            # Define 3 policies. Note that in our simple setup, they are all configured
+            # the exact same way (with a PPO default RLModule/NN).
+            policies={"policy1", "policy2", "policy3"},
+            # Map agent 0 to "policy1", etc..
+            policy_mapping_fn=lambda agent_id, episode: f"policy{agent_id + 1}",
+        )
+    )
+
+    stop = {f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": 200.0}
+
+    # Run the actual experiment (using Tune).
+    results = tune.Tuner(
+        config.algo_class,
+        param_space=config,
+        run_config=air.RunConfig(
+            stop=stop,
+            verbose=2,
+            # Plugin our own progress reporter.
+            progress_reporter=my_multi_agent_progress_reporter,
+        ),
+    ).fit()
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3f4c86bd3ff501c4175cbf3181af989acf82b57d
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/__pycache__/__init__.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/__pycache__/action_masking_rl_module.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/__pycache__/action_masking_rl_module.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..262c9dc89928211e5258534dd5359b96e66e9a21
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/__pycache__/action_masking_rl_module.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/__pycache__/custom_cnn_rl_module.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/__pycache__/custom_cnn_rl_module.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c75e64039c0d1c542ace1d404869f3ecc87bcfff
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/__pycache__/custom_cnn_rl_module.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/__pycache__/custom_lstm_rl_module.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/__pycache__/custom_lstm_rl_module.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..eca8a43ed68297af92126da58564fdc23e539ea0
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/__pycache__/custom_lstm_rl_module.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/__pycache__/migrate_modelv2_to_new_api_stack_by_config.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/__pycache__/migrate_modelv2_to_new_api_stack_by_config.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cb426f898206288dae05fca65ece00a9e6f566ac
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/__pycache__/migrate_modelv2_to_new_api_stack_by_config.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/__pycache__/migrate_modelv2_to_new_api_stack_by_policy_checkpoint.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/__pycache__/migrate_modelv2_to_new_api_stack_by_policy_checkpoint.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3a78b325863feab2de3b97a6959bd1c9e314aae2
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/__pycache__/migrate_modelv2_to_new_api_stack_by_policy_checkpoint.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/__pycache__/pretraining_single_agent_training_multi_agent.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/__pycache__/pretraining_single_agent_training_multi_agent.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..616459ffdf8f000876aea495a2339f77b586d415
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/__pycache__/pretraining_single_agent_training_multi_agent.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/action_masking_rl_module.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/action_masking_rl_module.py
new file mode 100644
index 0000000000000000000000000000000000000000..fd9984b9aceba10be7660f35cf1107af81a261d8
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/action_masking_rl_module.py
@@ -0,0 +1,127 @@
+"""An example script showing how to define and load an `RLModule` that applies
+action masking.
+
+This example:
+    - Defines an `RLModule` that applies action masking.
+    - It does so by using a `gymnasium.spaces.dict.Dict` observation space
+        with two keys, namely `"observations"`, holding the original observations
+        and `"action_mask"` defining the action mask for the current environment
+        state. Note, by this definition you can wrap any `gymnasium` environment
+        and use it for this module.
+    - Furthermore, it derives its `TorchRLModule` from the `PPOTorchRLModule` and
+        can therefore be easily plugged into our `PPO` algorithm.
+    - It overrides the `forward` methods of the `PPOTorchRLModule` to apply the
+        action masking and it overrides the `_compute_values` method for GAE
+        computation to extract the `"observations"` from the batch `Columns.OBS`
+        key.
+    - It uses the custom `ActionMaskEnv` that defines for each step a new action
+        mask that defines actions that are allowed (1.0) and others that are not
+        (0.0).
+    - It runs 10 iterations with PPO and finishes.
+
+How to run this script
+----------------------
+`python [script file name].py --enable-new-api-stack --num-env-runners 2`
+
+Control the number of `EnvRunner`s with the `--num-env-runners` flag. This
+will increase the sampling speed.
+
+For debugging, use the following additional command line options
+`--no-tune --num-env-runners=0`
+which should allow you to set breakpoints anywhere in the RLlib code and
+have the execution stop there for inspection and debugging.
+
+For logging to your WandB account, use:
+`--wandb-key=[your WandB API key] --wandb-project=[some project name]
+--wandb-run-name=[optional: WandB run name (within the defined project)]`
+
+
+Results to expect
+-----------------
+You should expect a mean episode reward of around 0.35. The environment is a random
+environment paying out random rewards - so the agent cannot learn, but it can obey the
+action mask and should do so (no `AssertionError` should happen).
+After 40,000 environment steps and 10 training iterations the run should stop
+successfully:
+
++-------------------------------+------------+----------------------+--------+
+| Trial name                    | status     | loc                  |   iter |
+|                               |            |                      |        |
+|-------------------------------+------------+----------------------+--------+
+| PPO_ActionMaskEnv_dedc8_00000 | TERMINATED | 192.168.1.178:103298 |     10 |
++-------------------------------+------------+----------------------+--------+
++------------------+------------------------+------------------------+
+|   total time (s) |   num_env_steps_sample |   num_env_steps_traine |
+|                  |             d_lifetime |             d_lifetime |
++------------------+------------------------+------------------------+
+|          57.9207 |                  40000 |                  40000 |
++------------------+------------------------+------------------------+
++------------------------+
+|   num_episodes_lifetim |
+|                      e |
++------------------------|
+|                   3898 |
++------------------------+
+"""
+from gymnasium.spaces import Box, Discrete
+
+from ray.rllib.algorithms.ppo import PPOConfig
+from ray.rllib.core.rl_module.rl_module import RLModuleSpec
+from ray.rllib.examples.envs.classes.action_mask_env import ActionMaskEnv
+from ray.rllib.examples.rl_modules.classes.action_masking_rlm import (
+    ActionMaskingTorchRLModule,
+)
+
+from ray.rllib.utils.test_utils import (
+    add_rllib_example_script_args,
+    run_rllib_example_script_experiment,
+)
+
+
+parser = add_rllib_example_script_args(
+    default_iters=10,
+    default_timesteps=100000,
+    default_reward=150.0,
+)
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+
+    if args.algo != "PPO":
+        raise ValueError("This example only supports PPO. Please use --algo=PPO.")
+
+    base_config = (
+        PPOConfig()
+        .environment(
+            env=ActionMaskEnv,
+            env_config={
+                "action_space": Discrete(100),
+                # This defines the 'original' observation space that is used in the
+                # `RLModule`. The environment will wrap this space into a
+                # `gym.spaces.Dict` together with an 'action_mask' that signals the
+                # `RLModule` to adapt the action distribution inputs for the underlying
+                # `DefaultPPORLModule`.
+                "observation_space": Box(-1.0, 1.0, (5,)),
+            },
+        )
+        .rl_module(
+            # We need to explicitly specify here the RLModule class to use and
+            # the model config needed to build it.
+            rl_module_spec=RLModuleSpec(
+                module_class=ActionMaskingTorchRLModule,
+                model_config={
+                    "head_fcnet_hiddens": [64, 64],
+                    "head_fcnet_activation": "relu",
+                },
+            ),
+        )
+        .evaluation(
+            evaluation_num_env_runners=1,
+            evaluation_interval=1,
+            # Run evaluation parallel to training to speed up the example.
+            evaluation_parallel_to_training=True,
+        )
+    )
+
+    # Run the example (with Tune).
+    run_rllib_example_script_experiment(base_config, args)
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b6525851c1ac8cecc0a4491a2bf04f21969ac921
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__init__.py
@@ -0,0 +1,10 @@
+from ray.rllib.examples.rl_modules.classes.rock_paper_scissors_heuristic_rlm import (
+    AlwaysSameHeuristicRLM,
+    BeatLastHeuristicRLM,
+)
+
+
+__all__ = [
+    "AlwaysSameHeuristicRLM",
+    "BeatLastHeuristicRLM",
+]
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..24e992114ca3c95ffed5418c7d40534ac88aebc0
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__pycache__/__init__.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__pycache__/action_masking_rlm.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__pycache__/action_masking_rlm.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7aaf961ca0ed313a9807bcc4a8f9a814e5c8e1d4
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__pycache__/action_masking_rlm.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__pycache__/autoregressive_actions_rlm.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__pycache__/autoregressive_actions_rlm.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bc0115d45319cbbf4b49479146c42f4455fae9d8
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__pycache__/autoregressive_actions_rlm.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__pycache__/intrinsic_curiosity_model_rlm.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__pycache__/intrinsic_curiosity_model_rlm.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2e095c8d0ffe1d40e967ffeb7dad1c5885f81181
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__pycache__/intrinsic_curiosity_model_rlm.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__pycache__/lstm_containing_rlm.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__pycache__/lstm_containing_rlm.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..94bd74a1bff1d3ddeb4b549dec3ba49861a4af6b
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__pycache__/lstm_containing_rlm.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__pycache__/mobilenet_rlm.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__pycache__/mobilenet_rlm.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c9442c5bff6193d6689decb0231acd2695429e19
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__pycache__/mobilenet_rlm.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__pycache__/modelv2_to_rlm.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__pycache__/modelv2_to_rlm.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cf4d4913065792bc4b37cf20161a96a72f768a12
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__pycache__/modelv2_to_rlm.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__pycache__/random_rlm.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__pycache__/random_rlm.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..573676b1d7f312d5c1f7ac0d4ce35665bcb18f8f
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__pycache__/random_rlm.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__pycache__/rock_paper_scissors_heuristic_rlm.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__pycache__/rock_paper_scissors_heuristic_rlm.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..917c43b3ccc4278de0a66dc30eea53d4659143f7
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__pycache__/rock_paper_scissors_heuristic_rlm.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__pycache__/tiny_atari_cnn_rlm.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__pycache__/tiny_atari_cnn_rlm.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5ac175eea839c3b9f3de0e840702b2f88e15210e
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__pycache__/tiny_atari_cnn_rlm.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__pycache__/vpg_torch_rlm.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__pycache__/vpg_torch_rlm.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3edcfe7ad1134c998bb59d845a8276d3b5c9dced
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__pycache__/vpg_torch_rlm.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__pycache__/vpg_using_shared_encoder_rlm.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__pycache__/vpg_using_shared_encoder_rlm.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e6bc98889ec1e95adbf42fa479eb836086366749
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__pycache__/vpg_using_shared_encoder_rlm.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/action_masking_rlm.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/action_masking_rlm.py
new file mode 100644
index 0000000000000000000000000000000000000000..aee91f203c8790f7688094f79bec5eeeb4fadf33
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/action_masking_rlm.py
@@ -0,0 +1,210 @@
+import gymnasium as gym
+from typing import Dict, Optional, Tuple, Union
+
+from ray.rllib.algorithms.ppo.torch.ppo_torch_rl_module import PPOTorchRLModule
+from ray.rllib.core.columns import Columns
+from ray.rllib.core.rl_module.apis.value_function_api import ValueFunctionAPI
+from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig
+from ray.rllib.core.rl_module.rl_module import RLModule
+from ray.rllib.utils.annotations import override
+from ray.rllib.utils.framework import try_import_torch
+from ray.rllib.utils.torch_utils import FLOAT_MIN
+from ray.rllib.utils.typing import TensorType
+
+torch, nn = try_import_torch()
+
+
+class ActionMaskingRLModule(RLModule):
+    """An RLModule that implements an action masking for safe RL.
+
+    This RLModule implements action masking to avoid unsafe/unwanted actions
+    dependent on the current state (observations). It does so by using an
+    environment generated action mask defining which actions are allowed and
+    which should be avoided. The action mask is extracted from the
+    environment's `gymnasium.spaces.dict.Dict` observation and applied after
+    the module's `forward`-pass to the action logits. The resulting action
+    logits prevent unsafe/unwanted actions to be sampled from the corresponding
+    action distribution.
+
+    Note, this RLModule is implemented for the `PPO` algorithm only. It is not
+    guaranteed to work with other algorithms. Furthermore, not that for this
+    module to work it requires an environment with a `gymnasium.spaces.dict.Dict`
+    observation space containing tow key, `"action_mask"` and `"observations"`.
+    """
+
+    @override(RLModule)
+    def __init__(
+        self,
+        *,
+        observation_space: Optional[gym.Space] = None,
+        action_space: Optional[gym.Space] = None,
+        inference_only: Optional[bool] = None,
+        learner_only: bool = False,
+        model_config: Optional[Union[dict, DefaultModelConfig]] = None,
+        catalog_class=None,
+        **kwargs,
+    ):
+        # If observation space is not of type `Dict` raise an error.
+        if not isinstance(observation_space, gym.spaces.dict.Dict):
+            raise ValueError(
+                "This RLModule requires the environment to provide a "
+                "`gym.spaces.Dict` observation space of the form: \n"
+                " {'action_mask': Box(0.0, 1.0, shape=(self.action_space.n,)),"
+                "  'observation_space': self.observation_space}"
+            )
+
+        # While the environment holds an observation space that contains, both,
+        # the action mask and the original observation space, the 'RLModule'
+        # receives only the `"observation"` element of the space, but not the
+        # action mask.
+        self.observation_space_with_mask = observation_space
+        self.observation_space = observation_space["observations"]
+
+        # Keeps track if observation specs have been checked already.
+        self._checked_observations = False
+
+        # The DefaultPPORLModule, in its constructor will build networks for the
+        # original observation space (i.e. without the action mask).
+        super().__init__(
+            observation_space=self.observation_space,
+            action_space=action_space,
+            inference_only=inference_only,
+            learner_only=learner_only,
+            model_config=model_config,
+            catalog_class=catalog_class,
+            **kwargs,
+        )
+
+
+class ActionMaskingTorchRLModule(ActionMaskingRLModule, PPOTorchRLModule):
+    @override(PPOTorchRLModule)
+    def setup(self):
+        super().setup()
+        # We need to reset here the observation space such that the
+        # super's (`PPOTorchRLModule`) observation space is the
+        # original space (i.e. without the action mask) and `self`'s
+        # observation space contains the action mask.
+        self.observation_space = self.observation_space_with_mask
+
+    @override(PPOTorchRLModule)
+    def _forward_inference(
+        self, batch: Dict[str, TensorType], **kwargs
+    ) -> Dict[str, TensorType]:
+        # Preprocess the original batch to extract the action mask.
+        action_mask, batch = self._preprocess_batch(batch)
+        # Run the forward pass.
+        outs = super()._forward_inference(batch, **kwargs)
+        # Mask the action logits and return.
+        return self._mask_action_logits(outs, action_mask)
+
+    @override(PPOTorchRLModule)
+    def _forward_exploration(
+        self, batch: Dict[str, TensorType], **kwargs
+    ) -> Dict[str, TensorType]:
+        # Preprocess the original batch to extract the action mask.
+        action_mask, batch = self._preprocess_batch(batch)
+        # Run the forward pass.
+        outs = super()._forward_exploration(batch, **kwargs)
+        # Mask the action logits and return.
+        return self._mask_action_logits(outs, action_mask)
+
+    @override(PPOTorchRLModule)
+    def _forward_train(
+        self, batch: Dict[str, TensorType], **kwargs
+    ) -> Dict[str, TensorType]:
+        # Run the forward pass; no preprocessing here (see NOTE in `compute_values`).
+        outs = super()._forward_train(batch, **kwargs)
+        # Mask the logits with the `action_mask` stored in the batch and return.
+        return self._mask_action_logits(outs, batch["action_mask"])
+
+    @override(ValueFunctionAPI)
+    def compute_values(self, batch: Dict[str, TensorType], embeddings=None):
+        # Check, if the observations are still in `dict` form.
+        if isinstance(batch[Columns.OBS], dict):
+            # Preprocess the batch to extract the `observations` to `Columns.OBS`.
+            action_mask, batch = self._preprocess_batch(batch)
+            # NOTE: Because we manipulate the batch we need to add the `action_mask`
+            # to the batch to access it in `_forward_train`.
+            batch["action_mask"] = action_mask
+        # Call the super's method to compute values for GAE.
+        return super().compute_values(batch, embeddings)
+
+    def _preprocess_batch(
+        self, batch: Dict[str, TensorType], **kwargs
+    ) -> Tuple[TensorType, Dict[str, TensorType]]:
+        """Extracts observations and action mask from the batch.
+
+        Args:
+            batch: A dictionary containing tensors (at least `Columns.OBS`).
+
+        Returns:
+            A tuple with the action mask tensor and the batch, modified in place so
+                that `Columns.OBS` holds only the original observations.
+        """
+        # Check observation specs for action mask and observation keys.
+        self._check_batch(batch)
+
+        # Extract the available actions tensor from the observation.
+        action_mask = batch[Columns.OBS].pop("action_mask")
+
+        # Modify the batch for the `DefaultPPORLModule`'s `forward` method, i.e.
+        # pass only `"obs"` into the `forward` method.
+        batch[Columns.OBS] = batch[Columns.OBS].pop("observations")
+
+        # Return the extracted action mask and the modified batch.
+        return action_mask, batch
+
+    def _mask_action_logits(
+        self, batch: Dict[str, TensorType], action_mask: TensorType
+    ) -> Dict[str, TensorType]:
+        """Masks the action logits for the output of `forward` methods.
+
+        Args:
+            batch: A dictionary containing tensors (at least action logits).
+            action_mask: A tensor containing the action mask for the current
+                observations.
+
+        Returns:
+            The same batch, with the mask added in place to the action
+            distribution inputs.
+        """
+        # log(1) = 0.0 keeps allowed logits; log(0) = -inf gets clamped to FLOAT_MIN.
+        inf_mask = torch.clamp(torch.log(action_mask), min=FLOAT_MIN)
+
+        # Add the mask to the logits (in place).
+        batch[Columns.ACTION_DIST_INPUTS] += inf_mask
+
+        # Return the batch with the masked action logits.
+        return batch
+
+    def _check_batch(self, batch: Dict[str, TensorType]) -> Optional[ValueError]:
+        """Check once that the batch includes action mask and observations.
+
+        Args:
+            batch: A dictionary containing tensors (at least `Columns.OBS`) to be
+                checked. After the first passing check, later calls do nothing.
+
+        Raises:
+            ValueError: If `batch[Columns.OBS]` lacks the `"action_mask"` or the
+                `"observations"` key.
+        """
+        if not self._checked_observations:
+            if "action_mask" not in batch[Columns.OBS]:
+                raise ValueError(
+                    "No action mask found in observation. This `RLModule` requires "
+                    "the environment to provide observations that include an "
+                    "action mask (i.e. an observation space of the Dict space "
+                    "type that looks as follows: \n"
+                    "{'action_mask': Box(0.0, 1.0, shape=(self.action_space.n,)),"
+                    "'observations': self.observation_space}"
+                )
+            if "observations" not in batch[Columns.OBS]:
+                raise ValueError(
+                    "No observations found in observation. This 'RLModule` requires "
+                    "the environment to provide observations that include the original "
+                    "observations under a key `'observations'` in a dict (i.e. an "
+                    "observation space of the Dict space type that looks as follows: \n"
+                    "{'action_mask': Box(0.0, 1.0, shape=(self.action_space.n,)),"
+                    "'observations': <observation_space>}"
+                )
+            self._checked_observations = True
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/autoregressive_actions_rlm.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/autoregressive_actions_rlm.py
new file mode 100644
index 0000000000000000000000000000000000000000..e65783ae4a86255e03e693ada33efbb01ea70488
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/autoregressive_actions_rlm.py
@@ -0,0 +1,135 @@
+from typing import Dict
+
+import gymnasium as gym
+
+from ray.rllib.core import Columns
+from ray.rllib.core.rl_module.apis.value_function_api import ValueFunctionAPI
+from ray.rllib.core.rl_module.rl_module import RLModule
+from ray.rllib.core.rl_module.torch.torch_rl_module import TorchRLModule
+from ray.rllib.models.torch.torch_distributions import (
+    TorchCategorical,
+    TorchDiagGaussian,
+    TorchMultiDistribution,
+)
+from ray.rllib.utils.annotations import override
+from ray.rllib.utils.framework import try_import_torch
+from ray.rllib.utils.torch_utils import one_hot
+from ray.rllib.utils.typing import TensorType
+
+torch, nn = try_import_torch()
+
+
+class AutoregressiveActionsRLM(TorchRLModule, ValueFunctionAPI):
+    """An RLModule that uses an autoregressive action distribution.
+
+    Actions are sampled in two steps. The first (prior) action component is sampled from
+    a categorical distribution. Then, the second (posterior) action component is sampled
+    from a posterior distribution that depends on the first action component and the
+    other input data (observations).
+
+    Note, this RLModule works in combination with any algorithm, whose Learners require
+    the `ValueFunctionAPI`.
+    """
+
+    @override(RLModule)
+    def setup(self):
+        super().setup()
+
+        # Assert the action space is correct.
+        assert isinstance(self.action_space, gym.spaces.Tuple)
+        assert isinstance(self.action_space[0], gym.spaces.Discrete)
+        assert self.action_space[0].n == 3
+        assert isinstance(self.action_space[1], gym.spaces.Box)
+
+        self._prior_net = nn.Sequential(
+            nn.Linear(
+                in_features=self.observation_space.shape[0],
+                out_features=256,
+            ),
+            nn.Tanh(),
+            nn.Linear(in_features=256, out_features=self.action_space[0].n),
+        )
+
+        self._posterior_net = nn.Sequential(
+            nn.Linear(
+                in_features=self.observation_space.shape[0] + self.action_space[0].n,
+                out_features=256,
+            ),
+            nn.Tanh(),
+            nn.Linear(in_features=256, out_features=self.action_space[1].shape[0] * 2),
+        )
+
+        # Build the value function head.
+        self._value_net = nn.Sequential(
+            nn.Linear(
+                in_features=self.observation_space.shape[0],
+                out_features=256,
+            ),
+            nn.Tanh(),
+            nn.Linear(in_features=256, out_features=1),
+        )
+
+    @override(TorchRLModule)
+    def _forward_inference(self, batch: Dict[str, TensorType]) -> Dict[str, TensorType]:
+        return self._pi(batch[Columns.OBS], inference=True)
+
+    @override(TorchRLModule)
+    def _forward_exploration(
+        self, batch: Dict[str, TensorType], **kwargs
+    ) -> Dict[str, TensorType]:
+        return self._pi(batch[Columns.OBS], inference=False)
+
+    @override(TorchRLModule)
+    def _forward_train(self, batch: Dict[str, TensorType]) -> Dict[str, TensorType]:
+        return self._forward_exploration(batch)
+
+    @override(ValueFunctionAPI)
+    def compute_values(self, batch: Dict[str, TensorType], embeddings=None):
+        # Value function forward pass.
+        vf_out = self._value_net(batch[Columns.OBS])
+        # Squeeze out last dimension (single node value head).
+        return vf_out.squeeze(-1)
+
+    def _pi(self, obs, inference: bool):
+        # Prior forward pass.
+        prior_out = self._prior_net(obs)
+        dist_a1 = TorchCategorical.from_logits(prior_out)
+
+        # If in inference mode, we need to set the distribution to be deterministic.
+        if inference:
+            dist_a1 = dist_a1.to_deterministic()
+        # Sample a1.
+        a1 = dist_a1.sample()
+
+        # Posterior forward pass.
+        posterior_batch = torch.cat(
+            [obs, one_hot(a1, self.action_space[0])],
+            dim=-1,
+        )
+        posterior_out = self._posterior_net(posterior_batch)
+        dist_a2 = TorchDiagGaussian.from_logits(posterior_out)
+        if inference:
+            dist_a2 = dist_a2.to_deterministic()
+
+        a2 = dist_a2.sample()
+
+        actions = (a1, a2)
+
+        # We need the log-probabilities for the loss.
+        outputs = {
+            Columns.ACTION_LOGP: (
+                TorchMultiDistribution((dist_a1, dist_a2)).logp(actions)
+            ),
+            Columns.ACTION_DIST_INPUTS: torch.cat([prior_out, posterior_out], dim=-1),
+            # Concatenate the prior and posterior actions and log probabilities.
+            Columns.ACTIONS: actions,
+        }
+
+        return outputs
+
+    @override(TorchRLModule)
+    def get_inference_action_dist_cls(self):
+        return TorchMultiDistribution.get_partial_dist_cls(
+            child_distribution_cls_struct=(TorchCategorical, TorchDiagGaussian),
+            input_lens=(3, 2),
+        )
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/intrinsic_curiosity_model_rlm.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/intrinsic_curiosity_model_rlm.py
new file mode 100644
index 0000000000000000000000000000000000000000..efa3fcdb1d6ba36c07039c466489acce5c731cea
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/intrinsic_curiosity_model_rlm.py
@@ -0,0 +1,240 @@
+from typing import Any, Dict, TYPE_CHECKING
+
+import tree  # pip install dm_tree
+
+from ray.rllib.core.columns import Columns
+from ray.rllib.core.rl_module.apis import SelfSupervisedLossAPI
+from ray.rllib.core.rl_module.torch import TorchRLModule
+from ray.rllib.models.utils import get_activation_fn
+from ray.rllib.utils.annotations import override
+from ray.rllib.utils.framework import try_import_torch
+from ray.rllib.utils.torch_utils import one_hot
+from ray.rllib.utils.typing import ModuleID
+
+if TYPE_CHECKING:
+    from ray.rllib.algorithms.algorithm_config import AlgorithmConfig
+    from ray.rllib.core.learner.torch.torch_learner import TorchLearner
+
+torch, nn = try_import_torch()
+
+
+class IntrinsicCuriosityModel(TorchRLModule, SelfSupervisedLossAPI):
+    """An intrinsic curiosity model (ICM) as TorchRLModule for better exploration.
+
+    For more details, see:
+    [1] Curiosity-driven Exploration by Self-supervised Prediction
+    Pathak, Agrawal, Efros, and Darrell - UC Berkeley - ICML 2017.
+    https://arxiv.org/pdf/1705.05363.pdf
+
+    Learns a simplified model of the environment based on three networks:
+    1) Embedding observations into latent space ("feature" network).
+    2) Predicting the action, given two consecutive embedded observations
+    ("inverse" network).
+    3) Predicting the next embedded obs, given an obs and action
+    ("forward" network).
+
+    The less the agent is able to predict the actually observed next feature
+    vector, given obs and action (through the forward network), the larger the
+    "intrinsic reward", which will be added to the extrinsic reward.
+    Therefore, if a state transition was unexpected, the agent becomes
+    "curious" and will further explore this transition leading to better
+    exploration in sparse rewards environments.
+
+    .. testcode::
+
+            import numpy as np
+            import gymnasium as gym
+            import torch
+
+            from ray.rllib.core import Columns
+            from ray.rllib.examples.rl_modules.classes.intrinsic_curiosity_model_rlm import (  # noqa
+                IntrinsicCuriosityModel
+            )
+
+            B = 10  # batch size
+            O = 4  # obs (1D) dim
+            A = 2  # num actions
+            f = 25  # feature dim
+
+            # Construct the RLModule.
+            icm_net = IntrinsicCuriosityModel(
+                observation_space=gym.spaces.Box(-1.0, 1.0, (O,), np.float32),
+                action_space=gym.spaces.Discrete(A),
+            )
+
+            # Create some dummy input.
+            obs = torch.from_numpy(
+                np.random.random_sample(size=(B, O)).astype(np.float32)
+            )
+            next_obs = torch.from_numpy(
+                np.random.random_sample(size=(B, O)).astype(np.float32)
+            )
+            actions = torch.from_numpy(
+                np.random.randint(0, A, size=(B,))
+            )
+            input_dict = {
+                Columns.OBS: obs,
+                Columns.NEXT_OBS: next_obs,
+                Columns.ACTIONS: actions,
+            }
+
+            # Call `forward_train()` to get phi (feature vector from obs), next-phi
+            # (feature vector from next obs), and the intrinsic rewards (individual, per
+            # batch-item forward loss values).
+            print(icm_net.forward_train(input_dict))
+
+            # Print out the number of parameters.
+            num_all_params = sum(int(np.prod(p.size())) for p in icm_net.parameters())
+            print(f"num params = {num_all_params}")
+    """
+
+    @override(TorchRLModule)
+    def setup(self):
+        # Get the ICM architecture settings from the `model_config` attribute:
+        cfg = self.model_config
+
+        feature_dim = cfg.get("feature_dim", 288)
+
+        # Build the feature model (encoder of observations to feature space).
+        layers = []
+        dense_layers = cfg.get("feature_net_hiddens", (256, 256))
+        # `in_size` is the observation space (assume a simple Box(1D)).
+        in_size = self.observation_space.shape[0]
+        for out_size in dense_layers:
+            layers.append(nn.Linear(in_size, out_size))
+            if cfg.get("feature_net_activation") not in [None, "linear"]:
+                layers.append(
+                    get_activation_fn(cfg["feature_net_activation"], "torch")()
+                )
+            in_size = out_size
+        # Last feature layer of n nodes (feature dimension).
+        layers.append(nn.Linear(in_size, feature_dim))
+        self._feature_net = nn.Sequential(*layers)
+
+        # Build the inverse model (predicting the action between two observations).
+        layers = []
+        dense_layers = cfg.get("inverse_net_hiddens", (256,))
+        # `in_size` is 2x the feature dim.
+        in_size = feature_dim * 2
+        for out_size in dense_layers:
+            layers.append(nn.Linear(in_size, out_size))
+            if cfg.get("inverse_net_activation") not in [None, "linear"]:
+                layers.append(
+                    get_activation_fn(cfg["inverse_net_activation"], "torch")()
+                )
+            in_size = out_size
+        # Last layer of n nodes (one per action in the action space).
+        layers.append(nn.Linear(in_size, self.action_space.n))
+        self._inverse_net = nn.Sequential(*layers)
+
+        # Build the forward model (predicting the next observation from current one and
+        # action).
+        layers = []
+        dense_layers = cfg.get("forward_net_hiddens", (256,))
+        # `in_size` is the feature dim + action space (one-hot).
+        in_size = feature_dim + self.action_space.n
+        for out_size in dense_layers:
+            layers.append(nn.Linear(in_size, out_size))
+            if cfg.get("forward_net_activation") not in [None, "linear"]:
+                layers.append(
+                    get_activation_fn(cfg["forward_net_activation"], "torch")()
+                )
+            in_size = out_size
+        # Last feature layer of n nodes (feature dimension).
+        layers.append(nn.Linear(in_size, feature_dim))
+        self._forward_net = nn.Sequential(*layers)
+
+    @override(TorchRLModule)
+    def _forward_train(self, batch, **kwargs):
+        # Push both observations through feature net to get feature vectors (phis).
+        # We cat/batch them here for efficiency reasons (save one forward pass).
+        obs = tree.map_structure(
+            lambda obs, next_obs: torch.cat([obs, next_obs], dim=0),
+            batch[Columns.OBS],
+            batch[Columns.NEXT_OBS],
+        )
+        phis = self._feature_net(obs)
+        # Split again to yield 2 individual phi tensors.
+        phi, next_phi = torch.chunk(phis, 2)
+
+        # Predict next feature vector (next_phi) with forward model (using phi and
+        # one-hot actions).
+        predicted_next_phi = self._forward_net(
+            torch.cat(
+                [
+                    phi,
+                    one_hot(batch[Columns.ACTIONS].long(), self.action_space).float(),
+                ],
+                dim=-1,
+            )
+        )
+
+        # Forward loss term: Predicted phi - given phi and action - vs actually observed
+        # phi (0.5 * squared L2 distance). Note that this is the intrinsic reward that
+        # will be used and the mean of this is the forward net loss.
+        forward_l2_norm_sqrt = 0.5 * torch.sum(
+            torch.pow(predicted_next_phi - next_phi, 2.0), dim=-1
+        )
+
+        output = {
+            Columns.INTRINSIC_REWARDS: forward_l2_norm_sqrt,
+            # Computed feature vectors (used to compute the losses later).
+            "phi": phi,
+            "next_phi": next_phi,
+        }
+
+        return output
+
+    @override(SelfSupervisedLossAPI)
+    def compute_self_supervised_loss(
+        self,
+        *,
+        learner: "TorchLearner",
+        module_id: ModuleID,
+        config: "AlgorithmConfig",
+        batch: Dict[str, Any],
+        fwd_out: Dict[str, Any],
+    ) -> Dict[str, Any]:
+        module = learner.module[module_id].unwrapped()
+
+        # Forward net loss (mean over the per-item intrinsic rewards).
+        forward_loss = torch.mean(fwd_out[Columns.INTRINSIC_REWARDS])
+
+        # Inverse loss term (predicted action that led from phi to phi' vs
+        # actual action taken).
+        dist_inputs = module._inverse_net(
+            torch.cat([fwd_out["phi"], fwd_out["next_phi"]], dim=-1)
+        )
+        action_dist = module.get_train_action_dist_cls().from_logits(dist_inputs)
+
+        # Neg log(p); p=probability of observed action given the inverse-NN
+        # predicted action distribution.
+        inverse_loss = -action_dist.logp(batch[Columns.ACTIONS])
+        inverse_loss = torch.mean(inverse_loss)
+
+        # ICM loss. NOTE(review): this is returned as-is, not wrapped in a dict.
+        total_loss = (
+            config.learner_config_dict["forward_loss_weight"] * forward_loss
+            + (1.0 - config.learner_config_dict["forward_loss_weight"]) * inverse_loss
+        )
+
+        learner.metrics.log_dict(
+            {
+                "mean_intrinsic_rewards": forward_loss,
+                "forward_loss": forward_loss,
+                "inverse_loss": inverse_loss,
+            },
+            key=module_id,
+            window=1,
+        )
+
+        return total_loss
+
+    # Inference and exploration not supported (this is a world-model that should only
+    # be used for training).
+    @override(TorchRLModule)
+    def _forward(self, batch, **kwargs):
+        raise NotImplementedError(
+            "`IntrinsicCuriosityModel` should only be used for training! "
+            "Only calls to `forward_train()` supported."
+        )
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/lstm_containing_rlm.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/lstm_containing_rlm.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e05bd2c4c7f2aaf2e02eb4df834a85f2ec661f1
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/lstm_containing_rlm.py
@@ -0,0 +1,152 @@
+from typing import Any, Dict, Optional
+
+import numpy as np
+
+from ray.rllib.core.columns import Columns
+from ray.rllib.core.rl_module.apis.value_function_api import ValueFunctionAPI
+from ray.rllib.core.rl_module.torch import TorchRLModule
+from ray.rllib.utils.annotations import override
+from ray.rllib.utils.framework import try_import_torch
+from ray.rllib.utils.typing import TensorType
+
+torch, nn = try_import_torch()
+
+
+class LSTMContainingRLModule(TorchRLModule, ValueFunctionAPI):
+    """An example TorchRLModule that contains an LSTM layer.
+
+    .. testcode::
+
+        import numpy as np
+        import gymnasium as gym
+
+        B = 10  # batch size
+        T = 5  # seq len
+        e = 25  # embedding dim
+        CELL = 32  # LSTM cell size
+
+        # Construct the RLModule.
+        my_net = LSTMContainingRLModule(
+            observation_space=gym.spaces.Box(-1.0, 1.0, (e,), np.float32),
+            action_space=gym.spaces.Discrete(4),
+            model_config={"lstm_cell_size": CELL}
+        )
+
+        # Create some dummy input.
+        obs = torch.from_numpy(
+            np.random.random_sample(size=(B, T, e)
+        ).astype(np.float32))
+        state_in = my_net.get_initial_state()
+        # Repeat state_in across batch.
+        state_in = tree.map_structure(
+            lambda s: torch.from_numpy(s).unsqueeze(0).repeat(B, 1), state_in
+        )
+        input_dict = {
+            Columns.OBS: obs,
+            Columns.STATE_IN: state_in,
+        }
+
+        # Run through all 3 forward passes.
+        print(my_net.forward_inference(input_dict))
+        print(my_net.forward_exploration(input_dict))
+        print(my_net.forward_train(input_dict))
+
+        # Print out the number of parameters.
+        num_all_params = sum(int(np.prod(p.size())) for p in my_net.parameters())
+        print(f"num params = {num_all_params}")
+    """
+
+    @override(TorchRLModule)
+    def setup(self):
+        """Use this method to create all the model components that you require.
+
+        Feel free to access the following useful properties in this class:
+        - `self.model_config`: The config dict for this RLModule class,
+        which should contain flexible settings, for example: {"hiddens": [256, 256]}.
+        - `self.observation|action_space`: The observation and action space that
+        this RLModule is subject to. Note that the observation space might not be the
+        exact space from your env, but that it might have already gone through
+        preprocessing through a connector pipeline (for example, flattening,
+        frame-stacking, mean/std-filtering, etc..).
+        """
+        # Assume a simple Box(1D) tensor as input shape.
+        in_size = self.observation_space.shape[0]
+
+        # Get the LSTM cell size from the `model_config` attribute:
+        self._lstm_cell_size = self.model_config.get("lstm_cell_size", 256)
+        self._lstm = nn.LSTM(in_size, self._lstm_cell_size, batch_first=True)
+        in_size = self._lstm_cell_size
+
+        # Build a sequential stack.
+        layers = []
+        # Get the dense layer pre-stack configuration from the same config dict.
+        dense_layers = self.model_config.get("dense_layers", [128, 128])
+        for out_size in dense_layers:
+            # Dense layer.
+            layers.append(nn.Linear(in_size, out_size))
+            # ReLU activation.
+            layers.append(nn.ReLU())
+            in_size = out_size
+
+        self._fc_net = nn.Sequential(*layers)
+
+        # Logits layer (plain linear layer, no activation).
+        self._pi_head = nn.Linear(in_size, self.action_space.n)
+        # Single-node value layer.
+        self._values = nn.Linear(in_size, 1)
+
+    @override(TorchRLModule)
+    def get_initial_state(self) -> Any:
+        return {
+            "h": np.zeros(shape=(self._lstm_cell_size,), dtype=np.float32),
+            "c": np.zeros(shape=(self._lstm_cell_size,), dtype=np.float32),
+        }
+
+    @override(TorchRLModule)
+    def _forward(self, batch, **kwargs):
+        # Compute the basic 1D embedding tensor (inputs to policy- and value-heads).
+        embeddings, state_outs = self._compute_embeddings_and_state_outs(batch)
+        logits = self._pi_head(embeddings)
+
+        # Return logits as ACTION_DIST_INPUTS (categorical distribution).
+        # Note that the default `GetActions` connector piece (in the EnvRunner) will
+        # take care of argmax-"sampling" from the logits to yield the inference (greedy)
+        # action.
+        return {
+            Columns.ACTION_DIST_INPUTS: logits,
+            Columns.STATE_OUT: state_outs,
+        }
+
+    @override(TorchRLModule)
+    def _forward_train(self, batch, **kwargs):
+        # Same logic as _forward, but also return embeddings to be used by value
+        # function branch during training.
+        embeddings, state_outs = self._compute_embeddings_and_state_outs(batch)
+        logits = self._pi_head(embeddings)
+        return {
+            Columns.ACTION_DIST_INPUTS: logits,
+            Columns.STATE_OUT: state_outs,
+            Columns.EMBEDDINGS: embeddings,
+        }
+
+    # We implement this RLModule as a ValueFunctionAPI RLModule, so it can be used
+    # by algorithms that require a value function, like PPO or IMPALA.
+    @override(ValueFunctionAPI)
+    def compute_values(
+        self, batch: Dict[str, Any], embeddings: Optional[Any] = None
+    ) -> TensorType:
+        if embeddings is None:
+            embeddings, _ = self._compute_embeddings_and_state_outs(batch)
+        values = self._values(embeddings).squeeze(-1)
+        return values
+
+    def _compute_embeddings_and_state_outs(self, batch):
+        obs = batch[Columns.OBS]
+        state_in = batch[Columns.STATE_IN]
+        h, c = state_in["h"], state_in["c"]
+        # Unsqueeze the layer dim (we only have 1 LSTM layer).
+        embeddings, (h, c) = self._lstm(obs, (h.unsqueeze(0), c.unsqueeze(0)))
+        # Push through our FC net.
+        embeddings = self._fc_net(embeddings)
+        # Squeeze the layer dim (we only have 1 LSTM layer).
+        return embeddings, {"h": h.squeeze(0), "c": c.squeeze(0)}
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/mobilenet_rlm.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/mobilenet_rlm.py
new file mode 100644
index 0000000000000000000000000000000000000000..104792fda3fc76b89d99752b86a9d877814fa5d9
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/mobilenet_rlm.py
@@ -0,0 +1,80 @@
+"""
+This example shows how to take full control over what models and action distribution
+are being built inside an RL Module. With this pattern, we can bypass a Catalog and
+explicitly define our own models within a given RL Module.
+"""
+# __sphinx_doc_begin__
+import gymnasium as gym
+import numpy as np
+
+from ray.rllib.algorithms.ppo.ppo import PPOConfig
+from ray.rllib.algorithms.ppo.torch.default_ppo_torch_rl_module import (
+    DefaultPPOTorchRLModule,
+)
+from ray.rllib.core.models.configs import MLPHeadConfig
+from ray.rllib.core.rl_module.rl_module import RLModuleSpec
+from ray.rllib.examples.envs.classes.random_env import RandomEnv
+from ray.rllib.examples._old_api_stack.models.mobilenet_v2_encoder import (
+    MobileNetV2EncoderConfig,
+    MOBILENET_INPUT_SHAPE,
+)
+from ray.rllib.core.models.configs import ActorCriticEncoderConfig
+
+
+class MobileNetTorchPPORLModule(DefaultPPOTorchRLModule):
+    """A DefaultPPORLModule with mobilenet v2 as an encoder.
+
+    This class demonstrates how to bypass the catalog and take full control over
+    which models get built. Here, an existing RLModule is modified by giving it a
+    custom encoder plus hand-configured policy and value heads.
+    """
+
+    def setup(self):
+        base_encoder_cfg = MobileNetV2EncoderConfig()
+        # PPO is an actor-critic algorithm, hence the base encoder config has to be
+        # wrapped by an ActorCriticEncoderConfig before the encoder gets built.
+        self.encoder = ActorCriticEncoderConfig(
+            base_encoder_config=base_encoder_cfg
+        ).build(framework="torch")
+
+        # Both heads are sized to the base encoder config's `output_dims`.
+        head_input_dims = base_encoder_cfg.output_dims
+
+        # Policy head with 2 output nodes.
+        self.pi = MLPHeadConfig(
+            input_dims=head_input_dims,
+            output_layer_dim=2,
+        ).build(framework="torch")
+
+        # Value head with a single output node.
+        self.vf = MLPHeadConfig(
+            input_dims=head_input_dims,
+            output_layer_dim=1,
+        ).build(framework="torch")
+
+
+config = (
+    PPOConfig()
+    .rl_module(rl_module_spec=RLModuleSpec(module_class=MobileNetTorchPPORLModule))
+    .environment(
+        RandomEnv,
+        env_config={
+            "action_space": gym.spaces.Discrete(2),
+            # Image observations: float32 in [0.0, 1.0], shaped MOBILENET_INPUT_SHAPE.
+            "observation_space": gym.spaces.Box(
+                0.0,
+                1.0,
+                shape=MOBILENET_INPUT_SHAPE,
+                dtype=np.float32,
+            ),
+        },
+    )
+    .env_runners(num_env_runners=0)
+    # The following training settings make a training iteration very quick,
+    # which is all this example is after. PPO will NOT learn properly with these
+    # settings!
+    .training(train_batch_size_per_learner=32, minibatch_size=16, num_epochs=1)
+)
+
+config.build().train()  # Build from the config, then call `train()` once.
+# __sphinx_doc_end__
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/modelv2_to_rlm.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/modelv2_to_rlm.py
new file mode 100644
index 0000000000000000000000000000000000000000..5efbead7e66f01fa2dc5edfa4fc7afa85991e5c6
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/modelv2_to_rlm.py
@@ -0,0 +1,206 @@
+import pathlib
+from typing import Any, Dict, Optional
+
+import tree
+from ray.rllib.core import Columns, DEFAULT_POLICY_ID
+from ray.rllib.core.rl_module.apis import ValueFunctionAPI
+from ray.rllib.core.rl_module.torch import TorchRLModule
+from ray.rllib.models.torch.torch_distributions import (
+    TorchCategorical,
+    TorchDiagGaussian,
+    TorchMultiCategorical,
+    TorchMultiDistribution,
+    TorchSquashedGaussian,
+)
+from ray.rllib.models.torch.torch_action_dist import (
+    TorchCategorical as OldTorchCategorical,
+    TorchDiagGaussian as OldTorchDiagGaussian,
+    TorchMultiActionDistribution as OldTorchMultiActionDistribution,
+    TorchMultiCategorical as OldTorchMultiCategorical,
+    TorchSquashedGaussian as OldTorchSquashedGaussian,
+)
+from ray.rllib.policy.torch_policy_v2 import TorchPolicyV2
+from ray.rllib.utils.annotations import override
+from ray.rllib.utils.framework import try_import_torch
+
+torch, _ = try_import_torch()
+
+
+class ModelV2ToRLModule(TorchRLModule, ValueFunctionAPI):
+    """An RLModule containing a (old stack) ModelV2.
+
+    The `ModelV2` may be defined either through
+    - an existing Policy checkpoint
+    - an existing Algorithm checkpoint (and a policy ID or "default_policy")
+    - or through an AlgorithmConfig object
+
+    The ModelV2 is created in the `setup` and continues to live through the lifetime
+    of the RLModule.
+    """
+
+    @override(TorchRLModule)
+    def setup(self):
+        # Try extracting the policy ID from this RLModule's config dict.
+        policy_id = self.model_config.get("policy_id", DEFAULT_POLICY_ID)
+
+        # Try getting the algorithm checkpoint from the `model_config`.
+        algo_checkpoint_dir = self.model_config.get("algo_checkpoint_dir")
+        if algo_checkpoint_dir:
+            algo_checkpoint_dir = pathlib.Path(algo_checkpoint_dir)
+            if not algo_checkpoint_dir.is_dir():
+                raise ValueError(
+                    "The `model_config` of your RLModule must contain a "
+                    "`algo_checkpoint_dir` key pointing to the algo checkpoint "
+                    "directory! You can find this dir inside the results dir of your "
+                    "experiment. You can then add this path "
+                    "through `config.rl_module(model_config={"
+                    "'algo_checkpoint_dir': [your algo checkpoint dir]})`."
+                )
+            policy_checkpoint_dir = algo_checkpoint_dir / "policies" / policy_id
+        # Try getting the policy checkpoint from the `model_config`.
+        else:
+            policy_checkpoint_dir = self.model_config.get("policy_checkpoint_dir")
+
+        # Create the ModelV2 from the Policy.
+        if policy_checkpoint_dir:
+            policy_checkpoint_dir = pathlib.Path(policy_checkpoint_dir)
+            if not policy_checkpoint_dir.is_dir():
+                raise ValueError(
+                    "The `model_config` of your RLModule must contain a "
+                    "`policy_checkpoint_dir` key pointing to the policy checkpoint "
+                    "directory! You can find this dir under the Algorithm's checkpoint "
+                    "dir in subdirectory: [algo checkpoint dir]/policies/[policy ID "
+                    "ex. `default_policy`]. You can then add this path through `config"
+                    ".rl_module(model_config={'policy_checkpoint_dir': "
+                    "[your policy checkpoint dir]})`."
+                )
+            # Create a temporary policy object.
+            policy = TorchPolicyV2.from_checkpoint(policy_checkpoint_dir)
+        # Create the ModelV2 from scratch using the config.
+        else:
+            config = self.model_config.get("old_api_stack_algo_config")
+            if not config:
+                raise ValueError(
+                    "The `model_config` of your RLModule must contain a "
+                    "`old_api_stack_algo_config` key with a AlgorithmConfig object in "
+                    "it that contains all the settings that would be necessary to "
+                    "construct a old API stack Algorithm/Policy/ModelV2! You can add "
+                    "this setting through `config.rl_module(model_config={"
+                    "'old_api_stack_algo_config': [your old config]})`."
+                )
+            # Get the multi-agent policies dict.
+            policy_dict, _ = config.get_multi_agent_setup(
+                spaces={
+                    policy_id: (self.observation_space, self.action_space),
+                },
+                default_policy_class=config.algo_class.get_default_policy_class(config),
+            )
+            config = config.to_dict()
+            config["__policy_id"] = policy_id
+            policy = policy_dict[policy_id].policy_class(
+                self.observation_space,
+                self.action_space,
+                config,
+            )
+
+        self._model_v2 = policy.model
+
+        # Translate the action dist classes from the old API stack to the new.
+        self.action_dist_class = self._translate_dist_class(policy.dist_class)
+
+        # Erase the torch policy from memory, so it can be garbage collected.
+        del policy
+
+    @override(TorchRLModule)
+    def _forward_inference(self, batch: Dict[str, Any], **kwargs) -> Dict[str, Any]:
+        return self._forward_pass(batch, inference=True)
+
+    @override(TorchRLModule)
+    def _forward_exploration(self, batch: Dict[str, Any], **kwargs) -> Dict[str, Any]:
+        return self._forward_inference(batch, **kwargs)
+
+    @override(TorchRLModule)
+    def _forward_train(self, batch: Dict[str, Any], **kwargs) -> Dict[str, Any]:
+        out = self._forward_pass(batch, inference=False)
+        out[Columns.ACTION_LOGP] = self.get_train_action_dist_cls()(
+            out[Columns.ACTION_DIST_INPUTS]
+        ).logp(batch[Columns.ACTIONS])
+        out[Columns.VF_PREDS] = self._model_v2.value_function()
+        if Columns.STATE_IN in batch and Columns.SEQ_LENS in batch:
+            out[Columns.VF_PREDS] = torch.reshape(
+                out[Columns.VF_PREDS], [len(batch[Columns.SEQ_LENS]), -1]
+            )
+        return out
+
+    def _forward_pass(self, batch, inference=True):
+        # Translate states and seq_lens into old API stack formats.
+        batch = batch.copy()
+        state_in = batch.pop(Columns.STATE_IN, {})
+        state_in = [s for _, s in sorted(state_in.items())]
+        seq_lens = batch.pop(Columns.SEQ_LENS, None)
+
+        if state_in:
+            if inference and seq_lens is None:
+                seq_lens = torch.tensor(
+                    [1.0] * state_in[0].shape[0], device=state_in[0].device
+                )
+            elif not inference:
+                assert seq_lens is not None
+            # A recurrent ModelV2 adds and removes the time-rank itself (whereas in the
+            # new API stack, the connector pipelines are responsible for doing this) ->
+            # We have to remove the time rank here (and re-add it to the output
+            # further below) to make ModelV2 work.
+            batch = tree.map_structure(
+                lambda s: torch.reshape(s, [-1] + list(s.shape[2:])), batch
+            )
+        nn_output, state_out = self._model_v2(batch, state_in, seq_lens)
+        # Put the time rank back into the nn-output.
+        if state_in:
+            if inference:
+                nn_output = tree.map_structure(
+                    lambda s: torch.unsqueeze(s, axis=1), nn_output
+                )
+            else:
+                nn_output = tree.map_structure(
+                    lambda s: torch.reshape(s, [len(seq_lens), -1] + list(s.shape[1:])),
+                    nn_output,
+                )
+        # Interpret the NN output as action logits.
+        output = {Columns.ACTION_DIST_INPUTS: nn_output}
+        # Add the `state_out` to the `output`, new API stack style: a dict keyed by
+        # each state's index in the returned list. When no states were returned, no
+        # STATE_OUT key is added at all.
+        if state_out:
+            output[Columns.STATE_OUT] = dict(enumerate(state_out))
+
+        return output
+
+    @override(ValueFunctionAPI)
+    def compute_values(self, batch: Dict[str, Any], embeddings: Optional[Any] = None):
+        self._forward_pass(batch, inference=False)
+        v_preds = self._model_v2.value_function()
+        if Columns.STATE_IN in batch and Columns.SEQ_LENS in batch:
+            v_preds = torch.reshape(v_preds, [len(batch[Columns.SEQ_LENS]), -1])
+        return v_preds
+
+    @override(TorchRLModule)
+    def get_initial_state(self):
+        """Converts the initial state list of ModelV2 into a dict (new API stack)."""
+        init_state_list = self._model_v2.get_initial_state()
+        return {i: s for i, s in enumerate(init_state_list)}
+
+    def _translate_dist_class(self, old_dist_class):
+        map_ = {
+            OldTorchCategorical: TorchCategorical,
+            OldTorchDiagGaussian: TorchDiagGaussian,
+            OldTorchMultiActionDistribution: TorchMultiDistribution,
+            OldTorchMultiCategorical: TorchMultiCategorical,
+            OldTorchSquashedGaussian: TorchSquashedGaussian,
+        }
+        if old_dist_class not in map_:
+            raise ValueError(
+                f"ModelV2ToRLModule does NOT support {old_dist_class} action "
+                f"distributions yet!"
+            )
+
+        return map_[old_dist_class]
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/random_rlm.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/random_rlm.py
new file mode 100644
index 0000000000000000000000000000000000000000..e35292e212cfb90731eb188770222bf388f7d45a
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/random_rlm.py
@@ -0,0 +1,71 @@
+import gymnasium as gym
+import numpy as np
+import tree  # pip install dm_tree
+
+from ray.rllib.core.columns import Columns
+from ray.rllib.core.rl_module import RLModule
+from ray.rllib.policy.sample_batch import SampleBatch
+from ray.rllib.utils.annotations import override
+from ray.rllib.utils.spaces.space_utils import batch as batch_func
+
+
+class RandomRLModule(RLModule):
+    @override(RLModule)
+    def _forward(self, batch, **kwargs):
+        # One randomly sampled action per entry of the first flattened obs leaf.
+        first_obs_leaf = tree.flatten(batch[SampleBatch.OBS])[0]
+        sampled = [self.action_space.sample() for _ in range(len(first_obs_leaf))]
+        # NOTE(review): `batch_func` presumably merges the samples into one batch.
+        return {SampleBatch.ACTIONS: batch_func(sampled)}
+
+    @override(RLModule)
+    def _forward_train(self, *args, **kwargs):
+        # This module cannot be trained: always configure RandomRLModule as
+        # non-trainable. To do so, set in your config:
+        # `config.multi_agent(policies_to_train=[list of ModuleIDs to be trained,
+        # NOT including the ModuleID of this RLModule])`
+        raise NotImplementedError("Random RLModule: Should not be trained!")
+
+    @override(RLModule)
+    def output_specs_inference(self):
+        return [SampleBatch.ACTIONS]
+
+    @override(RLModule)
+    def output_specs_exploration(self):
+        return [SampleBatch.ACTIONS]
+
+    def compile(self, *args, **kwargs):
+        """No-op that exists only for compatibility with TorchRLModule.
+
+        RolloutWorker calls this when it tries to compile a TorchRLModule."""
+        return None
+
+
+class StatefulRandomRLModule(RandomRLModule):
+    """A stateful RLModule that returns STATE_OUT from its forward methods.
+
+    - Implements the `get_initial_state` method (returning an all-zeros dummy state).
+    - Returns a dummy state under the `Columns.STATE_OUT` from its forward methods.
+    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._internal_state_space = gym.spaces.Box(-1.0, 1.0, (1,))
+
+    @override(RLModule)
+    def get_initial_state(self):
+        return {
+            "state": np.zeros_like([self._internal_state_space.sample()]),
+        }
+
+    @override(RandomRLModule)
+    def _forward(self, batch, **kwargs):
+        # Extend the parent's random-action output by one random state per action.
+        out = super()._forward(batch, **kwargs)
+        num_actions = len(out[Columns.ACTIONS])
+        states = [self._internal_state_space.sample() for _ in range(num_actions)]
+        out[Columns.STATE_OUT] = {"state": batch_func(states)}
+        return out
+
+    def _random_forward(self, batch, **kwargs):
+        return self._forward(batch, **kwargs)  # Old name, kept as an alias.
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/rock_paper_scissors_heuristic_rlm.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/rock_paper_scissors_heuristic_rlm.py
new file mode 100644
index 0000000000000000000000000000000000000000..f4b3d661f4de3e970979d4c2c42b7b7464932ecd
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/rock_paper_scissors_heuristic_rlm.py
@@ -0,0 +1,108 @@
+from collections import defaultdict
+
+import numpy as np
+
+from ray.rllib.core.columns import Columns
+from ray.rllib.core.rl_module.rl_module import RLModule
+from ray.rllib.utils.annotations import override
+
+
+class AlwaysSameHeuristicRLM(RLModule):
+    """In rock-paper-scissors, always chooses the same action within an episode.
+
+    The first move is picked at random and then repeated for all following moves.
+    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._actions_per_vector_idx = defaultdict(int)
+
+    @override(RLModule)
+    def _forward_inference(self, batch, **kwargs):
+        picks = self._actions_per_vector_idx
+        actions = []
+        # Obs = opponent's previous move (0-2); 3 means the episode just started.
+        for vector_idx, prev_move in enumerate(batch[Columns.OBS]):
+            if prev_move == 3:
+                picks[vector_idx] = np.random.choice([0, 1, 2])
+            actions.append(picks[vector_idx])
+        return {Columns.ACTIONS: np.array(actions)}
+
+    @override(RLModule)
+    def _forward_exploration(self, batch, **kwargs):
+        return self._forward_inference(batch, **kwargs)  # Same as inference.
+
+    @override(RLModule)
+    def _forward_train(self, batch, **kwargs):
+        raise NotImplementedError(
+            "AlwaysSameHeuristicRLM is not trainable! Make sure you do NOT include it "
+            "in your `config.multi_agent(policies_to_train={...})` set."
+        )
+
+    @override(RLModule)
+    def output_specs_inference(self):
+        return [Columns.ACTIONS]
+
+    @override(RLModule)
+    def output_specs_exploration(self):
+        return [Columns.ACTIONS]
+
+
+class BeatLastHeuristicRLM(RLModule):
+    """In rock-paper-scissors, always acts such that it beats prev. move of opponent.
+
+    The first move is random.
+
+    For example, after the opponent played `rock` (and this policy made a random
+    move), the next move would be `paper` (to beat `rock`).
+    """
+
+    @override(RLModule)
+    def _forward_inference(self, batch, **kwargs):
+        """Computes, per observation, the action beating the opponent's last action.
+
+        This agent's current observation is the opponent's previous action.
+
+        Actions and observations are both discrete: 3 actions (0-2) and
+        4 observations (0-2 plus 3, with 3 being the observation right after an env
+        reset, when no action has been taken yet). The encoding is:
+        0=Rock
+        1=Paper
+        2=Scissors
+        3=[after reset] (observation space only)
+        """
+        # One counter-move per observation in the batch, in batch order.
+        counter_moves = []
+        for obs in batch[Columns.OBS]:
+            counter_moves.append(self._pick_single_action(obs))
+        return {Columns.ACTIONS: np.array(counter_moves)}
+
+    @override(RLModule)
+    def _forward_exploration(self, batch, **kwargs):
+        return self._forward_inference(batch, **kwargs)
+
+    @override(RLModule)
+    def _forward_train(self, batch, **kwargs):
+        raise NotImplementedError(
+            "BeatLastHeuristicRLM is not trainable! Make sure you do NOT include it in "
+            "your `config.multi_agent(policies_to_train={...})` set."
+        )
+
+    @override(RLModule)
+    def output_specs_inference(self):
+        return [Columns.ACTIONS]
+
+    @override(RLModule)
+    def output_specs_exploration(self):
+        return [Columns.ACTIONS]
+
+    @staticmethod
+    def _pick_single_action(prev_opponent_obs):
+        # Pairs of (opponent's previous move, move that beats it), using
+        # 0=Rock, 1=Paper, 2=Scissors: paper beats rock, scissors beat paper,
+        # rock beats scissors.
+        for opponent_move, winning_move in ((0, 1), (1, 2), (2, 0)):
+            if prev_opponent_obs == opponent_move:
+                return winning_move
+        # Anything else (e.g. 3, the after-reset observation): move randomly.
+        return np.random.choice([0, 1, 2])
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/tiny_atari_cnn_rlm.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/tiny_atari_cnn_rlm.py
new file mode 100644
index 0000000000000000000000000000000000000000..eb2e4e39b56a6f3963a247cf1969f92309970269
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/tiny_atari_cnn_rlm.py
@@ -0,0 +1,194 @@
+from typing import Any, Dict, Optional
+
+from ray.rllib.core.columns import Columns
+from ray.rllib.core.learner.utils import make_target_network
+from ray.rllib.core.rl_module.apis import (
+    TargetNetworkAPI,
+    ValueFunctionAPI,
+    TARGET_NETWORK_ACTION_DIST_INPUTS,
+)
+from ray.rllib.core.rl_module.torch import TorchRLModule
+from ray.rllib.models.torch.misc import (
+    normc_initializer,
+    same_padding,
+    valid_padding,
+)
+from ray.rllib.utils.annotations import override
+from ray.rllib.utils.framework import try_import_torch
+from ray.rllib.utils.typing import TensorType
+
+torch, nn = try_import_torch()
+
+
+class TinyAtariCNN(TorchRLModule, ValueFunctionAPI, TargetNetworkAPI):
+    """A tiny CNN stack for fast-learning of Atari envs.
+
+    The architecture here is the exact same as the one used by the old API stack as
+    CNN default ModelV2.
+
+    We stack 3 CNN layers based on the config, then a 4th one with linear activation
+    and n 1x1 filters, where n is the number of actions in the (discrete) action space.
+    Simple reshaping (no flattening or extra linear layers necessary) lead to the
+    action logits, which can directly be used inside a distribution or loss.
+
+    .. testcode::
+
+        import numpy as np
+        import gymnasium as gym
+
+        my_net = TinyAtariCNN(
+            observation_space=gym.spaces.Box(-1.0, 1.0, (42, 42, 4), np.float32),
+            action_space=gym.spaces.Discrete(4),
+        )
+
+        B = 10
+        w = 42
+        h = 42
+        c = 4
+        data = torch.from_numpy(
+            np.random.random_sample(size=(B, w, h, c)).astype(np.float32)
+        )
+        print(my_net.forward_inference({"obs": data}))
+        print(my_net.forward_exploration({"obs": data}))
+        print(my_net.forward_train({"obs": data}))
+
+        num_all_params = sum(int(np.prod(p.size())) for p in my_net.parameters())
+        print(f"num params = {num_all_params}")
+    """
+
+    @override(TorchRLModule)
+    def setup(self):
+        """Use this method to create all the model components that you require.
+
+        Feel free to access the following useful properties in this class:
+        - `self.model_config`: The config dict for this RLModule class,
+        which should contain flexible settings, for example: {"hiddens": [256, 256]}.
+        - `self.observation|action_space`: The observation and action space that
+        this RLModule is subject to. Note that the observation space might not be the
+        exact space from your env, but that it might have already gone through
+        preprocessing through a connector pipeline (for example, flattening,
+        frame-stacking, mean/std-filtering, etc..).
+        """
+        # Get the CNN stack config from this module's `model_config` dict (key:
+        # "conv_filters"):
+        conv_filters = self.model_config.get("conv_filters")
+        # Default CNN stack with 3 layers:
+        if conv_filters is None:
+            conv_filters = [
+                [16, 4, 2, "same"],  # num filters, kernel wxh, stride wxh, padding type
+                [32, 4, 2, "same"],
+                [256, 11, 1, "valid"],
+            ]
+
+        # Build the CNN layers.
+        layers = []
+
+        # Add the (user-specified or default) hidden convolutional layers first.
+        width, height, in_depth = self.observation_space.shape
+        in_size = [width, height]
+        for filter_specs in conv_filters:
+            if len(filter_specs) == 4:
+                out_depth, kernel_size, strides, padding = filter_specs
+            else:
+                out_depth, kernel_size, strides = filter_specs
+                padding = "same"
+
+            # Pad like in tensorflow's SAME mode.
+            if padding == "same":
+                padding_size, out_size = same_padding(in_size, kernel_size, strides)
+                layers.append(nn.ZeroPad2d(padding_size))
+            # No actual padding is performed for "valid" mode, but we will still
+            # compute the output size (input for the next layer).
+            else:
+                out_size = valid_padding(in_size, kernel_size, strides)
+
+            layer = nn.Conv2d(in_depth, out_depth, kernel_size, strides, bias=True)
+            # Initialize CNN layer kernel and bias.
+            nn.init.xavier_uniform_(layer.weight)
+            nn.init.zeros_(layer.bias)
+            layers.append(layer)
+            # Activation.
+            layers.append(nn.ReLU())
+
+            in_size = out_size
+            in_depth = out_depth
+
+        self._base_cnn_stack = nn.Sequential(*layers)
+
+        # Add the final CNN 1x1 layer with num_filters == num_actions to be reshaped to
+        # yield the logits (no flattening, no additional linear layers required).
+        _final_conv = nn.Conv2d(in_depth, self.action_space.n, 1, 1, bias=True)
+        nn.init.xavier_uniform_(_final_conv.weight)
+        nn.init.zeros_(_final_conv.bias)
+        self._logits = nn.Sequential(
+            nn.ZeroPad2d(same_padding(in_size, 1, 1)[0]), _final_conv
+        )
+
+        self._values = nn.Linear(in_depth, 1)
+        # Mimic old API stack behavior of initializing the value function with `normc`
+        # std=0.01.
+        normc_initializer(0.01)(self._values.weight)
+
+    @override(TorchRLModule)
+    def _forward(self, batch, **kwargs):
+        # Run the CNN; only the logits are needed here, the embeddings are dropped.
+        _, logits = self._compute_embeddings_and_logits(batch)
+        # Return the logits as ACTION_DIST_INPUTS (categorical distribution).
+        return {
+            Columns.ACTION_DIST_INPUTS: logits,
+        }
+
+    @override(TorchRLModule)
+    def _forward_train(self, batch, **kwargs):
+        # Run the CNN, keeping both the embeddings and the logits.
+        embeddings, logits = self._compute_embeddings_and_logits(batch)
+        # Return the logits (ACTION_DIST_INPUTS) along with the embeddings.
+        return {
+            Columns.ACTION_DIST_INPUTS: logits,
+            Columns.EMBEDDINGS: embeddings,
+        }
+
+    # We implement this RLModule as a TargetNetworkAPI RLModule, so it can be used
+    # by the APPO algorithm.
+    @override(TargetNetworkAPI)
+    def make_target_networks(self) -> None:
+        self._target_base_cnn_stack = make_target_network(self._base_cnn_stack)
+        self._target_logits = make_target_network(self._logits)
+
+    @override(TargetNetworkAPI)
+    def get_target_network_pairs(self):
+        return [
+            (self._base_cnn_stack, self._target_base_cnn_stack),
+            (self._logits, self._target_logits),
+        ]
+
+    @override(TargetNetworkAPI)
+    def forward_target(self, batch, **kw):
+        obs = batch[Columns.OBS].permute(0, 3, 1, 2)  # Last dim moves to position 1.
+        embeddings = self._target_base_cnn_stack(obs)
+        logits = self._target_logits(embeddings)
+        return {TARGET_NETWORK_ACTION_DIST_INPUTS: torch.squeeze(logits, dim=[-1, -2])}
+
+    # We implement this RLModule as a ValueFunctionAPI RLModule, so it can be used
+    # by value-based methods like PPO or IMPALA.
+    @override(ValueFunctionAPI)
+    def compute_values(
+        self,
+        batch: Dict[str, Any],
+        embeddings: Optional[Any] = None,
+    ) -> TensorType:
+        # Embeddings not provided -> Compute them from the observations first.
+        if embeddings is None:
+            obs = batch[Columns.OBS]
+            embeddings = self._base_cnn_stack(obs.permute(0, 3, 1, 2))
+            embeddings = torch.squeeze(embeddings, dim=[-1, -2])
+        return self._values(embeddings).squeeze(-1)
+
+    def _compute_embeddings_and_logits(self, batch):
+        obs = batch[Columns.OBS].permute(0, 3, 1, 2)
+        embeddings = self._base_cnn_stack(obs)
+        logits = self._logits(embeddings)
+        return (
+            torch.squeeze(embeddings, dim=[-1, -2]),
+            torch.squeeze(logits, dim=[-1, -2]),
+        )
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/vpg_torch_rlm.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/vpg_torch_rlm.py
new file mode 100644
index 0000000000000000000000000000000000000000..676598d090dc01f3cc4c99089a2345fc15eebbc0
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/vpg_torch_rlm.py
@@ -0,0 +1,50 @@
+import torch
+
+from ray.rllib.core.columns import Columns
+from ray.rllib.core.rl_module.torch import TorchRLModule
+
+
+class VPGTorchRLModule(TorchRLModule):
+    """A simple VPG (vanilla policy gradient)-style RLModule for testing purposes.
+
+    Serves as a minimal, bare-bones template for writing a custom TorchRLModule.
+    """
+
+    def setup(self):
+        # Attributes that are already set when `setup()` runs:
+        # self.observation_space, self.action_space, self.inference_only, and
+        # self.model_config  # <- a dict with custom settings
+        obs_dim = self.observation_space.shape[0]
+        hidden_dim = self.model_config["hidden_dim"]
+        num_actions = self.action_space.n
+
+        # Policy net: linear -> ReLU -> linear, with `action_space.n` outputs.
+        layers = [
+            torch.nn.Linear(obs_dim, hidden_dim),
+            torch.nn.ReLU(),
+            torch.nn.Linear(hidden_dim, num_actions),
+        ]
+        self._policy_net = torch.nn.Sequential(*layers)
+
+    def _forward(self, batch, **kwargs):
+        # Push the observations from the batch through our policy net.
+        logits = self._policy_net(batch[Columns.OBS])
+        # These logits parameterize the (default) action distribution, which is
+        # `TorchCategorical` (due to our action space being `gym.spaces.Discrete`).
+        return {Columns.ACTION_DIST_INPUTS: logits}
+
+        # For finer control over the forward behavior in the different phases of
+        # the module's lifecycle, implement three separate forward methods
+        # instead. In that case, it is recommended to wrap the inference and
+        # exploration versions in a `with torch.no_grad()` context for better
+        # performance:
+        # def _forward_train(self, batch, **kwargs):
+        #    ...
+        #
+        # def _forward_inference(self, batch, **kwargs):
+        #    with torch.no_grad():
+        #        return self._forward_train(batch)
+        #
+        # def _forward_exploration(self, batch, **kwargs):
+        #    with torch.no_grad():
+        #        return self._forward_train(batch)
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/vpg_using_shared_encoder_rlm.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/vpg_using_shared_encoder_rlm.py
new file mode 100644
index 0000000000000000000000000000000000000000..471df1045ea3893065008c7e5772fe583d28ae4f
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/vpg_using_shared_encoder_rlm.py
@@ -0,0 +1,170 @@
+import torch
+
+from ray.rllib.core import Columns
+from ray.rllib.core.rl_module.multi_rl_module import MultiRLModule
+from ray.rllib.core.rl_module.torch.torch_rl_module import TorchRLModule
+
+
+SHARED_ENCODER_ID = "shared_encoder"
+
+
+# __sphinx_doc_policy_begin__
+class VPGPolicyAfterSharedEncoder(TorchRLModule):
+    """A VPG (vanilla pol. gradient)-style RLModule using a shared encoder.
+    # __sphinx_doc_policy_end__
+
+        The shared encoder RLModule must be held by the same MultiRLModule, under which
+        this RLModule resides. The shared encoder's forward is called before this
+        RLModule's forward and returns the embeddings under the "encoder_embeddings"
+        key.
+    # __sphinx_doc_policy_2_begin__
+    """
+
+    def setup(self):
+        super().setup()
+
+        # Layer sizes; `embedding_dim` is the incoming feature dim (shared encoder).
+        in_features = self.model_config["embedding_dim"]
+        hidden_features = self.model_config["hidden_dim"]
+        num_actions = self.action_space.n
+        layers = [
+            torch.nn.Linear(in_features, hidden_features),
+            torch.nn.ReLU(),
+            torch.nn.Linear(hidden_features, num_actions),
+        ]
+        self._pi_head = torch.nn.Sequential(*layers)
+
+    def _forward(self, batch, **kwargs):
+        # The shared encoder's output is expected under the "encoder_embeddings" key.
+        logits = self._pi_head(batch["encoder_embeddings"])
+        return {Columns.ACTION_DIST_INPUTS: logits}
+
+
+# __sphinx_doc_policy_2_end__
+
+
+# __sphinx_doc_mrlm_begin__
+class VPGMultiRLModuleWithSharedEncoder(MultiRLModule):
+    """VPG (vanilla pol. gradient)-style MultiRLModule handling a shared encoder.
+    # __sphinx_doc_mrlm_end__
+
+        This MultiRLModule needs to be configured appropriately as follows:
+
+        .. testcode::
+
+    # __sphinx_doc_how_to_run_begin__
+            import gymnasium as gym
+            from ray.rllib.algorithms.ppo import PPOConfig
+            from ray.rllib.core import MultiRLModuleSpec, RLModuleSpec
+            from ray.rllib.examples.envs.classes.multi_agent import MultiAgentCartPole
+
+            single_agent_env = gym.make("CartPole-v1")
+
+            EMBEDDING_DIM = 64  # encoder output dim
+
+            config = (
+                PPOConfig()
+                .environment(MultiAgentCartPole, env_config={"num_agents": 2})
+                .multi_agent(
+                    # Declare the two policies trained.
+                    policies={"p0", "p1"},
+                    # Agent IDs of `MultiAgentCartPole` are 0 and 1. They are mapped to
+                    # the two policies with ModuleIDs "p0" and "p1", respectively.
+                    policy_mapping_fn=lambda agent_id, episode, **kw: f"p{agent_id}"
+                )
+                .rl_module(
+                    rl_module_spec=MultiRLModuleSpec(
+                        rl_module_specs={
+                            # Shared encoder.
+                            SHARED_ENCODER_ID: RLModuleSpec(
+                                module_class=SharedEncoder,
+                                model_config={"embedding_dim": EMBEDDING_DIM},
+                                observation_space=single_agent_env.observation_space,
+                            ),
+                            # Large policy net.
+                            "p0": RLModuleSpec(
+                                module_class=VPGPolicyAfterSharedEncoder,
+                                model_config={
+                                    "embedding_dim": EMBEDDING_DIM,
+                                    "hidden_dim": 1024,
+                                },
+                            ),
+                            # Small policy net.
+                            "p1": RLModuleSpec(
+                                module_class=VPGPolicyAfterSharedEncoder,
+                                model_config={
+                                    "embedding_dim": EMBEDDING_DIM,
+                                    "hidden_dim": 64,
+                                },
+                            ),
+                        },
+                    ),
+                )
+            )
+            algo = config.build()
+            print(algo.get_module())
+    # __sphinx_doc_how_to_run_end__
+
+        Also note that in order to learn properly, a special, multi-agent Learner
+        accounting for the shared encoder must be set up. This Learner should have only
+        one optimizer (used to train all submodules: encoder and the n policy nets) in
+        order to not destabilize learning. The latter would happen if more than one
+        optimizer would try to alternatingly optimize the same shared encoder submodule.
+    # __sphinx_doc_mrlm_2_begin__
+    """
+
+    def setup(self):
+        # Call the super's setup().
+        super().setup()
+
+        # Assert, we have the shared encoder submodule.
+        assert (
+            SHARED_ENCODER_ID in self._rl_modules
+            and isinstance(self._rl_modules[SHARED_ENCODER_ID], SharedEncoder)
+            and len(self._rl_modules) > 1
+        )
+        # Assign the encoder to a convenience attribute.
+        self.encoder = self._rl_modules[SHARED_ENCODER_ID]
+
+    def _forward(self, batch, **kwargs):
+        # Collect our policies' outputs in this dict.
+        outputs = {}
+        # Loop through the policy nets (through the given batch's keys).
+        for policy_id, policy_batch in batch.items():
+            rl_module = self._rl_modules[policy_id]
+
+            # Pass policy's observations through the shared encoder. The encoder returns
+            # a dict, so unpack the embeddings tensor before handing it to the policy.
+            encoder_out = self.encoder._forward(policy_batch)
+            policy_batch["encoder_embeddings"] = encoder_out["encoder_embeddings"]
+
+            # Pass the policy's embeddings through the policy net.
+            outputs[policy_id] = rl_module._forward(policy_batch, **kwargs)
+
+        return outputs
+
+
+# __sphinx_doc_mrlm_2_end__
+
+
+# __sphinx_doc_encoder_begin__
+class SharedEncoder(TorchRLModule):
+    """Encoder RLModule meant to be shared via `VPGMultiRLModuleWithSharedEncoder`."""
+
+    def setup(self):
+        super().setup()
+
+        obs_dim = self.observation_space.shape[0]
+        out_dim = self.model_config["embedding_dim"]
+
+        # A single linear layer (no activation) serves as the encoder network.
+        linear = torch.nn.Linear(obs_dim, out_dim)
+        self._net = torch.nn.Sequential(linear)
+
+    def _forward(self, batch, **kwargs):
+        # Encode the observations; the result is keyed by "encoder_embeddings".
+        embeddings = self._net(batch[Columns.OBS])
+        return {"encoder_embeddings": embeddings}
+
+
+# __sphinx_doc_encoder_end__
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/custom_cnn_rl_module.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/custom_cnn_rl_module.py
new file mode 100644
index 0000000000000000000000000000000000000000..4001f3e21d6b8b09793f19355f0e7ae177634112
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/custom_cnn_rl_module.py
@@ -0,0 +1,120 @@
+"""Example of implementing and configuring a custom (torch) CNN containing RLModule.
+
+This example:
+    - demonstrates how you can subclass the TorchRLModule base class and set up your
+    own CNN-stack architecture by overriding the `setup()` method.
+    - shows how to override the 3 forward methods: `_forward_inference()`,
+    `_forward_exploration()`, and `_forward_train()` to implement your own custom forward
+    logic(s). You will also learn, when each of these 3 methods is called by RLlib or
+    the users of your RLModule.
+    - shows how you then configure an RLlib Algorithm such that it uses your custom
+    RLModule (instead of a default RLModule).
+
+We implement a tiny CNN stack here, the exact same one that is used by the old API
+stack as default CNN net. It comprises 4 convolutional layers, the last of which
+ends in a 1x1 filter size and the number of filters exactly matches the number of
+discrete actions (logits). This way, the (non-activated) output of the last layer only
+needs to be reshaped in order to receive the policy's logit outputs. No flattening
+or additional dense layer required.
+
+The network is then used in a fast ALE/Pong-v5 experiment.
+
+
+How to run this script
+----------------------
+`python [script file name].py --enable-new-api-stack`
+
+For debugging, use the following additional command line options
+`--no-tune --num-env-runners=0`
+which should allow you to set breakpoints anywhere in the RLlib code and
+have the execution stop there for inspection and debugging.
+
+For logging to your WandB account, use:
+`--wandb-key=[your WandB API key] --wandb-project=[some project name]
+--wandb-run-name=[optional: WandB run name (within the defined project)]`
+
+
+Results to expect
+-----------------
+You should see the following output (during the experiment) in your console:
+
+Number of trials: 1/1 (1 RUNNING)
++---------------------+----------+----------------+--------+------------------+
+| Trial name          | status   | loc            |   iter |   total time (s) |
+|                     |          |                |        |                  |
+|---------------------+----------+----------------+--------+------------------+
+| PPO_env_82b44_00000 | RUNNING  | 127.0.0.1:9718 |      1 |          98.3585 |
++---------------------+----------+----------------+--------+------------------+
++------------------------+------------------------+------------------------+
+|   num_env_steps_sample |   num_env_steps_traine |   num_episodes_lifetim |
+|             d_lifetime |             d_lifetime |                      e |
+|------------------------+------------------------+------------------------|
+|                   4000 |                   4000 |                      4 |
++------------------------+------------------------+------------------------+
+"""
+import gymnasium as gym
+
+from ray.rllib.core.rl_module.rl_module import RLModuleSpec
+from ray.rllib.env.wrappers.atari_wrappers import wrap_atari_for_new_api_stack
+from ray.rllib.examples.rl_modules.classes.tiny_atari_cnn_rlm import TinyAtariCNN
+from ray.rllib.utils.test_utils import (
+    add_rllib_example_script_args,
+    run_rllib_example_script_experiment,
+)
+from ray.tune.registry import get_trainable_cls, register_env
+
+parser = add_rllib_example_script_args(
+    default_iters=100,
+    default_timesteps=600000,
+)
+parser.set_defaults(enable_new_api_stack=True, env="ale_py:ALE/Pong-v5")
+
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+
+    # This example only supports the new API stack.
+    assert (
+        args.enable_new_api_stack
+    ), "Must set --enable-new-api-stack when running this script!"
+
+    # Need images to be "tiny" (`dim=42`) for our custom model.
+    register_env(
+        "env",
+        lambda cfg: wrap_atari_for_new_api_stack(
+            gym.make(args.env, **cfg), dim=42, framestack=4
+        ),
+    )
+
+    # Env settings; the env creator above forwards its `cfg` to `gym.make()`.
+    atari_env_config = {
+        "frameskip": 1,
+        "full_action_space": False,
+        "repeat_action_probability": 0.0,
+    }
+
+    # The `model_config` defined here will be available inside your custom RLModule
+    # class through the `self.model_config` property. Feel free to change it.
+    tiny_cnn_model_config = {
+        "conv_filters": [
+            # num filters, kernel wxh, stride wxh, padding type
+            [16, 4, 2, "same"],
+            [32, 4, 2, "same"],
+            [256, 11, 1, "valid"],
+        ],
+    }
+
+    base_config = (
+        get_trainable_cls(args.algo)
+        .get_default_config()
+        .environment(env="env", env_config=atari_env_config)
+        .rl_module(
+            # Plug-in our custom RLModule class.
+            rl_module_spec=RLModuleSpec(
+                module_class=TinyAtariCNN,
+                model_config=tiny_cnn_model_config,
+            ),
+        )
+    )
+
+    run_rllib_example_script_experiment(base_config, args)
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/custom_lstm_rl_module.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/custom_lstm_rl_module.py
new file mode 100644
index 0000000000000000000000000000000000000000..85b160808bd7e6405469a75389377bff60c9b7cb
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/custom_lstm_rl_module.py
@@ -0,0 +1,102 @@
+"""Example of implementing and configuring a custom (torch) LSTM containing RLModule.
+
+This example:
+    - demonstrates how you can subclass the TorchRLModule base class and set up your
+    own LSTM-containing NN architecture by overriding the `setup()` method.
+    - shows how to override the 3 forward methods: `_forward_inference()`,
+    `_forward_exploration()`, and `_forward_train()` to implement your own custom forward
+    logic(s), including how to handle STATE in- and outputs to and from these calls.
+    - explains when each of these 3 methods is called by RLlib or the users of your
+    RLModule.
+    - shows how you then configure an RLlib Algorithm such that it uses your custom
+    RLModule (instead of a default RLModule).
+
+We implement a simple LSTM layer here, followed by a series of Linear layers.
+After the last Linear layer, we add a fork of 2 Linear (non-activated) layers, one for
+the action logits and one for the value function output.
+
+We test the LSTM containing RLModule on the StatelessCartPole environment, a variant
+of CartPole that is non-Markovian (partially observable). Only an RNN-network can learn
+a decent policy in this environment due to the lack of any velocity information. By
+looking at one observation, one cannot know whether the cart is currently moving left or
+right and whether the pole is currently moving up or down.
+
+
+How to run this script
+----------------------
+`python [script file name].py --enable-new-api-stack`
+
+For debugging, use the following additional command line options
+`--no-tune --num-env-runners=0`
+which should allow you to set breakpoints anywhere in the RLlib code and
+have the execution stop there for inspection and debugging.
+
+For logging to your WandB account, use:
+`--wandb-key=[your WandB API key] --wandb-project=[some project name]
+--wandb-run-name=[optional: WandB run name (within the defined project)]`
+
+
+Results to expect
+-----------------
+You should see the following output (during the experiment) in your console:
+
+"""
+from ray.rllib.core.rl_module.rl_module import RLModuleSpec
+from ray.rllib.examples.envs.classes.stateless_cartpole import StatelessCartPole
+from ray.rllib.examples.envs.classes.multi_agent import MultiAgentStatelessCartPole
+from ray.rllib.examples.rl_modules.classes.lstm_containing_rlm import (
+    LSTMContainingRLModule,
+)
+from ray.rllib.utils.test_utils import (
+    add_rllib_example_script_args,
+    run_rllib_example_script_experiment,
+)
+from ray.tune.registry import get_trainable_cls, register_env
+
+parser = add_rllib_example_script_args(default_reward=300.0, default_timesteps=2000000)
+# Run on the new API stack by default.
+parser.set_defaults(
+    enable_new_api_stack=True,
+)
+
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+
+    # The single-agent env ignores the env config; the multi-agent env receives it.
+    env_creator = (
+        (lambda cfg: StatelessCartPole())
+        if args.num_agents == 0
+        else (lambda cfg: MultiAgentStatelessCartPole(cfg))
+    )
+    register_env("env", env_creator)
+
+    # The `model_config` defined here will be available inside your custom RLModule
+    # class through the `self.model_config` property. Feel free to change it.
+    lstm_model_config = {
+        "lstm_cell_size": 256,
+        "dense_layers": [256, 256],
+        "max_seq_len": 20,
+    }
+
+    base_config = (
+        get_trainable_cls(args.algo)
+        .get_default_config()
+        .environment(env="env", env_config={"num_agents": args.num_agents})
+        .training(
+            train_batch_size_per_learner=1024,
+            num_epochs=6,
+            lr=0.0009,
+            vf_loss_coeff=0.001,
+            entropy_coeff=0.0,
+        )
+        .rl_module(
+            # Plug-in our custom RLModule class.
+            rl_module_spec=RLModuleSpec(
+                module_class=LSTMContainingRLModule,
+                model_config=lstm_model_config,
+            ),
+        )
+    )
+
+    run_rllib_example_script_experiment(base_config, args)
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/migrate_modelv2_to_new_api_stack_by_config.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/migrate_modelv2_to_new_api_stack_by_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..21b68184051f7be95d5ea06ca80ec1032cbd998c
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/migrate_modelv2_to_new_api_stack_by_config.py
@@ -0,0 +1,77 @@
+from ray.rllib.algorithms.ppo import PPOConfig
+from ray.rllib.core import DEFAULT_POLICY_ID
+from ray.rllib.core.rl_module.rl_module import RLModuleSpec
+from ray.rllib.examples.rl_modules.classes.modelv2_to_rlm import ModelV2ToRLModule
+from ray.rllib.utils.metrics import (
+    ENV_RUNNER_RESULTS,
+    EPISODE_RETURN_MEAN,
+)
+
+
+if __name__ == "__main__":
+    # Configure an old stack default ModelV2.
+    config_old_stack = (
+        PPOConfig()
+        .api_stack(
+            enable_env_runner_and_connector_v2=False,
+            enable_rl_module_and_learner=False,
+        )
+        .environment("CartPole-v1")
+        .training(
+            lr=0.0003,
+            num_epochs=6,
+            vf_loss_coeff=0.01,
+            # Change the ModelV2 settings a bit.
+            model={
+                "fcnet_hiddens": [32],
+                "fcnet_activation": "linear",
+                "use_lstm": True,
+                "vf_share_layers": True,
+            },
+        )
+    )
+
+    # Training with the (configured and wrapped) ModelV2.
+
+    # We change the original (old API stack) `config` into a new API stack one:
+    config_new_stack = (
+        config_old_stack.copy(copy_frozen=False)
+        .api_stack(
+            enable_rl_module_and_learner=True,
+            enable_env_runner_and_connector_v2=True,
+        )
+        .rl_module(
+            rl_module_spec=RLModuleSpec(
+                module_class=ModelV2ToRLModule,
+                model_config={
+                    "policy_id": DEFAULT_POLICY_ID,
+                    "old_api_stack_algo_config": config_old_stack,
+                    "max_seq_len": 20,
+                },
+            ),
+        )
+    )
+
+    # Build the new stack algo.
+    algo_new_stack = config_new_stack.build()
+
+    # Train until a higher return (give up after 100 iterations).
+    min_return_new_stack = 350.0
+    results = None
+    passed = False
+    for i in range(100):
+        results = algo_new_stack.train()
+        print(results)
+        if results[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN] >= min_return_new_stack:
+            print(
+                f"Reached episode return of {min_return_new_stack} -> stopping "
+                "new API stack training."
+            )
+            passed = True
+            break
+
+    if not passed:
+        raise ValueError(
+            "Continuing training on the new stack did not succeed! Last return: "
+            f"{results[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN]}"
+        )
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/migrate_modelv2_to_new_api_stack_by_policy_checkpoint.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/migrate_modelv2_to_new_api_stack_by_policy_checkpoint.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac6ef471cb9552e6e1307c60c73db65060f759af
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/migrate_modelv2_to_new_api_stack_by_policy_checkpoint.py
@@ -0,0 +1,118 @@
+import pathlib
+
+import gymnasium as gym
+import numpy as np
+import torch
+
+from ray.rllib.algorithms.ppo import PPOConfig
+from ray.rllib.core.rl_module.rl_module import RLModuleSpec
+from ray.rllib.examples.rl_modules.classes.modelv2_to_rlm import ModelV2ToRLModule
+from ray.rllib.utils.metrics import (
+    ENV_RUNNER_RESULTS,
+    EPISODE_RETURN_MEAN,
+)
+from ray.rllib.utils.spaces.space_utils import batch
+
+
+if __name__ == "__main__":
+    # Configure and train an old stack default ModelV2.
+    config = (
+        PPOConfig()
+        # Old API stack.
+        .api_stack(
+            enable_env_runner_and_connector_v2=False,
+            enable_rl_module_and_learner=False,
+        )
+        .environment("CartPole-v1")
+        .training(
+            lr=0.0003,
+            num_epochs=6,
+            vf_loss_coeff=0.01,
+        )
+    )
+    algo_old_stack = config.build()
+
+    min_return_old_stack = 100.0
+    while True:
+        results = algo_old_stack.train()
+        print(results)
+        if results[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN] >= min_return_old_stack:
+            print(
+                f"Reached episode return of {min_return_old_stack} -> stopping "
+                "old API stack training."
+            )
+            break
+
+    checkpoint = algo_old_stack.save()
+    policy_path = (
+        pathlib.Path(checkpoint.checkpoint.path) / "policies" / "default_policy"
+    )
+    assert policy_path.is_dir()
+    algo_old_stack.stop()
+
+    print("done")
+
+    # Move the old API stack (trained) ModelV2 into the new API stack's RLModule.
+    # Run a simple CartPole inference experiment.
+    env = gym.make("CartPole-v1", render_mode="human")
+    rl_module = ModelV2ToRLModule(
+        observation_space=env.observation_space,
+        action_space=env.action_space,
+        model_config={"policy_checkpoint_dir": policy_path},
+    )
+
+    obs, _ = env.reset()
+    env.render()
+    done = False
+    episode_return = 0.0
+    while not done:
+        output = rl_module.forward_inference({"obs": torch.from_numpy(batch([obs]))})
+        action_logits = output["action_dist_inputs"].detach().numpy()[0]
+        action = np.argmax(action_logits)
+        obs, reward, terminated, truncated, _ = env.step(action)
+        done = terminated or truncated
+        episode_return += reward
+        env.render()
+    env.close()  # Clean up the env (e.g. its render window) before training on.
+
+    print(f"Ran episode with trained ModelV2: return={episode_return}")
+
+    # Continue training with the (checkpointed) ModelV2. For this, we change the
+    # original (old API stack) `config` into a new API stack one:
+    config = config.api_stack(
+        enable_rl_module_and_learner=True,
+        enable_env_runner_and_connector_v2=True,
+    ).rl_module(
+        rl_module_spec=RLModuleSpec(
+            module_class=ModelV2ToRLModule,
+            model_config={"policy_checkpoint_dir": policy_path},
+        ),
+    )
+
+    # Build the new stack algo.
+    algo_new_stack = config.build()
+
+    # Train until a higher return.
+    min_return_new_stack = 450.0
+    passed = False
+    for i in range(50):
+        results = algo_new_stack.train()
+        print(results)
+        # Make sure that the model's weights from the old API stack training
+        # were properly transferred to the new API RLModule wrapper. Thus, even
+        # after only one iteration of new stack training, we already expect the
+        # return to be higher than it was at the end of the old stack training.
+        assert results[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN] >= min_return_old_stack
+        if results[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN] >= min_return_new_stack:
+            print(
+                f"Reached episode return of {min_return_new_stack} -> stopping "
+                "new API stack training."
+            )
+            passed = True
+            break
+
+    if not passed:
+        raise ValueError(
+            "Continuing training on the new stack did not succeed! Last return: "
+            f"{results[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN]}"
+        )
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/pretraining_single_agent_training_multi_agent.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/pretraining_single_agent_training_multi_agent.py
new file mode 100644
index 0000000000000000000000000000000000000000..bbb35184d7ddc781be5fe56d231b6b00614ce5c4
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/pretraining_single_agent_training_multi_agent.py
@@ -0,0 +1,194 @@
+"""Example of running a single-agent pre-training followed by a multi-agent training.
+
+This example uses `num_agents` agents, each with its own `RLModule` that defines its
+policy. The first agent is pre-trained using a single-agent PPO algorithm. All agents
+are trained together in the main training run using a multi-agent PPO algorithm where
+the pre-trained module is used for the first agent.
+
+The environment is MultiAgentCartPole, in which each of the n agents has its own policy.
+
+How to run this script
+----------------------
+`python [script file name].py --enable-new-api-stack --num-agents=2`
+
+For debugging, use the following additional command line options
+`--no-tune --num-env-runners=0`
+which should allow you to set breakpoints anywhere in the RLlib code and
+have the execution stop there for inspection and debugging.
+
+For logging to your WandB account, use:
+`--wandb-key=[your WandB API key] --wandb-project=[some project name]
+--wandb-run-name=[optional: WandB run name (within the defined project)]`
+
+
+Results to expect
+-----------------
+In the console output, you can see that the single-agent policy is first trained until
+the specified `--stop-reward-pretraining` value. For example, with the command line:
+`--enable-new-api-stack --num-agents=2 --stop-reward-pretraining=250.0
+--stop-reward=250.0 --stop-iters=3 --as-test`, you should get something like:
++-----------------------+------------+------+----------------+---------------------+
+| Trial name            | status     | iter | total time (s) | episode_return_mean |
+|                       |            |      |                |                     |
+|-----------------------+------------+------+----------------+---------------------+
+| PPO_CartPole-v1_00000 | TERMINATED |   16 |        25.6009 |               256.2 |
++-----------------------+------------+------+----------------+---------------------+
+
+Then, in the second experiment, where we run in a multi-agent setup with two policies
+("p0" from the single-agent checkpoint and "p1" randomly initialized), you can see that
+only "p0" immediately (after 1 iteration) reaches the same episode return as at the end
+of pretraining:
++----------------------------+------------+--------+------------------+------+
+| Trial name                 | status     |   iter |   total time (s) |   ts |
+|----------------------------+------------+--------+------------------+------+
+| PPO_multi-cart_6274d_00000 | TERMINATED |      1 |          2.71681 | 4000 |
++----------------------------+------------+--------+------------------+------+
++-------------------+-------------+-------------+
+|   combined return |   return p0 |   return p1 |
+|-------------------+-------------|-------------+
+|           451.625 |     433.125 |        18.5 |
++-------------------+-------------+-------------+
+"""
+from pathlib import Path
+
+import gymnasium as gym
+
+from ray.rllib.algorithms.ppo import PPOConfig
+from ray.rllib.algorithms.ppo.ppo_catalog import PPOCatalog
+from ray.rllib.algorithms.ppo.torch.ppo_torch_rl_module import PPOTorchRLModule
+from ray.rllib.core import (
+    COMPONENT_LEARNER,
+    COMPONENT_LEARNER_GROUP,
+    COMPONENT_RL_MODULE,
+    DEFAULT_MODULE_ID,
+)
+from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig
+from ray.rllib.core.rl_module.multi_rl_module import MultiRLModuleSpec
+from ray.rllib.core.rl_module.rl_module import RLModuleSpec
+from ray.rllib.examples.envs.classes.multi_agent import MultiAgentCartPole
+from ray.rllib.utils.test_utils import (
+    add_rllib_example_script_args,
+    run_rllib_example_script_experiment,
+)
+from ray.tune import register_env
+
+# Read in common example script command line arguments.
+parser = add_rllib_example_script_args(
+    # Use less training steps for the main training run.
+    default_timesteps=50000,
+    default_reward=200.0,
+    default_iters=20,
+)
+parser.set_defaults(
+    checkpoint_freq=1,
+    checkpoint_at_end=True,
+)
+parser.add_argument(
+    "--stop-reward-pretraining",
+    type=float,
+    default=250.0,
+    help="The min. episode return to reach during pre-training.",
+)
+
+
+if __name__ == "__main__":
+
+    # Parse the command line arguments.
+    args = parser.parse_args()
+
+    # Ensure that the user has set the number of agents.
+    if args.num_agents == 0:
+        raise ValueError(
+            "This pre-training example script requires at least 1 agent. "
+            "Try setting the command line argument `--num-agents` to the "
+            "number of agents you want to use."
+        )
+
+    # Store the user's stopping criteria for the later training run.
+    stop_iters = args.stop_iters
+    stop_timesteps = args.stop_timesteps
+    stop_reward = args.stop_reward
+    num_agents = args.num_agents
+    as_test = args.as_test
+
+    # Override these criteria for the pre-training run.
+    args.stop_iters = 10000
+    args.stop_timesteps = 100000000
+    args.stop_reward = args.stop_reward_pretraining
+    args.num_agents = 0
+    args.as_test = False
+
+    # Define our pre-training single-agent algorithm. We will use the same module
+    # configuration for the pre-training and the training.
+    config = (
+        PPOConfig()
+        .environment("CartPole-v1")
+        .rl_module(
+            # Use a different number of hidden units for the pre-trained module.
+            model_config=DefaultModelConfig(fcnet_hiddens=[64]),
+        )
+    )
+
+    # Run the pre-training.
+    results = run_rllib_example_script_experiment(config, args, keep_ray_up=True)
+    # Get the checkpoint path.
+    module_chkpt_path = (
+        Path(results.get_best_result().checkpoint.path)
+        / COMPONENT_LEARNER_GROUP
+        / COMPONENT_LEARNER
+        / COMPONENT_RL_MODULE
+        / DEFAULT_MODULE_ID
+    )
+    assert module_chkpt_path.is_dir()
+
+    # Restore the user's stopping criteria for the training run.
+    args.stop_iters = stop_iters
+    args.stop_timesteps = stop_timesteps
+    args.stop_reward = stop_reward
+    args.num_agents = num_agents
+    args.as_test = as_test
+
+    # Create a new MultiRLModule using the pre-trained module for policy 0.
+    env = gym.make("CartPole-v1")
+    module_specs = {}
+    module_class = PPOTorchRLModule
+    for i in range(args.num_agents):
+        module_specs[f"p{i}"] = RLModuleSpec(
+            module_class=module_class,
+            observation_space=env.observation_space,
+            action_space=env.action_space,
+            model_config=DefaultModelConfig(fcnet_hiddens=[32]),
+            catalog_class=PPOCatalog,
+        )
+
+    # Swap in the pre-trained module for policy 0.
+    module_specs["p0"] = RLModuleSpec(
+        module_class=module_class,
+        observation_space=env.observation_space,
+        action_space=env.action_space,
+        model_config=DefaultModelConfig(fcnet_hiddens=[64]),
+        catalog_class=PPOCatalog,
+        # Note, we load here the module directly from the checkpoint.
+        load_state_path=module_chkpt_path,
+    )
+    multi_rl_module_spec = MultiRLModuleSpec(rl_module_specs=module_specs)
+
+    # Register our multi-agent environment with tune.
+    register_env(
+        "multi-cart",
+        lambda _: MultiAgentCartPole(config={"num_agents": args.num_agents}),
+    )
+
+    # Configure the main (multi-agent) training run.
+    config = (
+        PPOConfig()
+        .environment("multi-cart")
+        .multi_agent(
+            policies={f"p{i}" for i in range(args.num_agents)},
+            policy_mapping_fn=lambda aid, eps, **kw: f"p{aid}",
+        )
+        .rl_module(rl_module_spec=multi_rl_module_spec)
+    )
+
+    # Run the main training run.
+    run_rllib_example_script_experiment(config, args)