diff --git a/.gitattributes b/.gitattributes
index 7c997f5173693385cf8f34df08526d910db5986d..ec645e70edaa1c0663dd5b1921f3bf52b115fb6d 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -176,3 +176,4 @@ tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/_
 .venv/lib/python3.11/site-packages/ray/_private/__pycache__/worker.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
 .venv/lib/python3.11/site-packages/ray/_private/__pycache__/test_utils.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
 .venv/lib/python3.11/site-packages/ray/_private/thirdparty/pynvml/__pycache__/pynvml.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
+.venv/lib/python3.11/site-packages/ray/rllib/algorithms/__pycache__/algorithm.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/__pycache__/algorithm.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/__pycache__/algorithm.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1b04dfbb8a08c1dffed33929fe7c79a26bfc9eec
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/__pycache__/algorithm.cpython-311.pyc
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:95af77c4d005c6313bfda251ea72e8f78077efdc496424adc38b2ccc523d6af7
+size 171335
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/curriculum/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/curriculum/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/curriculum/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/curriculum/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1887da130aab67c6a2c573bc84e37897b68da1ea
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/curriculum/__pycache__/__init__.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/curriculum/__pycache__/curriculum_learning.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/curriculum/__pycache__/curriculum_learning.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ef110b064bd31d1a83ab2780125e572575af9a87
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/curriculum/__pycache__/curriculum_learning.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/curriculum/curriculum_learning.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/curriculum/curriculum_learning.py
new file mode 100644
index 0000000000000000000000000000000000000000..d08cc35c224ff85b5a19d854942465ae127a6809
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/curriculum/curriculum_learning.py
@@ -0,0 +1,241 @@
+"""Example of using an env-task curriculum by implementing a custom callback.
+
+This example:
+    - demonstrates how to define your own curriculum-capable environments using
+    gymnasium's FrozenLake env.
+    - defines a custom callback that gets called once per iteration and - if necessary -
+    changes the maps used by FrozenLake on all EnvRunners to a new task (by moving the
+    goal position further and further away from the starting position).
+    - also demonstrates an alternative approach via reloading/recreating an entirely new
+    env inside all EnvRunners.
+    - uses Tune and RLlib to curriculum-learn the env described above and compares 2
+    algorithms, one that does use curriculum learning vs one that does not.
+
+We use a FrozenLake (sparse reward) environment with a map size of 8x8 and a time step
+limit of 16 to make it almost impossible for a non-curriculum policy to learn.
+
+
+How to run this script
+----------------------
+`python [script file name].py --enable-new-api-stack`
+
+Use the `--no-curriculum` flag to disable curriculum learning and force your policy
+to be trained on the hardest task right away. With this option, the algorithm should NOT
+succeed.
+
+For debugging, use the following additional command line options
+`--no-tune --num-env-runners=0`
+which should allow you to set breakpoints anywhere in the RLlib code and
+have the execution stop there for inspection and debugging.
+
+For logging to your WandB account, use:
+`--wandb-key=[your WandB API key] --wandb-project=[some project name]
+--wandb-run-name=[optional: WandB run name (within the defined project)]`
+
+
+Results to expect
+-----------------
+In the console output, you can see that only the PPO policy that uses a curriculum
+can actually learn, whereas the one that is thrown into the toughest task right from
+the start never learns anything.
+
+Policy using the curriculum:
++-------------------------------+------------+-----------------+--------+
+| Trial name                    | status     | loc             |   iter |
+|-------------------------------+------------+-----------------+--------+
+| PPO_FrozenLake-v1_93ca4_00000 | TERMINATED | 127.0.0.1:73318 |     41 |
++-------------------------------+------------+-----------------+--------+
++------------------+--------+----------+--------------------+
+|   total time (s) |     ts |   reward |   episode_len_mean |
+|------------------+--------+----------+--------------------|
+|           97.652 | 164000 |        1 |            14.0348 |
++------------------+--------+----------+--------------------+
+
+Policy NOT using the curriculum (trying to solve the hardest task right away):
+[DOES NOT LEARN AT ALL]
+"""
+from functools import partial
+
+from ray.air.constants import TRAINING_ITERATION
+from ray.rllib.algorithms.algorithm import Algorithm
+from ray.rllib.callbacks.callbacks import RLlibCallback
+from ray.rllib.connectors.env_to_module import FlattenObservations
+from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig
+from ray.rllib.utils.metrics import (
+    ENV_RUNNER_RESULTS,
+    EPISODE_RETURN_MEAN,
+    NUM_ENV_STEPS_SAMPLED_LIFETIME,
+)
+from ray.rllib.utils.test_utils import (
+    add_rllib_example_script_args,
+    run_rllib_example_script_experiment,
+)
+from ray.tune.registry import get_trainable_cls
+
+parser = add_rllib_example_script_args(default_iters=100, default_timesteps=600000)
+parser.set_defaults(enable_new_api_stack=True)  # default this flag to True here
+parser.add_argument(
+    "--upgrade-task-threshold",  # compared (via `>`) to the mean episode return
+    type=float,
+    default=0.99,
+    help="The mean episode return, upon reaching of which we increase the task by one.",
+)
+parser.add_argument(
+    "--no-curriculum",  # when set, the initial env map is the hardest one (task 2)
+    action="store_true",
+    help="Whether to NOT use curriculum learning (and instead trying to solve the "
+    "hardest task right away).",
+)
+
+# __curriculum_learning_example_env_options__
+ENV_OPTIONS = {
+    "is_slippery": False,  # NOTE(review): presumably deterministic moves - confirm
+    # Limit the number of steps the agent is allowed to make in the env to
+    # make it almost impossible to learn without the curriculum.
+    "max_episode_steps": 16,
+}
+
+# Our 3 tasks (8x8 maps, start "S" at row 0, column 0): 0=easiest, 1=medium, 2=hard
+ENV_MAPS = [
+    # 0: goal "G" at row 2, column 2 (0-based).
+    [
+        "SFFHFFFH",
+        "FFFHFFFF",
+        "FFGFFFFF",
+        "FFFFFFFF",
+        "HFFFFFFF",
+        "HHFFFFHF",
+        "FFFFFHHF",
+        "FHFFFFFF",
+    ],
+    # 1: goal "G" at row 5, column 4 (0-based).
+    [
+        "SFFHFFFH",
+        "FFFHFFFF",
+        "FFFFFFFF",
+        "FFFFFFFF",
+        "HFFFFFFF",
+        "HHFFGFHF",
+        "FFFFFHHF",
+        "FHFFFFFF",
+    ],
+    # 2: goal "G" at row 7, column 7 (0-based), i.e. the corner opposite the start.
+    [
+        "SFFHFFFH",
+        "FFFHFFFF",
+        "FFFFFFFF",
+        "FFFFFFFF",
+        "HFFFFFFF",
+        "HHFFFFHF",
+        "FFFFFHHF",
+        "FHFFFFFG",
+    ],
+]
+# __END_curriculum_learning_example_env_options__
+
+
+# Simple function sent to an EnvRunner to change the map of all its gym.Envs from
+# the current one to a new (tougher) one, in which the goal position is further away
+# from the starting position. Note that a map is a list of strings, each one
+# representing one row in the map. Each character in the strings represents a single
+# field (S=starting position, H=hole (bad), F=frozen/free field (ok), G=goal (great!)).
+def _remote_fn(env_runner, new_task: int):
+    """Recreate the env object of `env_runner` with the map of task `new_task`."""
+    new_env_config = {"desc": ENV_MAPS[new_task]}
+    env_runner.config.environment(env_config=new_env_config)
+    env_runner.make_env()
+
+
+class EnvTaskCallback(RLlibCallback):
+    """Custom callback implementing `on_train_result()` for changing the envs' maps."""
+
+    def on_train_result(
+        self,
+        *,
+        algorithm: Algorithm,
+        metrics_logger=None,
+        result: dict,
+        **kwargs,
+    ) -> None:
+        # Hack: Store the current task inside a counter in our Algorithm.
+        # W/o a curriculum, the task is always 2 (hardest).
+        if args.no_curriculum:
+            algorithm._counters["current_env_task"] = 2
+        current_task = algorithm._counters["current_env_task"]
+
+        # If the mean episode return exceeds `args.upgrade_task_threshold`, we switch
+        # to a more difficult task (if possible). If we already mastered the most
+        # difficult task, we publish our victory in the result dict.
+        result["task_solved"] = 0.0
+        current_return = result[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN]
+        if current_return > args.upgrade_task_threshold:
+            if current_task < 2:
+                new_task = current_task + 1
+                print(
+                    f"Switching task/map on all EnvRunners to #{new_task} (0=easiest, "
+                    f"2=hardest), b/c R={current_return} on current task."
+                )
+                algorithm.env_runner_group.foreach_env_runner(
+                    func=partial(_remote_fn, new_task=new_task)
+                )
+                algorithm._counters["current_env_task"] = new_task
+
+            # Hardest task was solved (1.0) -> report this in the results dict.
+            elif current_return == 1.0:
+                result["task_solved"] = 1.0
+        # Emergency brake (curriculum runs only; `--no-curriculum` must stay on the
+        # hardest map): If return is 0.0 at a harder task (1 or 2), go back to task=0.
+        elif not args.no_curriculum and current_return == 0.0 and current_task > 0:
+            print(
+                "Emergency brake: Our policy seemed to have collapsed -> Setting task "
+                "back to 0."
+            )
+            algorithm.env_runner_group.foreach_env_runner(
+                func=partial(_remote_fn, new_task=0)
+            )
+            algorithm._counters["current_env_task"] = 0
+
+
+if __name__ == "__main__":
+    args = parser.parse_args()  # module-level global, also read by EnvTaskCallback
+
+    base_config = (
+        get_trainable_cls(args.algo)
+        .get_default_config()
+        # Plug in our curriculum callback, which controls when we should upgrade the
+        # env task based on the received return for the current task.
+        .callbacks(EnvTaskCallback)
+        .environment(
+            "FrozenLake-v1",
+            env_config={
+                # w/ curriculum: start with task=0 (easiest)
+                # w/o curriculum: start directly with hardest task 2.
+                "desc": ENV_MAPS[2 if args.no_curriculum else 0],
+                **ENV_OPTIONS,
+            },
+        )
+        .env_runners(
+            num_envs_per_env_runner=5,
+            env_to_module_connector=lambda env: FlattenObservations(),
+        )
+        .training(
+            num_epochs=6,
+            vf_loss_coeff=0.01,
+            lr=0.0002,
+        )
+        .rl_module(model_config=DefaultModelConfig(vf_share_layers=True))
+    )
+
+    stop = {
+        TRAINING_ITERATION: args.stop_iters,
+        # Reward directly does not matter to us as we would like to continue
+        # after the policy reaches a return of ~1.0 on the 0-task (easiest).
+        # But we DO want to stop, once the entire task is learned (policy achieves
+        # return of 1.0 on the most difficult task=2).
+        "task_solved": 1.0,
+        NUM_ENV_STEPS_SAMPLED_LIFETIME: args.stop_timesteps,
+    }
+
+    run_rllib_example_script_experiment(
+        base_config, args, stop=stop, success_metric={"task_solved": 1.0}
+    )
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f81be9aed26d68a88668b19b5942ee3cb1886ef5
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/__pycache__/__init__.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/__pycache__/agents_act_in_sequence.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/__pycache__/agents_act_in_sequence.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..42098212a26579db6b3e361158ddf1cdba56208d
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/__pycache__/agents_act_in_sequence.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/__pycache__/agents_act_simultaneously.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/__pycache__/agents_act_simultaneously.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..72481e2c700643d2648b0fc2597f68684b05eeca
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/__pycache__/agents_act_simultaneously.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/__pycache__/async_gym_env_vectorization.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/__pycache__/async_gym_env_vectorization.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6df42079bdea8a76efc00031ef217a5b409d51ec
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/__pycache__/async_gym_env_vectorization.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/__pycache__/custom_env_render_method.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/__pycache__/custom_env_render_method.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..977b9d7e92867b8da8bc850a3a19748d72bdbe85
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/__pycache__/custom_env_render_method.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/__pycache__/custom_gym_env.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/__pycache__/custom_gym_env.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ac045a29111694d1205563ced257e5c0c60e1077
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/__pycache__/custom_gym_env.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/__pycache__/env_connecting_to_rllib_w_tcp_client.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/__pycache__/env_connecting_to_rllib_w_tcp_client.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e4bea66b30566d1f3e1a3a443f553a1c0de32660
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/__pycache__/env_connecting_to_rllib_w_tcp_client.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/__pycache__/env_rendering_and_recording.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/__pycache__/env_rendering_and_recording.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..096ff061a6feece80c30718179e48968e5b5bb54
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/__pycache__/env_rendering_and_recording.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/__pycache__/env_w_protobuf_observations.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/__pycache__/env_w_protobuf_observations.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..40586d831225ceb8f3f794e095b7de48f17f2020
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/__pycache__/env_w_protobuf_observations.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/__pycache__/greyscale_env.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/__pycache__/greyscale_env.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a552734bef76ac02c9f710e6fb3cff760c2fea1e
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/__pycache__/greyscale_env.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/__pycache__/unity3d_env_local.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/__pycache__/unity3d_env_local.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..85baecec9f16e1cd5293d69dd816cb58ca81d71b
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/__pycache__/unity3d_env_local.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c2bd6417923f8ac6ca62ca8b6172eb870f3c22ef
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/__init__.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/action_mask_env.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/action_mask_env.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..98f279b2e1a66ac90f03af28be932c99d41d46a1
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/action_mask_env.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/cartpole_crashing.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/cartpole_crashing.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1ca98e8880ff45f523129945347b6313c8ae8b71
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/cartpole_crashing.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/cartpole_sparse_rewards.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/cartpole_sparse_rewards.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..84a58ad159bb6184b81b592b00d1854f04425168
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/cartpole_sparse_rewards.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/cartpole_with_dict_observation_space.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/cartpole_with_dict_observation_space.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6ec84727b993ae90be8548731fdfa2eb4a079e0f
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/cartpole_with_dict_observation_space.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/cartpole_with_large_observation_space.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/cartpole_with_large_observation_space.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..75b4a5cfadb0394f6d7cf315f18e547b00defe64
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/cartpole_with_large_observation_space.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/cartpole_with_protobuf_observation_space.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/cartpole_with_protobuf_observation_space.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0d2780a8e4831838aa192ed510107bcd3c8ad174
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/cartpole_with_protobuf_observation_space.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/cliff_walking_wall_env.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/cliff_walking_wall_env.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..87b754cc886e6213e1c8c3de2461e5c5a31afda0
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/cliff_walking_wall_env.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/correlated_actions_env.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/correlated_actions_env.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..599f3508b833ecedbe10f78e54f84236cbd87d35
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/correlated_actions_env.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/d4rl_env.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/d4rl_env.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7dd0507c4a9ef46a21e9ad8dcae504d157196e8b
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/d4rl_env.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/debug_counter_env.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/debug_counter_env.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ed37a8dc452d72468435eb5f0f9c9b07539498fb
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/debug_counter_env.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/deterministic_envs.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/deterministic_envs.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..603718eaf7e770b11a12221f9154e188dc4f8612
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/deterministic_envs.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/dm_control_suite.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/dm_control_suite.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..707ebf9b87305a1dde7ed2686c007f53e2945d3e
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/dm_control_suite.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/env_using_remote_actor.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/env_using_remote_actor.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d623674f65480df7dd1f4b0c21349245f379f487
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/env_using_remote_actor.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/env_with_subprocess.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/env_with_subprocess.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0d8580fd9958e961485a66c3673e22d613f63887
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/env_with_subprocess.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/fast_image_env.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/fast_image_env.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2c5728ef75df33ab0930bfc1e356342ba4ed9088
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/fast_image_env.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/gpu_requiring_env.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/gpu_requiring_env.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c5c1a2422f1ed7cfd11b41b43714ce8668346b8c
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/gpu_requiring_env.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/look_and_push.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/look_and_push.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5fd7bf9602dbe2f01ad7d9631d1baae14fb2caa6
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/look_and_push.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/memory_leaking_env.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/memory_leaking_env.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2ec4a3e7de5a8cf557241bb2ddae9da4d31af519
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/memory_leaking_env.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/mock_env.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/mock_env.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f0a41463cf6fdcb29515372800a50ce336a8024b
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/mock_env.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/nested_space_repeat_after_me_env.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/nested_space_repeat_after_me_env.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..45c55b5cedc185d059044c2d34cc4598dc0b3f98
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/nested_space_repeat_after_me_env.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/parametric_actions_cartpole.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/parametric_actions_cartpole.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e602845a956faf98077c425960a7291b0dbf0ac2
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/parametric_actions_cartpole.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/random_env.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/random_env.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..75f962a7e88f0be7e983a7cd5311ba87676ea3ec
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/random_env.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/recommender_system_envs_with_recsim.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/recommender_system_envs_with_recsim.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8986e41eeb8212d70fe3f5ccbb80a6ebb76587e3
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/recommender_system_envs_with_recsim.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/repeat_after_me_env.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/repeat_after_me_env.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..42a65ccacb41a11499bccd048a42ca1078bb19cd
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/repeat_after_me_env.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/repeat_initial_obs_env.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/repeat_initial_obs_env.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fad10cce2af701a0ff6bebc0ea89413f96287afd
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/repeat_initial_obs_env.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/simple_corridor.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/simple_corridor.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c665aacd5553753ec9ee14905e18ef56e9be962b
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/simple_corridor.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/simple_rpg.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/simple_rpg.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d6475c164a1793f511a23004aefaab3f08f9812b
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/simple_rpg.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/six_room_env.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/six_room_env.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..27258650a7f6d3dbdbfca24e88479214ba46a5ae
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/six_room_env.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/stateless_cartpole.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/stateless_cartpole.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..da91cd07e69c142b9b5cd09558c82a1c3ee6ecb7
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/stateless_cartpole.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/stateless_pendulum.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/stateless_pendulum.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2b271367bdd472e41ca33770f1424575df36db00
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/stateless_pendulum.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/transformed_action_space_env.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/transformed_action_space_env.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cceda54841019922d62cacb859d2aa938546f0e3
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/transformed_action_space_env.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/windy_maze_env.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/windy_maze_env.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b13e87e07521ee871e660e3f01b1a854a8732e36
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__pycache__/windy_maze_env.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/multi_agent/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/multi_agent/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b7fb660ccd462981f7ace066fceacb8556f8a356
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/multi_agent/__init__.py
@@ -0,0 +1,35 @@
+from ray.rllib.env.multi_agent_env import make_multi_agent
+from ray.rllib.examples.envs.classes.cartpole_with_dict_observation_space import (
+    CartPoleWithDictObservationSpace,
+)
+from ray.rllib.examples.envs.classes.multi_agent.guess_the_number_game import (
+    GuessTheNumberGame,
+)
+from ray.rllib.examples.envs.classes.multi_agent.two_step_game import (
+    TwoStepGame,
+    TwoStepGameWithGroupedAgents,
+)
+from ray.rllib.examples.envs.classes.nested_space_repeat_after_me_env import (
+    NestedSpaceRepeatAfterMeEnv,
+)
+from ray.rllib.examples.envs.classes.stateless_cartpole import StatelessCartPole
+
+
+# Backward compatibility.
+__all__ = [
+    "GuessTheNumberGame",
+    "TwoStepGame",
+    "TwoStepGameWithGroupedAgents",
+]
+
+
+MultiAgentCartPole = make_multi_agent("CartPole-v1")
+MultiAgentMountainCar = make_multi_agent("MountainCarContinuous-v0")
+MultiAgentPendulum = make_multi_agent("Pendulum-v1")
+MultiAgentStatelessCartPole = make_multi_agent(lambda config: StatelessCartPole(config))
+MultiAgentCartPoleWithDictObservationSpace = make_multi_agent(
+    lambda config: CartPoleWithDictObservationSpace(config)
+)
+MultiAgentNestedSpaceRepeatAfterMeEnv = make_multi_agent(
+    lambda config: NestedSpaceRepeatAfterMeEnv(config)
+)
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/multi_agent/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/multi_agent/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9c57f046204dbebbd6f0fabb92b05d4b2dca2f34
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/multi_agent/__pycache__/__init__.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/multi_agent/__pycache__/bandit_envs_discrete.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/multi_agent/__pycache__/bandit_envs_discrete.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d07911bdc75d17a397b91aa0ee4b43366387dfaf
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/multi_agent/__pycache__/bandit_envs_discrete.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/multi_agent/__pycache__/bandit_envs_recommender_system.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/multi_agent/__pycache__/bandit_envs_recommender_system.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d398a40c0466533e3c7db06b3cb3e84afe2c1200
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/multi_agent/__pycache__/bandit_envs_recommender_system.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/multi_agent/__pycache__/guess_the_number_game.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/multi_agent/__pycache__/guess_the_number_game.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bdb2e28f630565de10f26b765fc339af1f18aa1b
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/multi_agent/__pycache__/guess_the_number_game.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/multi_agent/__pycache__/pettingzoo_chess.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/multi_agent/__pycache__/pettingzoo_chess.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bc8f4b127e2dec7d59b978d3c334b41fc08574c2
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/multi_agent/__pycache__/pettingzoo_chess.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/multi_agent/__pycache__/pettingzoo_connect4.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/multi_agent/__pycache__/pettingzoo_connect4.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..88d5c774b0296898181a1d7bd980a112190353ea
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/multi_agent/__pycache__/pettingzoo_connect4.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/multi_agent/__pycache__/rock_paper_scissors.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/multi_agent/__pycache__/rock_paper_scissors.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..deb3c6734428d5cb1a338a6163e23c4e22ae224a
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/multi_agent/__pycache__/rock_paper_scissors.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/multi_agent/__pycache__/tic_tac_toe.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/multi_agent/__pycache__/tic_tac_toe.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e7ca364d09c4a4a01eef87b5fc175c20ef29148a
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/multi_agent/__pycache__/tic_tac_toe.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/multi_agent/__pycache__/two_step_game.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/multi_agent/__pycache__/two_step_game.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b5d0c3aa4467e15950c3a97f9a2719006f28239f
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/multi_agent/__pycache__/two_step_game.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/multi_agent/bandit_envs_recommender_system.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/multi_agent/bandit_envs_recommender_system.py
new file mode 100644
index 0000000000000000000000000000000000000000..05a29082a0d3352d96dd7cbace9b46a88f24e225
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/multi_agent/bandit_envs_recommender_system.py
@@ -0,0 +1,227 @@
+"""Examples for recommender system simulating envs ready to be used by RLlib Algorithms.
+
+This env follows RecSim obs and action APIs.
+"""
+import gymnasium as gym
+import numpy as np
+from typing import Optional
+
+from ray.rllib.utils.numpy import softmax
+
+
+class ParametricRecSys(gym.Env):
+    """A recommendation environment which generates items with visible features
+    randomly (parametric actions).
+    The environment can be configured to be multi-user, i.e. different models
+    will be learned independently for each user, by setting num_users_in_db
+    parameter.
+    To enable slate recommendation, the `slate_size` config parameter can be
+    set as > 1.
+    """
+
+    def __init__(
+        self,
+        embedding_size: int = 20,
+        num_docs_to_select_from: int = 10,
+        slate_size: int = 1,
+        num_docs_in_db: Optional[int] = None,
+        num_users_in_db: Optional[int] = None,
+        user_time_budget: float = 60.0,
+    ):
+        """Initializes a ParametricRecSys instance.
+
+        Args:
+            embedding_size: Embedding size for both users and docs.
+                Each value in the user/doc embeddings can have values between
+                -1.0 and 1.0.
+            num_docs_to_select_from: The number of documents to present to the
+                agent each timestep. The agent will then have to pick a slate
+                out of these.
+            slate_size: The size of the slate to recommend to the user at each
+                timestep.
+            num_docs_in_db: The total number of documents in the DB. Set this
+                to None, in case you would like to resample docs from an
+                infinite pool.
+            num_users_in_db: The total number of users in the DB. Set this to
+                None, in case you would like to resample users from an infinite
+                pool.
+            user_time_budget: The total time budget a user has throughout an
+                episode. Once this time budget is used up (through engagements
+                with clicked/selected documents), the episode ends.
+        """
+        self.embedding_size = embedding_size
+        self.num_docs_to_select_from = num_docs_to_select_from
+        self.slate_size = slate_size
+
+        self.num_docs_in_db = num_docs_in_db
+        self.docs_db = None
+        self.num_users_in_db = num_users_in_db
+        self.users_db = None
+        self.current_user = None
+
+        self.user_time_budget = user_time_budget
+        self.current_user_budget = user_time_budget
+
+        self.observation_space = gym.spaces.Dict(
+            {
+                # The D docs our agent sees at each timestep.
+                # It has to select a k-slate out of these.
+                "doc": gym.spaces.Dict(
+                    {
+                        str(i): gym.spaces.Box(
+                            -1.0, 1.0, shape=(self.embedding_size,), dtype=np.float32
+                        )
+                        for i in range(self.num_docs_to_select_from)
+                    }
+                ),
+                # The user engaging in this timestep/episode.
+                "user": gym.spaces.Box(
+                    -1.0, 1.0, shape=(self.embedding_size,), dtype=np.float32
+                ),
+                # For each item in the previous slate, was it clicked?
+                # If yes, how long was it being engaged with (e.g. watched)?
+                "response": gym.spaces.Tuple(
+                    [
+                        gym.spaces.Dict(
+                            {
+                                # Clicked or not?
+                                "click": gym.spaces.Discrete(2),
+                                # Engagement time (how many minutes watched?).
+                                "engagement": gym.spaces.Box(
+                                    0.0, 100.0, shape=(), dtype=np.float32
+                                ),
+                            }
+                        )
+                        for _ in range(self.slate_size)
+                    ]
+                ),
+            }
+        )
+        # The action picks one of the suggested docs for each slot in the slate.
+        self.action_space = gym.spaces.MultiDiscrete(
+            [self.num_docs_to_select_from for _ in range(self.slate_size)]
+        )
+
+    def _get_embedding(self):
+        return np.random.uniform(-1, 1, size=(self.embedding_size,)).astype(np.float32)
+
+    def reset(self, *, seed=None, options=None):
+        # Reset the current user's time budget.
+        self.current_user_budget = self.user_time_budget
+
+        # Sample a user for the next episode/session.
+        # Pick from a user DB that is sampled only once.
+        if self.num_users_in_db is not None:
+            if self.users_db is None:
+                self.users_db = [
+                    self._get_embedding() for _ in range(self.num_users_in_db)
+                ]
+            self.current_user = self.users_db[np.random.choice(self.num_users_in_db)]
+        # Pick from an infinite pool of users.
+        else:
+            self.current_user = self._get_embedding()
+
+        return self._get_obs(), {}
+
+    def step(self, action):
+        # Action is the suggested slate (indices of the docs in the
+        # suggested ones).
+
+        # We calculate scores as the dot product between document features and user
+        # features. The softmax ensures regret<1 further down.
+        scores = softmax(
+            [np.dot(self.current_user, doc) for doc in self.currently_suggested_docs]
+        )
+        best_reward = np.max(scores)
+
+        # User choice model: User picks a doc stochastically,
+        # where probs are dot products between user- and doc feature
+        # (categories) vectors (rewards).
+        # There is also a no-click doc whose weight is 0.0.
+        user_doc_overlaps = np.array([scores[a] for a in action] + [0.0])
+        # We have to softmax again so that probabilities add up to 1
+        probabilities = softmax(user_doc_overlaps)
+        which_clicked = np.random.choice(
+            np.arange(self.slate_size + 1), p=probabilities
+        )
+
+        reward = 0.0
+        if which_clicked < self.slate_size:
+            # Reward is 1.0 - regret if clicked. 0.0 if not clicked.
+            regret = best_reward - user_doc_overlaps[which_clicked]
+            # The reward also represents the user engagement that we define to be
+            # within the range [0...100].
+            reward = (1 - regret) * 100
+            # If anything clicked, deduct from the current user's time budget.
+            self.current_user_budget -= 1.0
+        done = truncated = self.current_user_budget <= 0.0
+
+        # Compile response.
+        response = tuple(
+            {
+                "click": int(idx == which_clicked),
+                "engagement": reward if idx == which_clicked else 0.0,
+            }
+            for idx in range(len(user_doc_overlaps) - 1)
+        )
+
+        return self._get_obs(response=response), reward, done, truncated, {}
+
+    def _get_obs(self, response=None):
+        # Sample D docs from infinity or our pre-existing docs.
+        # Pick from a docs DB that is sampled only once.
+        if self.num_docs_in_db is not None:
+            if self.docs_db is None:
+                self.docs_db = [
+                    self._get_embedding() for _ in range(self.num_docs_in_db)
+                ]
+            self.currently_suggested_docs = [
+                self.docs_db[doc_idx].astype(np.float32)
+                for doc_idx in np.random.choice(
+                    self.num_docs_in_db,
+                    size=(self.num_docs_to_select_from,),
+                    replace=False,
+                )
+            ]
+        # Pick from an infinite pool of docs.
+        else:
+            self.currently_suggested_docs = [
+                self._get_embedding() for _ in range(self.num_docs_to_select_from)
+            ]
+
+        doc = {str(i): d for i, d in enumerate(self.currently_suggested_docs)}
+
+        if not response:
+            response = self.observation_space["response"].sample()
+
+        return {
+            "user": self.current_user.astype(np.float32),
+            "doc": doc,
+            "response": response,
+        }
+
+
+if __name__ == "__main__":
+    """Test RecommSys env with random actions for baseline performance."""
+    env = ParametricRecSys(
+        num_docs_in_db=100,
+        num_users_in_db=1,
+    )
+    obs, info = env.reset()
+    num_episodes = 0
+    episode_rewards = []
+    episode_reward = 0.0
+
+    while num_episodes < 100:
+        action = env.action_space.sample()
+        obs, reward, done, truncated, _ = env.step(action)
+
+        episode_reward += reward
+        if done:
+            print(f"episode reward = {episode_reward}")
+            env.reset()
+            num_episodes += 1
+            episode_rewards.append(episode_reward)
+            episode_reward = 0.0
+
+    print(f"Avg reward={np.mean(episode_rewards)}")
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/utils/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/utils/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/utils/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..da1c6386f6c9bc5ddd8bd467aaab583d3d4d9f60
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/utils/__pycache__/__init__.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/utils/__pycache__/cartpole_observations_proto.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/utils/__pycache__/cartpole_observations_proto.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ebc58ec49eb381010cc3aeff8b1157339569ece5
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/utils/__pycache__/cartpole_observations_proto.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/utils/cartpole_observations_proto.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/utils/cartpole_observations_proto.py
new file mode 100644
index 0000000000000000000000000000000000000000..15b30f5b0b13680f3c4c4e8eba039cd038accd23
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/utils/cartpole_observations_proto.py
@@ -0,0 +1,28 @@
+# -*- coding: utf-8 -*-
+# Generated by the protocol buffer compiler.  DO NOT EDIT!
+# source: cartpole_observations.proto
+# Protobuf Python Version: 5.26.1
+"""Generated protocol buffer code."""
+from google.protobuf import descriptor as _descriptor
+from google.protobuf import descriptor_pool as _descriptor_pool
+from google.protobuf import symbol_database as _symbol_database
+from google.protobuf.internal import builder as _builder
+
+# @@protoc_insertion_point(imports)
+
+_sym_db = _symbol_database.Default()
+
+DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(
+    b'\n\x1b\x63\x61rtpole_observations.proto"]\n\x13\x43\x61rtPoleObservation\x12\r\n\x05x_pos\x18\x01 \x01(\x01\x12\x0f\n\x07x_veloc\x18\x02 \x01(\x01\x12\x11\n\tangle_pos\x18\x03 \x01(\x01\x12\x13\n\x0b\x61ngle_veloc\x18\x04 \x01(\x01\x62\x06proto3'  # noqa
+)
+
+_globals = globals()
+_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
+_builder.BuildTopDescriptorsAndMessages(
+    DESCRIPTOR, "cartpole_observations_proto", _globals
+)
+if not _descriptor._USE_C_DESCRIPTORS:
+    DESCRIPTOR._loaded_options = None
+    _globals["_CARTPOLEOBSERVATION"]._serialized_start = 31
+    _globals["_CARTPOLEOBSERVATION"]._serialized_end = 124
+# @@protoc_insertion_point(module_scope)
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/fault_tolerance/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/fault_tolerance/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/fault_tolerance/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/fault_tolerance/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f605e4be7d829ae116df58f7857b3c5a8ddc3506
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/fault_tolerance/__pycache__/__init__.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/fault_tolerance/__pycache__/crashing_and_stalling_env.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/fault_tolerance/__pycache__/crashing_and_stalling_env.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..41e6febe68ed39736784091afae50aa9a5c898df
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/fault_tolerance/__pycache__/crashing_and_stalling_env.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/fault_tolerance/crashing_and_stalling_env.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/fault_tolerance/crashing_and_stalling_env.py
new file mode 100644
index 0000000000000000000000000000000000000000..4425d51d5d9e59e7ef41137d826885bf53f58976
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/fault_tolerance/crashing_and_stalling_env.py
@@ -0,0 +1,176 @@
+"""Example demonstrating that RLlib can learn (at scale) in unstable environments.
+
+This script uses the `CartPoleCrashing` environment, an adapted cartpole env whose
+instability is configurable through setting the probability of a crash and/or stall
+(sleep for a configurable amount of time) during `reset()` and/or `step()`.
+
+RLlib has two major flags for EnvRunner fault tolerance, which can be independently
+set to True:
+1) `config.fault_tolerance(restart_failed_sub_environments=True)` causes only the
+(gymnasium) environment object on an EnvRunner to be closed (try calling `close()` on
+the faulty object), garbage collected, and finally recreated from scratch. Note that
+during this process, the containing EnvRunner remains up and running and sampling
+simply continues after the env recycling. This is the lightest and fastest form of
+fault tolerance and should be attempted first.
+2) `config.fault_tolerance(restart_failed_env_runners=True)` causes the entire
+EnvRunner (a Ray remote actor) to be restarted. This restart logically includes the
+gymnasium environment, the RLModule, and all connector pipelines on the EnvRunner.
+Use this option only if you face problems with the first option
+(restart_failed_sub_environments=True), such as incomplete cleanups and memory leaks.
+
+
+How to run this script
+----------------------
+`python [script file name].py --enable-new-api-stack`
+
+You can switch on the fault tolerant behavior (1) (restart_failed_sub_environments)
+through the `--restart-failed-envs` flag. If this flag is not set, the script will
+recreate the entire (faulty) EnvRunner.
+
+You can switch on stalling (besides crashing) through the `--stall` command line flag.
+If set, besides crashing on `reset()` and/or `step()`, there is also a chance of
+stalling for a few seconds on each of these events.
+
+For debugging, use the following additional command line options
+`--no-tune --num-env-runners=0`
+which should allow you to set breakpoints anywhere in the RLlib code and
+have the execution stop there for inspection and debugging.
+
+For logging to your WandB account, use:
+`--wandb-key=[your WandB API key] --wandb-project=[some project name]
+--wandb-run-name=[optional: WandB run name (within the defined project)]`
+
+
+Results to expect
+-----------------
+You should see the following (or very similar) console output when running this script
+with:
+`--algo=PPO --stall --restart-failed-envs --stop-reward=450.0`
++---------------------+------------+----------------+--------+------------------+
+| Trial name          | status     | loc            |   iter |   total time (s) |
+|                     |            |                |        |                  |
+|---------------------+------------+----------------+--------+------------------+
+| PPO_env_ba39b_00000 | TERMINATED | 127.0.0.1:1401 |     22 |          133.497 |
++---------------------+------------+----------------+--------+------------------+
++------------------------+------------------------+------------------------+
+|    episode_return_mean |   num_episodes_lifetim |   num_env_steps_traine |
+|                        |                      e |             d_lifetime |
+|------------------------+------------------------+------------------------|
+|                 450.24 |                    542 |                  88628 |
++------------------------+------------------------+------------------------+
+
+For APPO and testing restarting the entire EnvRunners, you could run the script with:
+`--algo=APPO --stall --stop-reward=450.0`
++----------------------+------------+----------------+--------+------------------+
+| Trial name           | status     | loc            |   iter |   total time (s) |
+|                      |            |                |        |                  |
+|----------------------+------------+----------------+--------+------------------+
+| APPO_env_ba39b_00000 | TERMINATED | 127.0.0.1:4653 |     10 |          101.531 |
++----------------------+------------+----------------+--------+------------------+
++------------------------+------------------------+------------------------+
+|    episode_return_mean |   num_episodes_lifetim |   num_env_steps_traine |
+|                        |                      e |             d_lifetime |
+|------------------------+------------------------+------------------------|
+|                 478.85 |                   2546 |                 321500 |
++------------------------+------------------------+------------------------+
+"""
+from gymnasium.wrappers import TimeLimit
+
+from ray import tune
+from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig
+from ray.rllib.examples.envs.classes.cartpole_crashing import (
+    CartPoleCrashing,
+    MultiAgentCartPoleCrashing,
+)
+from ray.rllib.utils.test_utils import (
+    add_rllib_example_script_args,
+    run_rllib_example_script_experiment,
+)
+
+parser = add_rllib_example_script_args(
+    default_reward=450.0,
+    default_timesteps=2000000,
+)
+parser.set_defaults(
+    enable_new_api_stack=True,
+    num_env_runners=4,
+)
+# Use `parser` to add your own custom command line options to this script
+# and (if needed) use their values to set up `config` below.
+parser.add_argument(
+    "--stall",
+    action="store_true",
+    help="Whether to also stall the env from time to time",
+)
+parser.add_argument(
+    "--restart-failed-envs",
+    action="store_true",
+    help="Whether to restart a failed environment (vs restarting the entire "
+    "EnvRunner).",
+)
+
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+
+    # Register our environment with tune.
+    if args.num_agents > 0:
+        tune.register_env("env", lambda cfg: MultiAgentCartPoleCrashing(cfg))
+    else:
+        tune.register_env(
+            "env",
+            lambda cfg: TimeLimit(CartPoleCrashing(cfg), max_episode_steps=500),
+        )
+
+    base_config = (
+        tune.registry.get_trainable_cls(args.algo)
+        .get_default_config()
+        .environment(
+            "env",
+            env_config={
+                "num_agents": args.num_agents,
+                # Probability to crash during step().
+                "p_crash": 0.0001,
+                # Probability to crash during reset().
+                "p_crash_reset": 0.001,
+                "crash_on_worker_indices": [1, 2],
+                "init_time_s": 2.0,
+                # Probability to stall during step().
+                "p_stall": 0.0005,
+                # Probability to stall during reset().
+                "p_stall_reset": 0.001,
+                # Stall from 2 to 5sec (or 0.0 if --stall not set).
+                "stall_time_sec": (2, 5) if args.stall else 0.0,
+                # EnvRunner indices to stall on.
+                "stall_on_worker_indices": [2, 3],
+            },
+        )
+        # Switch on resiliency.
+        .fault_tolerance(
+            # Recreate any failed EnvRunners.
+            restart_failed_env_runners=True,
+            # Restart any failed environment (w/o recreating the EnvRunner). Note that
+            # this is the much faster option.
+            restart_failed_sub_environments=args.restart_failed_envs,
+        )
+    )
+
+    # Use more stabilizing hyperparams for APPO.
+    if args.algo == "APPO":
+        base_config.training(
+            grad_clip=40.0,
+            entropy_coeff=0.0,
+            vf_loss_coeff=0.05,
+        )
+        base_config.rl_module(
+            model_config=DefaultModelConfig(vf_share_layers=True),
+        )
+
+    # Add a simple multi-agent setup.
+    if args.num_agents > 0:
+        base_config.multi_agent(
+            policies={f"p{i}" for i in range(args.num_agents)},
+            policy_mapping_fn=lambda aid, *a, **kw: f"p{aid}",
+        )
+
+    run_rllib_example_script_experiment(base_config, args=args)
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/ray_serve/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/ray_serve/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/ray_serve/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/ray_serve/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c581351ad3dae4018e7086b23272d4d6e2d23755
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/ray_serve/__pycache__/__init__.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/ray_serve/__pycache__/ray_serve_with_rllib.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/ray_serve/__pycache__/ray_serve_with_rllib.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ceb2866549cbee3268998faad232e545d33de87d
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/ray_serve/__pycache__/ray_serve_with_rllib.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/ray_serve/classes/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/ray_serve/classes/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/ray_serve/classes/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/ray_serve/classes/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..72995fb401e47b9d387f589d16eb02f7025fc8a0
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/ray_serve/classes/__pycache__/__init__.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/ray_serve/classes/__pycache__/cartpole_deployment.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/ray_serve/classes/__pycache__/cartpole_deployment.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..79b9b82e1ad446c77fd1253fdee8a8f7408d7567
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/ray_serve/classes/__pycache__/cartpole_deployment.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/ray_serve/classes/cartpole_deployment.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/ray_serve/classes/cartpole_deployment.py
new file mode 100644
index 0000000000000000000000000000000000000000..41686306c09577bb1f41f1205e0f96cde6f8100c
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/ray_serve/classes/cartpole_deployment.py
@@ -0,0 +1,50 @@
+import json
+from typing import Dict
+
+import numpy as np
+from starlette.requests import Request
+import torch
+
+from ray import serve
+from ray.rllib.core import Columns
+from ray.rllib.core.rl_module.rl_module import RLModule
+from ray.serve.schema import LoggingConfig
+
+
+@serve.deployment(logging_config=LoggingConfig(log_level="WARN"))
+class ServeRLlibRLModule:
+    """Callable class used by Ray Serve to handle async requests.
+
+    All the necessary serving logic is implemented in here:
+    - Restoring of the (already trained) RLModule from the given checkpoint.
+    - Calls to `RLModule.forward_inference` upon receiving an action request
+      (JSON body with an "observation" key); replies with the greedy action.
+    """
+
+    def __init__(self, rl_module_checkpoint):
+        self.rl_module = RLModule.from_checkpoint(rl_module_checkpoint)
+
+    async def __call__(self, starlette_request: Request) -> Dict:
+        request = await starlette_request.body()
+        request = request.decode("utf-8")
+        request = json.loads(request)
+        obs = request["observation"]
+
+        # Compute the action for the given observation (create a batch with B=1
+        # and convert to torch). Use the `Columns` constants for input and output.
+        output = self.rl_module.forward_inference(
+            batch={Columns.OBS: torch.from_numpy(np.array([obs], np.float32))}
+        )
+        # Extract action logits and unbatch.
+        logits = output[Columns.ACTION_DIST_INPUTS][0]
+        # Act greedily (argmax).
+        action = int(np.argmax(logits))
+
+        return {"action": action}
+
+
+# Builder function (args arrive as strings from the CLI). Start the deployment via:
+# `serve run [this py module]:rl_module rl_module_checkpoint=[checkpoint path]`
+def rl_module(args: Dict[str, str]):
+    serve.start(http_options={"host": "0.0.0.0", "port": int(args.get("port", 12345))})
+    return ServeRLlibRLModule.bind(args["rl_module_checkpoint"])
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/ray_serve/ray_serve_with_rllib.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/ray_serve/ray_serve_with_rllib.py
new file mode 100644
index 0000000000000000000000000000000000000000..0853151f40fa0b11905531305393ec58c5cc9c25
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/ray_serve/ray_serve_with_rllib.py
@@ -0,0 +1,190 @@
+"""Example on how to run RLlib in combination with Ray Serve.
+
+This example trains an agent with PPO on the CartPole environment, then creates
+an RLModule checkpoint and returns its location. After that, it sends the checkpoint
+to the Serve deployment for serving the trained RLModule (policy).
+
+This example:
+    - shows how to set up a Ray Serve deployment for serving an already trained
+    RLModule (policy network).
+    - shows how to request new actions from the Ray Serve deployment while actually
+    running through episodes in an environment (on which the RLModule that's served
+    was trained).
+
+
+How to run this script
+----------------------
+`python [script file name].py --enable-new-api-stack --stop-reward=200.0`
+
+Use the `--stop-iters`, `--stop-reward`, and/or `--stop-timesteps` options to
+determine how long to train the policy for. Use the `--num-episodes-served` option
+to set the number of episodes to serve (after training) and the `--no-render`
+option to NOT render the environment during the serving phase.
+
+For debugging, use the following additional command line options
+`--no-tune --num-env-runners=0`
+which should allow you to set breakpoints anywhere in the RLlib code and
+have the execution stop there for inspection and debugging.
+
+For logging to your WandB account, use:
+`--wandb-key=[your WandB API key] --wandb-project=[some project name]
+--wandb-run-name=[optional: WandB run name (within the defined project)]`
+
+You can visualize experiment results in ~/ray_results using TensorBoard.
+
+
+Results to expect
+-----------------
+
+You should see something similar to the following on the command line when using the
+options: `--stop-reward=250.0`, `--num-episodes-served=2`, and `--port=12345`:
+
+[First, the RLModule is trained through PPO]
+
++-----------------------------+------------+-----------------+--------+
+| Trial name                  | status     | loc             |   iter |
+|                             |            |                 |        |
+|-----------------------------+------------+-----------------+--------+
+| PPO_CartPole-v1_84778_00000 | TERMINATED | 127.0.0.1:40411 |      1 |
++-----------------------------+------------+-----------------+--------+
++------------------+---------------------+------------------------+
+|   total time (s) | episode_return_mean |   num_env_steps_sample |
+|                  |                     |             d_lifetime |
+|------------------+---------------------|------------------------|
+|          2.87052 |               253.2 |                  12000 |
++------------------+---------------------+------------------------+
+
+[The RLModule is deployed through Ray Serve on port 12345]
+
+Started Ray Serve with PID: 40458
+
+[A few episodes are played through using the policy service (w/ greedy, non-exploratory
+actions)]
+
+Episode R=500.0
+Episode R=500.0
+"""
+
+import atexit
+import os
+
+import requests
+import subprocess
+import time
+
+import gymnasium as gym
+from pathlib import Path
+
+from ray.rllib.algorithms.ppo import PPOConfig
+from ray.rllib.core import (
+    COMPONENT_LEARNER_GROUP,
+    COMPONENT_LEARNER,
+    COMPONENT_RL_MODULE,
+    DEFAULT_MODULE_ID,
+)
+from ray.rllib.utils.metrics import (
+    ENV_RUNNER_RESULTS,
+    EPISODE_RETURN_MEAN,
+)
+from ray.rllib.utils.test_utils import (
+    add_rllib_example_script_args,
+    run_rllib_example_script_experiment,
+)
+
+parser = add_rllib_example_script_args()
+parser.set_defaults(
+    enable_new_api_stack=True,
+    checkpoint_freq=1,
+    checkpoint_at_end=True,  # The script serves from a checkpoint of this run.
+)
+parser.add_argument("--num-episodes-served", type=int, default=2)
+parser.add_argument("--no-render", action="store_true")
+parser.add_argument("--port", type=int, default=12345)
+
+
+def kill_proc(proc):
+    proc.terminate()  # Polite shutdown request first (SIGTERM).
+    try:
+        proc.wait(timeout=5)  # Give the process up to 5 sec to exit.
+    except subprocess.TimeoutExpired:
+        proc.kill()  # Still running -> force it down (SIGKILL).
+        proc.wait()  # Block until the process is really gone.
+
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+
+    # Config for the served RLlib RLModule/Algorithm.
+    base_config = PPOConfig().environment("CartPole-v1")
+
+    results = run_rllib_example_script_experiment(base_config, args)
+    algo_checkpoint = results.get_best_result(
+        f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}"
+    ).checkpoint.path
+    # We only need the RLModule component from the algorithm checkpoint. It's located
+    # under "[algo checkpoint dir]/learner_group/learner/rl_module/[default module ID]"
+    rl_module_checkpoint = (
+        Path(algo_checkpoint)
+        / COMPONENT_LEARNER_GROUP
+        / COMPONENT_LEARNER
+        / COMPONENT_RL_MODULE
+        / DEFAULT_MODULE_ID
+    )
+
+    path_of_this_file = Path(__file__).parent
+    os.chdir(path_of_this_file)
+    # Start the serve app with the trained checkpoint.
+    serve_proc = subprocess.Popen(
+        [
+            "serve",
+            "run",
+            "classes.cartpole_deployment:rl_module",
+            f"rl_module_checkpoint={rl_module_checkpoint}",
+            f"port={args.port}",
+            "route_prefix=/rllib-rlmodule",
+        ]
+    )
+    # Register our `kill_proc` function to be called on exit to stop Ray Serve again.
+    atexit.register(kill_proc, serve_proc)
+    # Wait a while to make sure the app is ready to serve.
+    time.sleep(20)
+    print(f"Started Ray Serve with PID: {serve_proc.pid}")
+
+    try:
+        # Create the environment that we would like to receive served actions for.
+        # Only request a "human" render window if `--no-render` was NOT given.
+        env = gym.make("CartPole-v1", render_mode=None if args.no_render else "human")
+        obs, _ = env.reset()
+
+        num_episodes = 0
+        episode_return = 0.0
+
+        while num_episodes < args.num_episodes_served:
+            # Render env if necessary.
+            if not args.no_render:
+                env.render()
+
+            # Send a request to serve; time out rather than hang if it is not up.
+            resp = requests.get(
+                f"http://localhost:{args.port}/rllib-rlmodule",
+                json={"observation": obs.tolist()},
+                timeout=30,
+            )
+            resp.raise_for_status()  # Surface HTTP errors before JSON decoding.
+            response = resp.json()
+
+            # Apply the action in the env.
+            action = response["action"]
+            obs, reward, terminated, truncated, _ = env.step(action)
+            episode_return += reward
+
+            # If episode done -> reset to get initial observation of new episode.
+            if terminated or truncated:
+                print(f"Episode R={episode_return}")
+                obs, _ = env.reset()
+                num_episodes += 1
+                episode_return = 0.0
+
+    finally:
+        # Make sure to kill the process on script termination
+        kill_proc(serve_proc)