diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4b438bd320d0a8f070bf7d15872f769629b15261 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/__pycache__/attention_net_supervised.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/__pycache__/attention_net_supervised.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..48260b4c6158dc7b249898c973dc099f52f09af4 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/__pycache__/attention_net_supervised.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/__pycache__/parametric_actions_cartpole.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/__pycache__/parametric_actions_cartpole.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..aefc2eec8fba96a6097ef3fe195d62db4b8f2f99 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/__pycache__/parametric_actions_cartpole.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/__pycache__/parametric_actions_cartpole_embeddings_learnt_by_model.cpython-311.pyc 
b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/__pycache__/parametric_actions_cartpole_embeddings_learnt_by_model.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..360409a994ecd5bf5043b606a655f3c0ea9edf63 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/__pycache__/parametric_actions_cartpole_embeddings_learnt_by_model.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/attention_net_supervised.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/attention_net_supervised.py new file mode 100644 index 0000000000000000000000000000000000000000..2c0f13f506aa5f5a04d31a323df6b6213c3a83e1 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/attention_net_supervised.py @@ -0,0 +1,77 @@ +# @OldAPIStack +from gymnasium.spaces import Box, Discrete +import numpy as np + +from rllib.models.tf.attention_net import TrXLNet +from ray.rllib.utils.framework import try_import_tf + +tf1, tf, tfv = try_import_tf() + + +def bit_shift_generator(seq_length, shift, batch_size): + while True: + values = np.array([0.0, 1.0], dtype=np.float32) + seq = np.random.choice(values, (batch_size, seq_length, 1)) + targets = np.squeeze(np.roll(seq, shift, axis=1).astype(np.int32)) + targets[:, :shift] = 0 + yield seq, targets + + +def train_loss(targets, outputs): + loss = tf.nn.sparse_softmax_cross_entropy_with_logits( + labels=targets, logits=outputs + ) + return tf.reduce_mean(loss) + + +def train_bit_shift(seq_length, num_iterations, print_every_n): + + optimizer = tf.keras.optimizers.Adam(1e-3) + + model = TrXLNet( + observation_space=Box(low=0, high=1, shape=(1,), dtype=np.int32), + action_space=Discrete(2), + num_outputs=2, + model_config={"max_seq_len": seq_length}, + name="trxl", + num_transformer_units=1, + attention_dim=10, + num_heads=5, + head_dim=20, + 
position_wise_mlp_dim=20, + ) + + shift = 10 + train_batch = 10 + test_batch = 100 + data_gen = bit_shift_generator(seq_length, shift=shift, batch_size=train_batch) + test_gen = bit_shift_generator(seq_length, shift=shift, batch_size=test_batch) + + @tf.function + def update_step(inputs, targets): + model_out = model( + {"obs": inputs}, + state=[tf.reshape(inputs, [-1, seq_length, 1])], + seq_lens=np.full(shape=(train_batch,), fill_value=seq_length), + ) + optimizer.minimize( + lambda: train_loss(targets, model_out), lambda: model.trainable_variables + ) + + for i, (inputs, targets) in zip(range(num_iterations), data_gen): + inputs_in = np.reshape(inputs, [-1, 1]) + targets_in = np.reshape(targets, [-1]) + update_step(tf.convert_to_tensor(inputs_in), tf.convert_to_tensor(targets_in)) + + if i % print_every_n == 0: + test_inputs, test_targets = next(test_gen) + print(i, train_loss(test_targets, model(test_inputs))) + + +if __name__ == "__main__": + tf.enable_eager_execution() + train_bit_shift( + seq_length=20, + num_iterations=2000, + print_every_n=200, + ) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..59950126ca06e56a2a63d3ce95b8dd5e8b6abc03 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/__pycache__/__init__.cpython-311.pyc differ diff --git 
a/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/__pycache__/action_mask_model.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/__pycache__/action_mask_model.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fa029edb18b6df56671a8b30785523c3a1684026 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/__pycache__/action_mask_model.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/__pycache__/autoregressive_action_dist.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/__pycache__/autoregressive_action_dist.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ccec20ec142064b3dc4c79e15769e705ffa5f835 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/__pycache__/autoregressive_action_dist.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/__pycache__/autoregressive_action_model.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/__pycache__/autoregressive_action_model.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fbeddd4ac780e3a5504e1dff315125e4e94eda10 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/__pycache__/autoregressive_action_model.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/__pycache__/centralized_critic_models.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/__pycache__/centralized_critic_models.cpython-311.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..cdd082e4d30dfd3f20fef75a3814e40f2f899b53 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/__pycache__/centralized_critic_models.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/__pycache__/custom_loss_model.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/__pycache__/custom_loss_model.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d0fdc2d3fd872ee292b4400bdf27cd5fb237e853 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/__pycache__/custom_loss_model.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/__pycache__/fast_model.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/__pycache__/fast_model.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2e0fbcc16b181879e9b52f308cc1a00db6975ed1 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/__pycache__/fast_model.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/__pycache__/mobilenet_v2_encoder.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/__pycache__/mobilenet_v2_encoder.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5d154e78d6b69a77212bc5fadb8c061dbf2a5550 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/__pycache__/mobilenet_v2_encoder.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/__pycache__/mobilenet_v2_with_lstm_models.cpython-311.pyc 
b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/__pycache__/mobilenet_v2_with_lstm_models.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..97a4ac655037036643ffda7249c792a775c0034e Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/__pycache__/mobilenet_v2_with_lstm_models.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/__pycache__/parametric_actions_model.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/__pycache__/parametric_actions_model.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..66e7a1d9783dec67869e833f80b29b23f5c16ae9 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/__pycache__/parametric_actions_model.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/__pycache__/shared_weights_model.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/__pycache__/shared_weights_model.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bae081f58355bf3a6ad4eec059ef463b49e87e7c Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/__pycache__/shared_weights_model.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/__pycache__/simple_rpg_model.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/__pycache__/simple_rpg_model.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9f4d783014f4a0982aeabe00f44dc5e0139301e5 Binary files /dev/null and 
b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/__pycache__/simple_rpg_model.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/action_mask_model.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/action_mask_model.py new file mode 100644 index 0000000000000000000000000000000000000000..92fe99e53847ef5f801cc94dc7e30a2f28838ffc --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/action_mask_model.py @@ -0,0 +1,126 @@ +# @OldAPIStack +from gymnasium.spaces import Dict + +from ray.rllib.models.tf.fcnet import FullyConnectedNetwork +from ray.rllib.models.tf.tf_modelv2 import TFModelV2 +from ray.rllib.models.torch.torch_modelv2 import TorchModelV2 +from ray.rllib.models.torch.fcnet import FullyConnectedNetwork as TorchFC +from ray.rllib.utils.framework import try_import_tf, try_import_torch +from ray.rllib.utils.torch_utils import FLOAT_MIN + +tf1, tf, tfv = try_import_tf() +torch, nn = try_import_torch() + + +class ActionMaskModel(TFModelV2): + """Model that handles simple discrete action masking. + + This assumes the outputs are logits for a single Categorical action dist. + Getting this to work with a more complex output (e.g., if the action space + is a tuple of several distributions) is also possible but left as an + exercise to the reader. 
+ """ + + def __init__( + self, obs_space, action_space, num_outputs, model_config, name, **kwargs + ): + + orig_space = getattr(obs_space, "original_space", obs_space) + assert ( + isinstance(orig_space, Dict) + and "action_mask" in orig_space.spaces + and "observations" in orig_space.spaces + ) + + super().__init__(obs_space, action_space, num_outputs, model_config, name) + + self.internal_model = FullyConnectedNetwork( + orig_space["observations"], + action_space, + num_outputs, + model_config, + name + "_internal", + ) + + # disable action masking --> will likely lead to invalid actions + self.no_masking = model_config["custom_model_config"].get("no_masking", False) + + def forward(self, input_dict, state, seq_lens): + # Extract the available actions tensor from the observation. + action_mask = input_dict["obs"]["action_mask"] + + # Compute the unmasked logits. + logits, _ = self.internal_model({"obs": input_dict["obs"]["observations"]}) + + # If action masking is disabled, directly return unmasked logits + if self.no_masking: + return logits, state + + # Convert action_mask into a [0.0 || -inf]-type mask. + inf_mask = tf.maximum(tf.math.log(action_mask), tf.float32.min) + masked_logits = logits + inf_mask + + # Return masked logits. 
+ return masked_logits, state + + def value_function(self): + return self.internal_model.value_function() + + +class TorchActionMaskModel(TorchModelV2, nn.Module): + """PyTorch version of above ActionMaskingModel.""" + + def __init__( + self, + obs_space, + action_space, + num_outputs, + model_config, + name, + **kwargs, + ): + orig_space = getattr(obs_space, "original_space", obs_space) + assert ( + isinstance(orig_space, Dict) + and "action_mask" in orig_space.spaces + and "observations" in orig_space.spaces + ) + + TorchModelV2.__init__( + self, obs_space, action_space, num_outputs, model_config, name, **kwargs + ) + nn.Module.__init__(self) + + self.internal_model = TorchFC( + orig_space["observations"], + action_space, + num_outputs, + model_config, + name + "_internal", + ) + + # disable action masking --> will likely lead to invalid actions + self.no_masking = False + if "no_masking" in model_config["custom_model_config"]: + self.no_masking = model_config["custom_model_config"]["no_masking"] + + def forward(self, input_dict, state, seq_lens): + # Extract the available actions tensor from the observation. + action_mask = input_dict["obs"]["action_mask"] + + # Compute the unmasked logits. + logits, _ = self.internal_model({"obs": input_dict["obs"]["observations"]}) + + # If action masking is disabled, directly return unmasked logits + if self.no_masking: + return logits, state + + # Convert action_mask into a [0.0 || -inf]-type mask. + inf_mask = torch.clamp(torch.log(action_mask), min=FLOAT_MIN) + masked_logits = logits + inf_mask + + # Return masked logits. 
+ return masked_logits, state + + def value_function(self): + return self.internal_model.value_function() diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/autoregressive_action_dist.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/autoregressive_action_dist.py new file mode 100644 index 0000000000000000000000000000000000000000..fd8f2d53f7789d5a1f670361c10b041123664ab7 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/autoregressive_action_dist.py @@ -0,0 +1,149 @@ +# @OldAPIStack +from ray.rllib.models.tf.tf_action_dist import Categorical, ActionDistribution +from ray.rllib.models.torch.torch_action_dist import ( + TorchCategorical, + TorchDistributionWrapper, +) +from ray.rllib.utils.framework import try_import_tf, try_import_torch + +tf1, tf, tfv = try_import_tf() +torch, nn = try_import_torch() + + +class BinaryAutoregressiveDistribution(ActionDistribution): + """Action distribution P(a1, a2) = P(a1) * P(a2 | a1)""" + + def deterministic_sample(self): + # First, sample a1. + a1_dist = self._a1_distribution() + a1 = a1_dist.deterministic_sample() + + # Sample a2 conditioned on a1. + a2_dist = self._a2_distribution(a1) + a2 = a2_dist.deterministic_sample() + self._action_logp = a1_dist.logp(a1) + a2_dist.logp(a2) + + # Return the action tuple. + return (a1, a2) + + def sample(self): + # First, sample a1. + a1_dist = self._a1_distribution() + a1 = a1_dist.sample() + + # Sample a2 conditioned on a1. + a2_dist = self._a2_distribution(a1) + a2 = a2_dist.sample() + self._action_logp = a1_dist.logp(a1) + a2_dist.logp(a2) + + # Return the action tuple. 
+ return (a1, a2) + + def logp(self, actions): + a1, a2 = actions[:, 0], actions[:, 1] + a1_vec = tf.expand_dims(tf.cast(a1, tf.float32), 1) + a1_logits, a2_logits = self.model.action_model([self.inputs, a1_vec]) + return Categorical(a1_logits).logp(a1) + Categorical(a2_logits).logp(a2) + + def sampled_action_logp(self): + return self._action_logp + + def entropy(self): + a1_dist = self._a1_distribution() + a2_dist = self._a2_distribution(a1_dist.sample()) + return a1_dist.entropy() + a2_dist.entropy() + + def kl(self, other): + a1_dist = self._a1_distribution() + a1_terms = a1_dist.kl(other._a1_distribution()) + + a1 = a1_dist.sample() + a2_terms = self._a2_distribution(a1).kl(other._a2_distribution(a1)) + return a1_terms + a2_terms + + def _a1_distribution(self): + BATCH = tf.shape(self.inputs)[0] + a1_logits, _ = self.model.action_model([self.inputs, tf.zeros((BATCH, 1))]) + a1_dist = Categorical(a1_logits) + return a1_dist + + def _a2_distribution(self, a1): + a1_vec = tf.expand_dims(tf.cast(a1, tf.float32), 1) + _, a2_logits = self.model.action_model([self.inputs, a1_vec]) + a2_dist = Categorical(a2_logits) + return a2_dist + + @staticmethod + def required_model_output_shape(action_space, model_config): + return 16 # controls model output feature vector size + + +class TorchBinaryAutoregressiveDistribution(TorchDistributionWrapper): + """Action distribution P(a1, a2) = P(a1) * P(a2 | a1)""" + + def deterministic_sample(self): + # First, sample a1. + a1_dist = self._a1_distribution() + a1 = a1_dist.deterministic_sample() + + # Sample a2 conditioned on a1. + a2_dist = self._a2_distribution(a1) + a2 = a2_dist.deterministic_sample() + self._action_logp = a1_dist.logp(a1) + a2_dist.logp(a2) + + # Return the action tuple. + return (a1, a2) + + def sample(self): + # First, sample a1. + a1_dist = self._a1_distribution() + a1 = a1_dist.sample() + + # Sample a2 conditioned on a1. 
+ a2_dist = self._a2_distribution(a1) + a2 = a2_dist.sample() + self._action_logp = a1_dist.logp(a1) + a2_dist.logp(a2) + + # Return the action tuple. + return (a1, a2) + + def logp(self, actions): + a1, a2 = actions[:, 0], actions[:, 1] + a1_vec = torch.unsqueeze(a1.float(), 1) + a1_logits, a2_logits = self.model.action_module(self.inputs, a1_vec) + return TorchCategorical(a1_logits).logp(a1) + TorchCategorical(a2_logits).logp( + a2 + ) + + def sampled_action_logp(self): + return self._action_logp + + def entropy(self): + a1_dist = self._a1_distribution() + a2_dist = self._a2_distribution(a1_dist.sample()) + return a1_dist.entropy() + a2_dist.entropy() + + def kl(self, other): + a1_dist = self._a1_distribution() + a1_terms = a1_dist.kl(other._a1_distribution()) + + a1 = a1_dist.sample() + a2_terms = self._a2_distribution(a1).kl(other._a2_distribution(a1)) + return a1_terms + a2_terms + + def _a1_distribution(self): + BATCH = self.inputs.shape[0] + zeros = torch.zeros((BATCH, 1)).to(self.inputs.device) + a1_logits, _ = self.model.action_module(self.inputs, zeros) + a1_dist = TorchCategorical(a1_logits) + return a1_dist + + def _a2_distribution(self, a1): + a1_vec = torch.unsqueeze(a1.float(), 1) + _, a2_logits = self.model.action_module(self.inputs, a1_vec) + a2_dist = TorchCategorical(a2_logits) + return a2_dist + + @staticmethod + def required_model_output_shape(action_space, model_config): + return 16 # controls model output feature vector size diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/autoregressive_action_model.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/autoregressive_action_model.py new file mode 100644 index 0000000000000000000000000000000000000000..8b71e5ab9dc28bbb9bd5f39767350ab51639ea27 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/autoregressive_action_model.py @@ -0,0 +1,162 @@ +# @OldAPIStack +from gymnasium.spaces 
import Discrete, Tuple + +from ray.rllib.models.tf.misc import normc_initializer +from ray.rllib.models.tf.tf_modelv2 import TFModelV2 +from ray.rllib.models.torch.misc import normc_initializer as normc_init_torch +from ray.rllib.models.torch.misc import SlimFC +from ray.rllib.models.torch.torch_modelv2 import TorchModelV2 +from ray.rllib.utils.framework import try_import_tf, try_import_torch + +tf1, tf, tfv = try_import_tf() +torch, nn = try_import_torch() + + +class AutoregressiveActionModel(TFModelV2): + """Implements the `.action_model` branch required above.""" + + def __init__(self, obs_space, action_space, num_outputs, model_config, name): + super(AutoregressiveActionModel, self).__init__( + obs_space, action_space, num_outputs, model_config, name + ) + if action_space != Tuple([Discrete(2), Discrete(2)]): + raise ValueError("This model only supports the [2, 2] action space") + + # Inputs + obs_input = tf.keras.layers.Input(shape=obs_space.shape, name="obs_input") + a1_input = tf.keras.layers.Input(shape=(1,), name="a1_input") + ctx_input = tf.keras.layers.Input(shape=(num_outputs,), name="ctx_input") + + # Output of the model (normally 'logits', but for an autoregressive + # dist this is more like a context/feature layer encoding the obs) + context = tf.keras.layers.Dense( + num_outputs, + name="hidden", + activation=tf.nn.tanh, + kernel_initializer=normc_initializer(1.0), + )(obs_input) + + # V(s) + value_out = tf.keras.layers.Dense( + 1, + name="value_out", + activation=None, + kernel_initializer=normc_initializer(0.01), + )(context) + + # P(a1 | obs) + a1_logits = tf.keras.layers.Dense( + 2, + name="a1_logits", + activation=None, + kernel_initializer=normc_initializer(0.01), + )(ctx_input) + + # P(a2 | a1) + # --note: typically you'd want to implement P(a2 | a1, obs) as follows: + # a2_context = tf.keras.layers.Concatenate(axis=1)( + # [ctx_input, a1_input]) + a2_context = a1_input + a2_hidden = tf.keras.layers.Dense( + 16, + name="a2_hidden", + 
activation=tf.nn.tanh, + kernel_initializer=normc_initializer(1.0), + )(a2_context) + a2_logits = tf.keras.layers.Dense( + 2, + name="a2_logits", + activation=None, + kernel_initializer=normc_initializer(0.01), + )(a2_hidden) + + # Base layers + self.base_model = tf.keras.Model(obs_input, [context, value_out]) + self.base_model.summary() + + # Autoregressive action sampler + self.action_model = tf.keras.Model( + [ctx_input, a1_input], [a1_logits, a2_logits] + ) + self.action_model.summary() + + def forward(self, input_dict, state, seq_lens): + context, self._value_out = self.base_model(input_dict["obs"]) + return context, state + + def value_function(self): + return tf.reshape(self._value_out, [-1]) + + +class TorchAutoregressiveActionModel(TorchModelV2, nn.Module): + """PyTorch version of the AutoregressiveActionModel above.""" + + def __init__(self, obs_space, action_space, num_outputs, model_config, name): + TorchModelV2.__init__( + self, obs_space, action_space, num_outputs, model_config, name + ) + nn.Module.__init__(self) + + if action_space != Tuple([Discrete(2), Discrete(2)]): + raise ValueError("This model only supports the [2, 2] action space") + + # Output of the model (normally 'logits', but for an autoregressive + # dist this is more like a context/feature layer encoding the obs) + self.context_layer = SlimFC( + in_size=obs_space.shape[0], + out_size=num_outputs, + initializer=normc_init_torch(1.0), + activation_fn=nn.Tanh, + ) + + # V(s) + self.value_branch = SlimFC( + in_size=num_outputs, + out_size=1, + initializer=normc_init_torch(0.01), + activation_fn=None, + ) + + # P(a1 | obs) + self.a1_logits = SlimFC( + in_size=num_outputs, + out_size=2, + activation_fn=None, + initializer=normc_init_torch(0.01), + ) + + class _ActionModel(nn.Module): + def __init__(self): + nn.Module.__init__(self) + self.a2_hidden = SlimFC( + in_size=1, + out_size=16, + activation_fn=nn.Tanh, + initializer=normc_init_torch(1.0), + ) + self.a2_logits = SlimFC( + in_size=16, 
+ out_size=2, + activation_fn=None, + initializer=normc_init_torch(0.01), + ) + + def forward(self_, ctx_input, a1_input): + a1_logits = self.a1_logits(ctx_input) + a2_logits = self_.a2_logits(self_.a2_hidden(a1_input)) + return a1_logits, a2_logits + + # P(a2 | a1) + # --note: typically you'd want to implement P(a2 | a1, obs) as follows: + # a2_context = tf.keras.layers.Concatenate(axis=1)( + # [ctx_input, a1_input]) + self.action_module = _ActionModel() + + self._context = None + + def forward(self, input_dict, state, seq_lens): + self._context = self.context_layer(input_dict["obs"]) + return self._context, state + + def value_function(self): + return torch.reshape(self.value_branch(self._context), [-1]) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/centralized_critic_models.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/centralized_critic_models.py new file mode 100644 index 0000000000000000000000000000000000000000..5ccc4448e5428f466ea79d858a9e5cc835dc7959 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/centralized_critic_models.py @@ -0,0 +1,182 @@ +# @OldAPIStack +from gymnasium.spaces import Box + +from ray.rllib.models.modelv2 import ModelV2 +from ray.rllib.models.tf.tf_modelv2 import TFModelV2 +from ray.rllib.models.tf.fcnet import FullyConnectedNetwork +from ray.rllib.models.torch.misc import SlimFC +from ray.rllib.models.torch.torch_modelv2 import TorchModelV2 +from ray.rllib.models.torch.fcnet import FullyConnectedNetwork as TorchFC +from ray.rllib.utils.annotations import override +from ray.rllib.utils.framework import try_import_tf, try_import_torch + +tf1, tf, tfv = try_import_tf() +torch, nn = try_import_torch() + + +class CentralizedCriticModel(TFModelV2): + """Multi-agent model that implements a centralized value function.""" + + def __init__(self, obs_space, action_space, num_outputs, model_config, name): + 
super(CentralizedCriticModel, self).__init__( + obs_space, action_space, num_outputs, model_config, name + ) + # Base of the model + self.model = FullyConnectedNetwork( + obs_space, action_space, num_outputs, model_config, name + ) + + # Central VF maps (obs, opp_obs, opp_act) -> vf_pred + obs = tf.keras.layers.Input(shape=(6,), name="obs") + opp_obs = tf.keras.layers.Input(shape=(6,), name="opp_obs") + opp_act = tf.keras.layers.Input(shape=(2,), name="opp_act") + concat_obs = tf.keras.layers.Concatenate(axis=1)([obs, opp_obs, opp_act]) + central_vf_dense = tf.keras.layers.Dense( + 16, activation=tf.nn.tanh, name="c_vf_dense" + )(concat_obs) + central_vf_out = tf.keras.layers.Dense(1, activation=None, name="c_vf_out")( + central_vf_dense + ) + self.central_vf = tf.keras.Model( + inputs=[obs, opp_obs, opp_act], outputs=central_vf_out + ) + + @override(ModelV2) + def forward(self, input_dict, state, seq_lens): + return self.model.forward(input_dict, state, seq_lens) + + def central_value_function(self, obs, opponent_obs, opponent_actions): + return tf.reshape( + self.central_vf( + [obs, opponent_obs, tf.one_hot(tf.cast(opponent_actions, tf.int32), 2)] + ), + [-1], + ) + + @override(ModelV2) + def value_function(self): + return self.model.value_function() # not used + + +class YetAnotherCentralizedCriticModel(TFModelV2): + """Multi-agent model that implements a centralized value function. + + It assumes the observation is a dict with 'own_obs' and 'opponent_obs', the + former of which can be used for computing actions (i.e., decentralized + execution), and the latter for optimization (i.e., centralized learning). + + This model has two parts: + - An action model that looks at just 'own_obs' to compute actions + - A value model that also looks at the 'opponent_obs' / 'opponent_action' + to compute the value (it does this by using the 'obs_flat' tensor). 
+ """ + + def __init__(self, obs_space, action_space, num_outputs, model_config, name): + super(YetAnotherCentralizedCriticModel, self).__init__( + obs_space, action_space, num_outputs, model_config, name + ) + + self.action_model = FullyConnectedNetwork( + Box(low=0, high=1, shape=(6,)), # one-hot encoded Discrete(6) + action_space, + num_outputs, + model_config, + name + "_action", + ) + + self.value_model = FullyConnectedNetwork( + obs_space, action_space, 1, model_config, name + "_vf" + ) + + def forward(self, input_dict, state, seq_lens): + self._value_out, _ = self.value_model( + {"obs": input_dict["obs_flat"]}, state, seq_lens + ) + return self.action_model({"obs": input_dict["obs"]["own_obs"]}, state, seq_lens) + + def value_function(self): + return tf.reshape(self._value_out, [-1]) + + +class TorchCentralizedCriticModel(TorchModelV2, nn.Module): + """Multi-agent model that implements a centralized VF.""" + + def __init__(self, obs_space, action_space, num_outputs, model_config, name): + TorchModelV2.__init__( + self, obs_space, action_space, num_outputs, model_config, name + ) + nn.Module.__init__(self) + + # Base of the model + self.model = TorchFC(obs_space, action_space, num_outputs, model_config, name) + + # Central VF maps (obs, opp_obs, opp_act) -> vf_pred + input_size = 6 + 6 + 2 # obs + opp_obs + opp_act + self.central_vf = nn.Sequential( + SlimFC(input_size, 16, activation_fn=nn.Tanh), + SlimFC(16, 1), + ) + + @override(ModelV2) + def forward(self, input_dict, state, seq_lens): + model_out, _ = self.model(input_dict, state, seq_lens) + return model_out, [] + + def central_value_function(self, obs, opponent_obs, opponent_actions): + input_ = torch.cat( + [ + obs, + opponent_obs, + torch.nn.functional.one_hot(opponent_actions.long(), 2).float(), + ], + 1, + ) + return torch.reshape(self.central_vf(input_), [-1]) + + @override(ModelV2) + def value_function(self): + return self.model.value_function() # not used + + +class 
class YetAnotherTorchCentralizedCriticModel(TorchModelV2, nn.Module):
    """Multi-agent model that implements a centralized value function.

    It assumes the observation is a dict with 'own_obs' and 'opponent_obs', the
    former of which can be used for computing actions (i.e., decentralized
    execution), and the latter for optimization (i.e., centralized learning).

    This model has two parts:
    - An action model that looks at just 'own_obs' to compute actions
    - A value model that also looks at the 'opponent_obs' / 'opponent_action'
      to compute the value (it does this by using the 'obs_flat' tensor).
    """

    def __init__(self, obs_space, action_space, num_outputs, model_config, name):
        TorchModelV2.__init__(
            self, obs_space, action_space, num_outputs, model_config, name
        )
        nn.Module.__init__(self)

        # Policy head: sees only the agent's own (one-hot) observation.
        self.action_model = TorchFC(
            Box(low=0, high=1, shape=(6,)),  # one-hot encoded Discrete(6)
            action_space,
            num_outputs,
            model_config,
            name + "_action",
        )

        # Value head: sees the full flattened obs (own + opponent info).
        self.value_model = TorchFC(
            obs_space, action_space, 1, model_config, name + "_vf"
        )
        self._model_in = None

    def forward(self, input_dict, state, seq_lens):
        # Remember the inputs so a later `value_function()` call can replay
        # them through the value head.
        self._model_in = (input_dict["obs_flat"], state, seq_lens)
        return self.action_model({"obs": input_dict["obs"]["own_obs"]}, state, seq_lens)

    def value_function(self):
        obs_flat, state, seq_lens = self._model_in
        value_out, _ = self.value_model({"obs": obs_flat}, state, seq_lens)
        return torch.reshape(value_out, [-1])
class CustomLossModel(TFModelV2):
    """Custom model that adds an imitation loss on top of the policy loss."""

    def __init__(self, obs_space, action_space, num_outputs, model_config, name):
        super().__init__(obs_space, action_space, num_outputs, model_config, name)

        self.fcnet = FullyConnectedNetwork(
            self.obs_space, self.action_space, num_outputs, model_config, name="fcnet"
        )

    @override(ModelV2)
    def forward(self, input_dict, state, seq_lens):
        # Plain delegation to the wrapped FCNet.
        return self.fcnet(input_dict, state, seq_lens)

    @override(ModelV2)
    def value_function(self):
        # Plain delegation to the wrapped FCNet.
        return self.fcnet.value_function()

    @override(ModelV2)
    def custom_loss(self, policy_loss, loss_inputs):
        # Create a new input reader per worker.
        demo_reader = JsonReader(self.model_config["custom_model_config"]["input_files"])
        demo_ops = demo_reader.tf_input_ops()

        # Build a graph copy with weight sharing to score the demo batch.
        demo_obs = restore_original_dimensions(
            tf.cast(demo_ops["obs"], tf.float32), self.obs_space
        )
        demo_logits, _ = self.forward({"obs": demo_obs}, [], None)

        # Imitation loss = mean negative log-likelihood of the demo actions.
        expert_dist = Categorical(demo_logits, self.model_config)
        self.policy_loss = policy_loss
        self.imitation_loss = tf.reduce_mean(-expert_dist.logp(demo_ops["actions"]))
        # Mix the imitation term (weighted 10x) into the policy loss.
        return policy_loss + 10 * self.imitation_loss

    def metrics(self):
        return {
            "policy_loss": self.policy_loss,
            "imitation_loss": self.imitation_loss,
        }
class TorchCustomLossModel(TorchModelV2, nn.Module):
    """PyTorch version of the CustomLossModel above."""

    def __init__(
        self, obs_space, action_space, num_outputs, model_config, name, input_files
    ):
        super().__init__(obs_space, action_space, num_outputs, model_config, name)
        nn.Module.__init__(self)

        self.input_files = input_files
        # Create a new input reader per worker.
        self.reader = JsonReader(self.input_files)
        self.fcnet = TorchFC(
            self.obs_space, self.action_space, num_outputs, model_config, name="fcnet"
        )

    @override(ModelV2)
    def forward(self, input_dict, state, seq_lens):
        # Plain delegation to the wrapped FCNet.
        return self.fcnet(input_dict, state, seq_lens)

    @override(ModelV2)
    def value_function(self):
        # Plain delegation to the wrapped FCNet.
        return self.fcnet.value_function()

    @override(ModelV2)
    def custom_loss(self, policy_loss, loss_inputs):
        """Calculates a custom loss on top of the given policy_loss(es).

        Args:
            policy_loss (List[TensorType]): The list of already calculated
                policy losses (as many as there are optimizers).
            loss_inputs: Struct of np.ndarrays holding the
                entire train batch.

        Returns:
            List[TensorType]: The altered list of policy losses. In case the
                custom loss should have its own optimizer, make sure the
                returned list is one larger than the incoming policy_loss list.
                In case you simply want to mix in the custom loss into the
                already calculated policy losses, return a list of altered
                policy losses (as done in this example below).
        """
        # Pull the next demonstration batch from our input files.
        demo_batch = self.reader.next()
        device = policy_loss[0].device

        # Score the demo observations with a weight-sharing forward pass.
        demo_obs = restore_original_dimensions(
            torch.from_numpy(demo_batch["obs"]).float().to(device),
            self.obs_space,
            tensorlib="torch",
        )
        demo_logits, _ = self.forward({"obs": demo_obs}, [], None)

        # Imitation loss = mean negative log-likelihood of the demo actions.
        expert_dist = TorchCategorical(demo_logits, self.model_config)
        demo_actions = torch.from_numpy(demo_batch["actions"]).to(device)
        imitation_loss = torch.mean(-expert_dist.logp(demo_actions))

        # Stash plain-float metrics for `metrics()`.
        self.imitation_loss_metric = imitation_loss.item()
        self.policy_loss_metric = np.mean([loss.item() for loss in policy_loss])

        # Mix the (10x weighted) imitation term into each policy loss.
        # Alternatively (if custom loss has its own optimizer):
        # return policy_loss + [10 * self.imitation_loss]
        return [single_loss + 10 * imitation_loss for single_loss in policy_loss]

    def metrics(self):
        return {
            "policy_loss": self.policy_loss_metric,
            "imitation_loss": self.imitation_loss_metric,
        }
class TorchFastModel(TorchModelV2, nn.Module):
    """Torch version of FastModel (tf): learns only a single bias weight."""

    def __init__(self, obs_space, action_space, num_outputs, model_config, name):
        TorchModelV2.__init__(
            self, obs_space, action_space, num_outputs, model_config, name
        )
        nn.Module.__init__(self)

        # The single trainable scalar of this model.
        self.bias = nn.Parameter(
            torch.tensor([0.0], dtype=torch.float32, requires_grad=True)
        )

        # Only needed to give some params to the optimizer (even though,
        # they are never used anywhere).
        self.dummy_layer = SlimFC(1, 1)
        self._output = None

    @override(ModelV2)
    def forward(self, input_dict, state, seq_lens):
        # Broadcast the bias over a [batch, num_outputs]-shaped zero tensor.
        batch_size = input_dict["obs"].shape[0]
        zeros = torch.zeros(size=(batch_size, self.num_outputs)).to(self.bias.device)
        self._output = self.bias + zeros
        return self._output, []

    @override(ModelV2)
    def value_function(self):
        assert self._output is not None, "must call forward first!"
        # Fake value: mean over the output dimension.
        return torch.reshape(torch.mean(self._output, -1), [-1])
class MobileNetV2Encoder(TorchModel, Encoder):
    """A MobileNet v2 encoder for RLlib."""

    def __init__(self, config):
        super().__init__(config)
        # Pre-trained MobileNet v2 fetched via torch.hub.
        self.net = torch.hub.load(
            "pytorch/vision:v0.6.0", "mobilenet_v2", pretrained=True
        )
        if config.freeze:
            # We don't want to train this encoder, so freeze its parameters!
            for param in self.net.parameters():
                param.requires_grad = False

    def _forward(self, input_dict, **kwargs):
        # Encode the image batch into MobileNet's flat 1000-dim output.
        return {ENCODER_OUT: self.net(input_dict["obs"])}
class MobileV2PlusRNNModel(RecurrentNetwork):
    """A conv. + recurrent keras net example using a pre-trained MobileNet."""

    def __init__(
        self, obs_space, action_space, num_outputs, model_config, name, cnn_shape
    ):

        super(MobileV2PlusRNNModel, self).__init__(
            obs_space, action_space, num_outputs, model_config, name
        )

        # LSTM cell size (h and c state vectors each have this length).
        self.cell_size = 16
        # Flattened size of one image frame: H * W * C (order per cnn_shape).
        visual_size = cnn_shape[0] * cnn_shape[1] * cnn_shape[2]

        # Symbolic inputs for the recurrent state and sequence lengths.
        state_in_h = tf.keras.layers.Input(shape=(self.cell_size,), name="h")
        state_in_c = tf.keras.layers.Input(shape=(self.cell_size,), name="c")
        seq_in = tf.keras.layers.Input(shape=(), name="seq_in", dtype=tf.int32)

        # Time-major-less input: [batch, time, flattened-image].
        inputs = tf.keras.layers.Input(shape=(None, visual_size), name="visual_inputs")

        # Fold batch and time dims together so the CNN sees single frames.
        input_visual = inputs
        input_visual = tf.reshape(
            input_visual, [-1, cnn_shape[0], cnn_shape[1], cnn_shape[2]]
        )
        cnn_input = tf.keras.layers.Input(shape=cnn_shape, name="cnn_input")

        # Untrained MobileNetV2 (weights=None) used as the vision stem.
        cnn_model = tf.keras.applications.mobilenet_v2.MobileNetV2(
            alpha=1.0,
            include_top=True,
            weights=None,
            input_tensor=cnn_input,
            pooling=None,
        )
        vision_out = cnn_model(input_visual)
        # Unfold back to [batch, time, cnn-features] for the LSTM.
        vision_out = tf.reshape(
            vision_out, [-1, tf.shape(inputs)[1], vision_out.shape.as_list()[-1]]
        )

        # Recurrent core; mask out padded timesteps via seq_in.
        lstm_out, state_h, state_c = tf.keras.layers.LSTM(
            self.cell_size, return_sequences=True, return_state=True, name="lstm"
        )(
            inputs=vision_out,
            mask=tf.sequence_mask(seq_in),
            initial_state=[state_in_h, state_in_c],
        )

        # Postprocess LSTM output with another hidden layer and compute values.
        logits = tf.keras.layers.Dense(
            self.num_outputs, activation=tf.keras.activations.linear, name="logits"
        )(lstm_out)
        values = tf.keras.layers.Dense(1, activation=None, name="values")(lstm_out)

        # Create the RNN model
        self.rnn_model = tf.keras.Model(
            inputs=[inputs, seq_in, state_in_h, state_in_c],
            outputs=[logits, values, state_h, state_c],
        )
        self.rnn_model.summary()

    @override(RecurrentNetwork)
    def forward_rnn(self, inputs, state, seq_lens):
        # state is [h, c]; the keras model also emits the value branch output.
        model_out, self._value_out, h, c = self.rnn_model([inputs, seq_lens] + state)
        return model_out, [h, c]

    @override(ModelV2)
    def get_initial_state(self):
        # Zero-initialized h and c vectors.
        return [
            np.zeros(self.cell_size, np.float32),
            np.zeros(self.cell_size, np.float32),
        ]

    @override(ModelV2)
    def value_function(self):
        # Flatten the last computed value branch output to [batch * time].
        return tf.reshape(self._value_out, [-1])
+ vision_in = torch.reshape(inputs, [-1] + self.cnn_shape) + vision_out = self.cnn_model(vision_in) + # Flatten. + vision_out_time_ranked = torch.reshape( + vision_out, [inputs.shape[0], inputs.shape[1], vision_out.shape[-1]] + ) + if len(state[0].shape) == 2: + state[0] = state[0].unsqueeze(0) + state[1] = state[1].unsqueeze(0) + # Forward through LSTM. + self._features, [h, c] = self.lstm(vision_out_time_ranked, state) + # Forward LSTM out through logits layer and value layer. + logits = self.logits(self._features) + return logits, [h.squeeze(0), c.squeeze(0)] + + @override(ModelV2) + def get_initial_state(self): + # Place hidden states on same device as model. + h = [ + list(self.cnn_model.modules())[-1] + .weight.new(1, self.lstm_state_size) + .zero_() + .squeeze(0), + list(self.cnn_model.modules())[-1] + .weight.new(1, self.lstm_state_size) + .zero_() + .squeeze(0), + ] + return h + + @override(ModelV2) + def value_function(self): + assert self._features is not None, "must call forward() first" + return torch.reshape(self.value_branch(self._features), [-1]) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/neural_computer.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/neural_computer.py new file mode 100644 index 0000000000000000000000000000000000000000..d863f71e62d7426c360cffc73042db3c393a7f78 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/neural_computer.py @@ -0,0 +1,247 @@ +# @OldAPIStack +from collections import OrderedDict +import gymnasium as gym +from typing import Union, Dict, List, Tuple + +from ray.rllib.models.torch.torch_modelv2 import TorchModelV2 +from ray.rllib.models.torch.misc import SlimFC +from ray.rllib.utils.framework import try_import_torch +from ray.rllib.utils.typing import ModelConfigDict, TensorType + +try: + from dnc import DNC +except ModuleNotFoundError: + print("dnc module not found. 
class DNCMemory(TorchModelV2, nn.Module):
    """Differentiable Neural Computer wrapper around ixaxaar's DNC implementation,
    see https://github.com/ixaxaar/pytorch-dnc"""

    DEFAULT_CONFIG = {
        "dnc_model": DNC,
        # Number of controller hidden layers
        "num_hidden_layers": 1,
        # Number of weights per controller hidden layer
        "hidden_size": 64,
        # Number of LSTM units
        "num_layers": 1,
        # Number of read heads, i.e. how many addrs are read at once
        "read_heads": 4,
        # Number of memory cells in the controller
        "nr_cells": 32,
        # Size of each cell
        "cell_size": 16,
        # LSTM activation function
        "nonlinearity": "tanh",
        # Observation goes through this torch.nn.Module before
        # feeding to the DNC
        "preprocessor": torch.nn.Sequential(torch.nn.Linear(64, 64), torch.nn.Tanh()),
        # Input size to the preprocessor
        "preprocessor_input_size": 64,
        # The output size of the preprocessor
        # and the input size of the dnc
        "preprocessor_output_size": 64,
    }

    # Order of the memory tensors in the flattened rllib state list
    # (state[3:] in `unpack_state`); must match `pack_state`.
    MEMORY_KEYS = [
        "memory",
        "link_matrix",
        "precedence",
        "read_weights",
        "write_weights",
        "usage_vector",
    ]

    def __init__(
        self,
        obs_space: gym.spaces.Space,
        action_space: gym.spaces.Space,
        num_outputs: int,
        model_config: ModelConfigDict,
        name: str,
        **custom_model_kwargs,
    ):
        nn.Module.__init__(self)
        super(DNCMemory, self).__init__(
            obs_space, action_space, num_outputs, model_config, name
        )
        self.num_outputs = num_outputs
        self.obs_dim = gym.spaces.utils.flatdim(obs_space)
        self.act_dim = gym.spaces.utils.flatdim(action_space)

        # Merge user kwargs over the defaults.
        self.cfg = dict(self.DEFAULT_CONFIG, **custom_model_kwargs)
        assert (
            self.cfg["num_layers"] == 1
        ), "num_layers != 1 has not been implemented yet"
        # Last computed value-branch output (set in `forward`).
        self.cur_val = None

        # Project the flat obs to the preprocessor's expected input size.
        self.preprocessor = torch.nn.Sequential(
            torch.nn.Linear(self.obs_dim, self.cfg["preprocessor_input_size"]),
            self.cfg["preprocessor"],
        )

        self.logit_branch = SlimFC(
            in_size=self.cfg["hidden_size"],
            out_size=self.num_outputs,
            activation_fn=None,
            initializer=torch.nn.init.xavier_uniform_,
        )

        self.value_branch = SlimFC(
            in_size=self.cfg["hidden_size"],
            out_size=1,
            activation_fn=None,
            initializer=torch.nn.init.xavier_uniform_,
        )

        # Built lazily in `forward` once the input device is known.
        self.dnc: Union[None, DNC] = None

    def get_initial_state(self) -> List[TensorType]:
        # Two controller LSTM states (h, c), one flat read-vector tensor,
        # and six memory tensors -> 9 state tensors total.
        ctrl_hidden = [
            torch.zeros(self.cfg["num_hidden_layers"], self.cfg["hidden_size"]),
            torch.zeros(self.cfg["num_hidden_layers"], self.cfg["hidden_size"]),
        ]
        m = self.cfg["nr_cells"]
        r = self.cfg["read_heads"]
        w = self.cfg["cell_size"]
        memory = [
            torch.zeros(m, w),  # memory
            torch.zeros(1, m, m),  # link_matrix
            torch.zeros(1, m),  # precedence
            torch.zeros(r, m),  # read_weights
            torch.zeros(1, m),  # write_weights
            torch.zeros(m),  # usage_vector
        ]

        read_vecs = torch.zeros(w * r)

        state = [*ctrl_hidden, read_vecs, *memory]
        assert len(state) == 9
        return state

    def value_function(self) -> TensorType:
        assert self.cur_val is not None, "must call forward() first"
        return self.cur_val

    def unpack_state(
        self,
        state: List[TensorType],
    ) -> Tuple[List[Tuple[TensorType, TensorType]], Dict[str, TensorType], TensorType]:
        """Given a list of tensors, reformat for self.dnc input"""
        assert len(state) == 9, "Failed to verify unpacked state"
        # state[0:2] = controller (h, c); rllib batches on dim 0, the DNC
        # expects the layer dim first -> permute to (layers, B, hidden).
        ctrl_hidden: List[Tuple[TensorType, TensorType]] = [
            (
                state[0].permute(1, 0, 2).contiguous(),
                state[1].permute(1, 0, 2).contiguous(),
            )
        ]
        # state[2] = read vectors; state[3:] = memory tensors (MEMORY_KEYS order).
        read_vecs: TensorType = state[2]
        memory: List[TensorType] = state[3:]
        memory_dict: OrderedDict[str, TensorType] = OrderedDict(
            zip(self.MEMORY_KEYS, memory)
        )

        return ctrl_hidden, memory_dict, read_vecs

    def pack_state(
        self,
        ctrl_hidden: List[Tuple[TensorType, TensorType]],
        memory_dict: Dict[str, TensorType],
        read_vecs: TensorType,
    ) -> List[TensorType]:
        """Given the dnc output, pack it into a list of tensors
        for rllib state. Order is ctrl_hidden, read_vecs, memory_dict"""
        state = []
        # Inverse of `unpack_state`: batch dim back to the front.
        ctrl_hidden = [
            ctrl_hidden[0][0].permute(1, 0, 2),
            ctrl_hidden[0][1].permute(1, 0, 2),
        ]
        state += ctrl_hidden
        assert len(state) == 2, "Failed to verify packed state"
        state.append(read_vecs)
        assert len(state) == 3, "Failed to verify packed state"
        state += memory_dict.values()
        assert len(state) == 9, "Failed to verify packed state"
        return state

    def validate_unpack(self, dnc_output, unpacked_state):
        """Ensure the unpacked state shapes match the DNC output"""
        s_ctrl_hidden, s_memory_dict, s_read_vecs = unpacked_state
        ctrl_hidden, memory_dict, read_vecs = dnc_output

        for i in range(len(ctrl_hidden)):
            for j in range(len(ctrl_hidden[i])):
                assert s_ctrl_hidden[i][j].shape == ctrl_hidden[i][j].shape, (
                    "Controller state mismatch: got "
                    f"{s_ctrl_hidden[i][j].shape} should be "
                    f"{ctrl_hidden[i][j].shape}"
                )

        for k in memory_dict:
            assert s_memory_dict[k].shape == memory_dict[k].shape, (
                "Memory state mismatch at key "
                f"{k}: got {s_memory_dict[k].shape} should be "
                f"{memory_dict[k].shape}"
            )

        assert s_read_vecs.shape == read_vecs.shape, (
            "Read state mismatch: got "
            f"{s_read_vecs.shape} should be "
            f"{read_vecs.shape}"
        )

    def build_dnc(self, device_idx: Union[int, None]) -> None:
        # Instantiate the wrapped DNC on the given GPU (or CPU if None/-1).
        self.dnc = self.cfg["dnc_model"](
            input_size=self.cfg["preprocessor_output_size"],
            hidden_size=self.cfg["hidden_size"],
            num_layers=self.cfg["num_layers"],
            num_hidden_layers=self.cfg["num_hidden_layers"],
            read_heads=self.cfg["read_heads"],
            cell_size=self.cfg["cell_size"],
            nr_cells=self.cfg["nr_cells"],
            nonlinearity=self.cfg["nonlinearity"],
            gpu_id=device_idx,
        )

    def forward(
        self,
        input_dict: Dict[str, TensorType],
        state: List[TensorType],
        seq_lens: TensorType,
    ) -> Tuple[TensorType, List[TensorType]]:

        flat = input_dict["obs_flat"]
        # Batch and Time
        # Forward expects outputs as [B, T, logits]
        B = len(seq_lens)
        # NOTE(review): assumes flat's leading dim is exactly B * T — i.e.
        # all sequences padded to equal length; confirm against the caller.
        T = flat.shape[0] // B

        # Deconstruct batch into batch and time dimensions: [B, T, feats]
        flat = torch.reshape(flat, [-1, T] + list(flat.shape[1:]))

        # First run: lazily build the DNC on the input's device; the DNC
        # treats an all-None hidden tuple as "initialize fresh state".
        if self.dnc is None:
            gpu_id = flat.device.index if flat.device.index is not None else -1
            self.build_dnc(gpu_id)
            hidden = (None, None, None)

        else:
            hidden = self.unpack_state(state)  # type: ignore

        # Run thru preprocessor before DNC
        z = self.preprocessor(flat.reshape(B * T, self.obs_dim))
        z = z.reshape(B, T, self.cfg["preprocessor_output_size"])
        output, hidden = self.dnc(z, hidden)
        packed_state = self.pack_state(*hidden)

        # Compute action/value from output
        logits = self.logit_branch(output.view(B * T, -1))
        values = self.value_branch(output.view(B * T, -1))

        self.cur_val = values.squeeze(1)

        return logits, packed_state
class ParametricActionsModel(DistributionalQTFModel):
    """Parametric action model that handles the dot product and masking.

    This assumes the outputs are logits for a single Categorical action dist.
    Getting this to work with a more complex output (e.g., if the action space
    is a tuple of several distributions) is also possible but left as an
    exercise to the reader.
    """

    def __init__(
        self,
        obs_space,
        action_space,
        num_outputs,
        model_config,
        name,
        true_obs_shape=(4,),
        action_embed_size=2,
        **kw
    ):
        super(ParametricActionsModel, self).__init__(
            obs_space, action_space, num_outputs, model_config, name, **kw
        )
        # Embeds the true ("cart") observation into the action-embedding space.
        self.action_embed_model = FullyConnectedNetwork(
            Box(-1, 1, shape=true_obs_shape),
            action_space,
            action_embed_size,
            model_config,
            name + "_action_embed",
        )

    def forward(self, input_dict, state, seq_lens):
        # Pull the available-action embeddings and the validity mask out of
        # the dict observation.
        avail_actions = input_dict["obs"]["avail_actions"]
        action_mask = input_dict["obs"]["action_mask"]

        # Predicted action embedding from the true observation.
        pred_embed, _ = self.action_embed_model({"obs": input_dict["obs"]["cart"]})

        # [BATCH, EMBED] -> [BATCH, 1, EMBED] so it broadcasts against the
        # [BATCH, MAX_ACTIONS, EMBED] avail-actions tensor.
        intent = tf.expand_dims(pred_embed, 1)

        # Batch dot product => logits of shape [BATCH, MAX_ACTIONS].
        scores = tf.reduce_sum(avail_actions * intent, axis=2)

        # Mask out invalid actions (use tf.float32.min for stability).
        neg_inf_mask = tf.maximum(tf.math.log(action_mask), tf.float32.min)
        return scores + neg_inf_mask, state

    def value_function(self):
        return self.action_embed_model.value_function()
class TorchParametricActionsModel(DQNTorchModel):
    """PyTorch version of above ParametricActionsModel."""

    def __init__(
        self,
        obs_space,
        action_space,
        num_outputs,
        model_config,
        name,
        true_obs_shape=(4,),
        action_embed_size=2,
        **kw
    ):
        DQNTorchModel.__init__(
            self, obs_space, action_space, num_outputs, model_config, name, **kw
        )

        # Embeds the true ("cart") observation into the action-embedding space.
        self.action_embed_model = TorchFC(
            Box(-1, 1, shape=true_obs_shape),
            action_space,
            action_embed_size,
            model_config,
            name + "_action_embed",
        )

    def forward(self, input_dict, state, seq_lens):
        # Pull the available-action embeddings and the validity mask out of
        # the dict observation.
        avail_actions = input_dict["obs"]["avail_actions"]
        action_mask = input_dict["obs"]["action_mask"]

        # Predicted action embedding from the true observation.
        pred_embed, _ = self.action_embed_model({"obs": input_dict["obs"]["cart"]})

        # [BATCH, EMBED] -> [BATCH, 1, EMBED] so it broadcasts against the
        # [BATCH, MAX_ACTIONS, EMBED] avail-actions tensor.
        intent = torch.unsqueeze(pred_embed, 1)

        # Batch dot product => logits of shape [BATCH, MAX_ACTIONS].
        scores = torch.sum(avail_actions * intent, dim=2)

        # Mask out invalid actions (use -inf to tag invalid).
        # These are then recognized by the EpsilonGreedy exploration component
        # as invalid actions that are not to be chosen.
        neg_inf_mask = torch.clamp(torch.log(action_mask), FLOAT_MIN, FLOAT_MAX)

        return scores + neg_inf_mask, state

    def value_function(self):
        return self.action_embed_model.value_function()
class ParametricActionsModelThatLearnsEmbeddings(DistributionalQTFModel):
    """Same as the above ParametricActionsModel.

    However, this version also learns the action embeddings.
    """

    def __init__(
        self,
        obs_space,
        action_space,
        num_outputs,
        model_config,
        name,
        true_obs_shape=(4,),
        action_embed_size=2,
        **kw
    ):
        super(ParametricActionsModelThatLearnsEmbeddings, self).__init__(
            obs_space, action_space, num_outputs, model_config, name, **kw
        )

        # Action ids 1..num_outputs; id 0 is reserved for "invalid" below.
        action_ids_shifted = tf.constant(
            list(range(1, num_outputs + 1)), dtype=tf.float32
        )

        # Symbolic inputs for the keras sub-model built below.
        obs_cart = tf.keras.layers.Input(shape=true_obs_shape, name="obs_cart")
        valid_avail_actions_mask = tf.keras.layers.Input(
            shape=(num_outputs,), name="valid_avail_actions_mask"
        )

        self.pred_action_embed_model = FullyConnectedNetwork(
            Box(-1, 1, shape=true_obs_shape),
            action_space,
            action_embed_size,
            model_config,
            name + "_pred_action_embed",
        )

        # Compute the predicted action embedding
        pred_action_embed, _ = self.pred_action_embed_model({"obs": obs_cart})
        _value_out = self.pred_action_embed_model.value_function()

        # Expand the model output to [BATCH, 1, EMBED_SIZE]. Note that the
        # avail actions tensor is of shape [BATCH, MAX_ACTIONS, EMBED_SIZE].
        intent_vector = tf.expand_dims(pred_action_embed, 1)

        # Masked action ids: invalid slots become 0 (the dummy embedding id).
        valid_avail_actions = action_ids_shifted * valid_avail_actions_mask
        # Embedding for valid available actions which will be learned.
        # Embedding vector for 0 is an invalid embedding (a "dummy embedding").
        valid_avail_actions_embed = tf.keras.layers.Embedding(
            input_dim=num_outputs + 1,
            output_dim=action_embed_size,
            name="action_embed_matrix",
        )(valid_avail_actions)

        # Batch dot product => shape of logits is [BATCH, MAX_ACTIONS].
        action_logits = tf.reduce_sum(valid_avail_actions_embed * intent_vector, axis=2)

        # Mask out invalid actions (use tf.float32.min for stability)
        inf_mask = tf.maximum(tf.math.log(valid_avail_actions_mask), tf.float32.min)

        action_logits = action_logits + inf_mask

        # Wrap the whole computation in one callable keras model.
        self.param_actions_model = tf.keras.Model(
            inputs=[obs_cart, valid_avail_actions_mask],
            outputs=[action_logits, _value_out],
        )
        self.param_actions_model.summary()

    def forward(self, input_dict, state, seq_lens):
        # Extract the available actions mask tensor from the observation.
        valid_avail_actions_mask = input_dict["obs"]["valid_avail_actions_mask"]

        action_logits, self._value_out = self.param_actions_model(
            [input_dict["obs"]["cart"], valid_avail_actions_mask]
        )

        return action_logits, state

    def value_function(self):
        return self._value_out
class TF2SharedWeightsModel(TFModelV2):
    """Example of weight sharing between two different TFModelV2s.

    NOTE: This will only work for tf2.x. When running with config.framework=tf,
    use SharedWeightsModel1 and SharedWeightsModel2 below, instead!

    The shared (single) layer is simply defined outside of the two Models,
    then used by both Models in their forward pass.
    """

    def __init__(
        self, observation_space, action_space, num_outputs, model_config, name
    ):
        super().__init__(
            observation_space, action_space, num_outputs, model_config, name
        )

        global TF2_GLOBAL_SHARED_LAYER
        # Lazily create the one layer that all instances share.
        if TF2_GLOBAL_SHARED_LAYER is None:
            TF2_GLOBAL_SHARED_LAYER = tf.keras.layers.Dense(
                units=64, activation=tf.nn.relu, name="fc1"
            )

        inputs = tf.keras.layers.Input(observation_space.shape)
        shared_out = TF2_GLOBAL_SHARED_LAYER(inputs)
        logits = tf.keras.layers.Dense(
            units=num_outputs, activation=None, name="fc_out"
        )(shared_out)
        value = tf.keras.layers.Dense(units=1, activation=None, name="value_out")(
            shared_out
        )
        self.base_model = tf.keras.models.Model(inputs, [logits, value])

    @override(ModelV2)
    def forward(self, input_dict, state, seq_lens):
        out, self._value_out = self.base_model(input_dict["obs"])
        return out, []

    @override(ModelV2)
    def value_function(self):
        return tf.reshape(self._value_out, [-1])
+ """ + + def __init__( + self, observation_space, action_space, num_outputs, model_config, name + ): + super().__init__( + observation_space, action_space, num_outputs, model_config, name + ) + + inputs = tf.keras.layers.Input(observation_space.shape) + with tf1.variable_scope( + tf1.VariableScope(tf1.AUTO_REUSE, "shared"), + reuse=tf1.AUTO_REUSE, + auxiliary_name_scope=False, + ): + last_layer = tf.keras.layers.Dense( + units=64, activation=tf.nn.relu, name="fc1" + )(inputs) + output = tf.keras.layers.Dense( + units=num_outputs, activation=None, name="fc_out" + )(last_layer) + vf = tf.keras.layers.Dense(units=1, activation=None, name="value_out")( + last_layer + ) + self.base_model = tf.keras.models.Model(inputs, [output, vf]) + + @override(ModelV2) + def forward(self, input_dict, state, seq_lens): + out, self._value_out = self.base_model(input_dict["obs"]) + return out, [] + + @override(ModelV2) + def value_function(self): + return tf.reshape(self._value_out, [-1]) + + +class SharedWeightsModel2(TFModelV2): + """The "other" TFModelV2 using the same shared space as the one above.""" + + def __init__( + self, observation_space, action_space, num_outputs, model_config, name + ): + super().__init__( + observation_space, action_space, num_outputs, model_config, name + ) + + inputs = tf.keras.layers.Input(observation_space.shape) + + # Weights shared with SharedWeightsModel1. 
+ with tf1.variable_scope( + tf1.VariableScope(tf1.AUTO_REUSE, "shared"), + reuse=tf1.AUTO_REUSE, + auxiliary_name_scope=False, + ): + last_layer = tf.keras.layers.Dense( + units=64, activation=tf.nn.relu, name="fc1" + )(inputs) + output = tf.keras.layers.Dense( + units=num_outputs, activation=None, name="fc_out" + )(last_layer) + vf = tf.keras.layers.Dense(units=1, activation=None, name="value_out")( + last_layer + ) + self.base_model = tf.keras.models.Model(inputs, [output, vf]) + + @override(ModelV2) + def forward(self, input_dict, state, seq_lens): + out, self._value_out = self.base_model(input_dict["obs"]) + return out, [] + + @override(ModelV2) + def value_function(self): + return tf.reshape(self._value_out, [-1]) + + +TORCH_GLOBAL_SHARED_LAYER = None +if torch: + # The global, shared layer to be used by both models. + TORCH_GLOBAL_SHARED_LAYER = SlimFC( + 64, + 64, + activation_fn=nn.ReLU, + initializer=torch.nn.init.xavier_uniform_, + ) + + +class TorchSharedWeightsModel(TorchModelV2, nn.Module): + """Example of weight sharing between two different TorchModelV2s. + + The shared (single) layer is simply defined outside of the two Models, + then used by both Models in their forward pass. + """ + + def __init__( + self, observation_space, action_space, num_outputs, model_config, name + ): + TorchModelV2.__init__( + self, observation_space, action_space, num_outputs, model_config, name + ) + nn.Module.__init__(self) + + # Non-shared initial layer. + self.first_layer = SlimFC( + int(np.prod(observation_space.shape)), + 64, + activation_fn=nn.ReLU, + initializer=torch.nn.init.xavier_uniform_, + ) + + # Non-shared final layer. 
+ self.last_layer = SlimFC( + 64, + self.num_outputs, + activation_fn=None, + initializer=torch.nn.init.xavier_uniform_, + ) + self.vf = SlimFC( + 64, + 1, + activation_fn=None, + initializer=torch.nn.init.xavier_uniform_, + ) + self._global_shared_layer = TORCH_GLOBAL_SHARED_LAYER + self._output = None + + @override(ModelV2) + def forward(self, input_dict, state, seq_lens): + out = self.first_layer(input_dict["obs"]) + self._output = self._global_shared_layer(out) + model_out = self.last_layer(self._output) + return model_out, [] + + @override(ModelV2) + def value_function(self): + assert self._output is not None, "must call forward first!" + return torch.reshape(self.vf(self._output), [-1]) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/simple_rpg_model.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/simple_rpg_model.py new file mode 100644 index 0000000000000000000000000000000000000000..b37d915df8a18101435ea9acd59b14f71e39b74d --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/simple_rpg_model.py @@ -0,0 +1,65 @@ +# @OldAPIStack +from ray.rllib.models.tf.tf_modelv2 import TFModelV2 +from ray.rllib.models.tf.fcnet import FullyConnectedNetwork as TFFCNet +from ray.rllib.models.torch.torch_modelv2 import TorchModelV2 +from ray.rllib.models.torch.fcnet import FullyConnectedNetwork as TorchFCNet +from ray.rllib.utils.framework import try_import_tf, try_import_torch + +tf1, tf, tfv = try_import_tf() +torch, nn = try_import_torch() + + +class CustomTorchRPGModel(TorchModelV2, nn.Module): + """Example of interpreting repeated observations.""" + + def __init__(self, obs_space, action_space, num_outputs, model_config, name): + super().__init__(obs_space, action_space, num_outputs, model_config, name) + nn.Module.__init__(self) + self.model = TorchFCNet( + obs_space, action_space, num_outputs, model_config, name + ) + + def forward(self, input_dict, 
state, seq_lens): + # The unpacked input tensors, where M=MAX_PLAYERS, N=MAX_ITEMS: + # { + # 'items', , + # 'location', , + # 'status', , + # } + print("The unpacked input tensors:", input_dict["obs"]) + print() + print("Unbatched repeat dim", input_dict["obs"].unbatch_repeat_dim()) + print() + print("Fully unbatched", input_dict["obs"].unbatch_all()) + print() + return self.model.forward(input_dict, state, seq_lens) + + def value_function(self): + return self.model.value_function() + + +class CustomTFRPGModel(TFModelV2): + """Example of interpreting repeated observations.""" + + def __init__(self, obs_space, action_space, num_outputs, model_config, name): + super().__init__(obs_space, action_space, num_outputs, model_config, name) + self.model = TFFCNet(obs_space, action_space, num_outputs, model_config, name) + + def forward(self, input_dict, state, seq_lens): + # The unpacked input tensors, where M=MAX_PLAYERS, N=MAX_ITEMS: + # { + # 'items', , + # 'location', , + # 'status', , + # } + print("The unpacked input tensors:", input_dict["obs"]) + print() + print("Unbatched repeat dim", input_dict["obs"].unbatch_repeat_dim()) + print() + if tf.executing_eagerly(): + print("Fully unbatched", input_dict["obs"].unbatch_all()) + print() + return self.model.forward(input_dict, state, seq_lens) + + def value_function(self): + return self.model.value_function() diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/parametric_actions_cartpole.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/parametric_actions_cartpole.py new file mode 100644 index 0000000000000000000000000000000000000000..e5df2f821c49aa41f4ddd9988c1acbaaf7a63819 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/parametric_actions_cartpole.py @@ -0,0 +1,121 @@ +# @OldAPIStack +"""Example of handling variable length or parametric action spaces. 
+ +This toy example demonstrates the action-embedding based approach for handling large +discrete action spaces (potentially infinite in size), similar to this example: + + https://neuro.cs.ut.ee/the-use-of-embeddings-in-openai-five/ + +This example works with RLlib's policy gradient style algorithms +(e.g., PG, PPO, IMPALA, A2C) and DQN. + +Note that since the model outputs now include "-inf" tf.float32.min +values, not all algorithm options are supported. For example, +algorithms might crash if they don't properly ignore the -inf action scores. +Working configurations are given below. +""" + +import argparse +import os + +import ray +from ray import air, tune +from ray.air.constants import TRAINING_ITERATION +from ray.rllib.examples.envs.classes.parametric_actions_cartpole import ( + ParametricActionsCartPole, +) +from ray.rllib.examples._old_api_stack.models.parametric_actions_model import ( + ParametricActionsModel, + TorchParametricActionsModel, +) +from ray.rllib.models import ModelCatalog +from ray.rllib.utils.metrics import ( + ENV_RUNNER_RESULTS, + EPISODE_RETURN_MEAN, + NUM_ENV_STEPS_SAMPLED_LIFETIME, +) +from ray.rllib.utils.test_utils import check_learning_achieved +from ray.tune.registry import register_env + +parser = argparse.ArgumentParser() +parser.add_argument( + "--run", type=str, default="PPO", help="The RLlib-registered algorithm to use." +) +parser.add_argument( + "--framework", + choices=["tf", "tf2", "torch"], + default="torch", + help="The DL framework specifier.", +) +parser.add_argument( + "--as-test", + action="store_true", + help="Whether this script should be run as a test: --stop-reward must " + "be achieved within --stop-timesteps AND --stop-iters.", +) +parser.add_argument( + "--stop-iters", type=int, default=200, help="Number of iterations to train." +) +parser.add_argument( + "--stop-timesteps", type=int, default=100000, help="Number of timesteps to train." 
+) +parser.add_argument( + "--stop-reward", type=float, default=150.0, help="Reward at which we stop training." +) + +if __name__ == "__main__": + args = parser.parse_args() + ray.init() + + register_env("pa_cartpole", lambda _: ParametricActionsCartPole(10)) + ModelCatalog.register_custom_model( + "pa_model", + TorchParametricActionsModel + if args.framework == "torch" + else ParametricActionsModel, + ) + + if args.run == "DQN": + cfg = { + # TODO(ekl) we need to set these to prevent the masked values + # from being further processed in DistributionalQModel, which + # would mess up the masking. It is possible to support these if we + # defined a custom DistributionalQModel that is aware of masking. + "hiddens": [], + "dueling": False, + "enable_rl_module_and_learner": False, + "enable_env_runner_and_connector_v2": False, + } + else: + cfg = {} + + config = dict( + { + "env": "pa_cartpole", + "model": { + "custom_model": "pa_model", + }, + # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. 
+ "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")), + "num_env_runners": 0, + "framework": args.framework, + }, + **cfg, + ) + + stop = { + TRAINING_ITERATION: args.stop_iters, + f"{NUM_ENV_STEPS_SAMPLED_LIFETIME}": args.stop_timesteps, + f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": args.stop_reward, + } + + results = tune.Tuner( + args.run, + run_config=air.RunConfig(stop=stop, verbose=1), + param_space=config, + ).fit() + + if args.as_test: + check_learning_achieved(results, args.stop_reward) + + ray.shutdown() diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/parametric_actions_cartpole_embeddings_learnt_by_model.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/parametric_actions_cartpole_embeddings_learnt_by_model.py new file mode 100644 index 0000000000000000000000000000000000000000..476e8b81eece613b6e1797b5623581cd3b2343ec --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/parametric_actions_cartpole_embeddings_learnt_by_model.py @@ -0,0 +1,107 @@ +# @OldAPIStack +"""Example of handling variable length or parametric action spaces. + +This is a toy example of the action-embedding based approach for handling large +discrete action spaces (potentially infinite in size), similar to this: + + https://neuro.cs.ut.ee/the-use-of-embeddings-in-openai-five/ + +This currently works with RLlib's policy gradient style algorithms +(e.g., PG, PPO, IMPALA, A2C) and also DQN. + +Note that since the model outputs now include "-inf" tf.float32.min +values, not all algorithm options are supported at the moment. For example, +algorithms might crash if they don't properly ignore the -inf action scores. +Working configurations are given below. 
+""" + +import argparse +import os + +import ray +from ray import air, tune +from ray.air.constants import TRAINING_ITERATION +from ray.rllib.examples.envs.classes.parametric_actions_cartpole import ( + ParametricActionsCartPoleNoEmbeddings, +) +from ray.rllib.examples._old_api_stack.models.parametric_actions_model import ( + ParametricActionsModelThatLearnsEmbeddings, +) +from ray.rllib.models import ModelCatalog +from ray.rllib.utils.metrics import ( + ENV_RUNNER_RESULTS, + EPISODE_RETURN_MEAN, + NUM_ENV_STEPS_SAMPLED_LIFETIME, +) +from ray.rllib.utils.test_utils import check_learning_achieved +from ray.tune.registry import register_env + +parser = argparse.ArgumentParser() +parser.add_argument("--run", type=str, default="PPO") +parser.add_argument( + "--framework", + choices=["tf", "tf2"], + default="tf", + help="The DL framework specifier (Torch not supported " + "due to the lack of a model).", +) +parser.add_argument("--as-test", action="store_true") +parser.add_argument("--stop-iters", type=int, default=200) +parser.add_argument("--stop-reward", type=float, default=150.0) +parser.add_argument("--stop-timesteps", type=int, default=100000) + +if __name__ == "__main__": + args = parser.parse_args() + ray.init() + + register_env("pa_cartpole", lambda _: ParametricActionsCartPoleNoEmbeddings(10)) + + ModelCatalog.register_custom_model( + "pa_model", ParametricActionsModelThatLearnsEmbeddings + ) + + if args.run == "DQN": + cfg = { + # TODO(ekl) we need to set these to prevent the masked values + # from being further processed in DistributionalQModel, which + # would mess up the masking. It is possible to support these if we + # defined a custom DistributionalQModel that is aware of masking. 
+ "hiddens": [], + "dueling": False, + "enable_rl_module_and_learner": False, + "enable_env_runner_and_connector_v2": False, + } + else: + cfg = {} + + config = dict( + { + "env": "pa_cartpole", + "model": { + "custom_model": "pa_model", + }, + # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. + "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")), + "num_env_runners": 0, + "framework": args.framework, + "action_mask_key": "valid_avail_actions_mask", + }, + **cfg, + ) + + stop = { + TRAINING_ITERATION: args.stop_iters, + NUM_ENV_STEPS_SAMPLED_LIFETIME: args.stop_timesteps, + f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": args.stop_reward, + } + + results = tune.Tuner( + args.run, + run_config=air.RunConfig(stop=stop, verbose=2), + param_space=config, + ).fit() + + if args.as_test: + check_learning_achieved(results, args.stop_reward) + + ray.shutdown() diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..291a03371a1328cada93cbdb75710bba822b596a Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/__pycache__/cartpole_dqn_export.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/__pycache__/cartpole_dqn_export.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a89b1c9d39abccea52b84ba9c1a3d211fa9c92da Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/__pycache__/cartpole_dqn_export.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/__pycache__/change_config_during_training.cpython-311.pyc 
b/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/__pycache__/change_config_during_training.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cd94db1e38065b507a63d5b4c2e5d28eebd14d76 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/__pycache__/change_config_during_training.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/__pycache__/checkpoint_by_custom_criteria.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/__pycache__/checkpoint_by_custom_criteria.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f3d4f2d5a14f5766a8e9805ddb7837a8ad1d9982 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/__pycache__/checkpoint_by_custom_criteria.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/__pycache__/restore_1_of_n_agents_from_checkpoint.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/__pycache__/restore_1_of_n_agents_from_checkpoint.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9aa6b7406eaffea6a07f0698f6d1a06092e19089 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/__pycache__/restore_1_of_n_agents_from_checkpoint.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__pycache__/__init__.cpython-311.pyc new file 
mode 100644 index 0000000000000000000000000000000000000000..090906559f1e8e5591e324a88a3ff922a42960fd Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__pycache__/custom_heuristic_policy.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__pycache__/custom_heuristic_policy.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0f07faee83f8e441480abe0d3116e21a61ee837d Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__pycache__/custom_heuristic_policy.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__pycache__/different_spaces_for_agents.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__pycache__/different_spaces_for_agents.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f82828ad7cb44d3e5eb9997fbb5ffda8083b7ab1 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__pycache__/different_spaces_for_agents.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__pycache__/multi_agent_cartpole.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__pycache__/multi_agent_cartpole.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a32ca68beceb55fa4b15159412cafa250a763917 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__pycache__/multi_agent_cartpole.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__pycache__/multi_agent_pendulum.cpython-311.pyc 
b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__pycache__/multi_agent_pendulum.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..129c658eb740059968d59aa1781894f2dc552eda Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__pycache__/multi_agent_pendulum.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__pycache__/pettingzoo_independent_learning.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__pycache__/pettingzoo_independent_learning.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6ef87cc56c3140521f1e278dd42d9e8dddfba175 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__pycache__/pettingzoo_independent_learning.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__pycache__/pettingzoo_parameter_sharing.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__pycache__/pettingzoo_parameter_sharing.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1bdfd9087353ce980e0cbbbd5956285df1642601 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__pycache__/pettingzoo_parameter_sharing.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__pycache__/pettingzoo_shared_value_function.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__pycache__/pettingzoo_shared_value_function.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e1a8b98be8f176c26c0451ff69eabd928442d62c Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__pycache__/pettingzoo_shared_value_function.cpython-311.pyc differ diff --git 
a/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__pycache__/rock_paper_scissors_heuristic_vs_learned.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__pycache__/rock_paper_scissors_heuristic_vs_learned.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d70f60301c2fdef9f26acbe4ddf36627bda15768 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__pycache__/rock_paper_scissors_heuristic_vs_learned.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__pycache__/rock_paper_scissors_learned_vs_learned.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__pycache__/rock_paper_scissors_learned_vs_learned.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c583a58d543c15c6c5c29a7ccd3121378299a039 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__pycache__/rock_paper_scissors_learned_vs_learned.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__pycache__/self_play_league_based_with_open_spiel.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__pycache__/self_play_league_based_with_open_spiel.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b404fc48db1893d388051b41b31c538706b00eb8 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__pycache__/self_play_league_based_with_open_spiel.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__pycache__/self_play_with_open_spiel.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__pycache__/self_play_with_open_spiel.cpython-311.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..89f532e25de900110c41f16b1ff76923c7ac9cae Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__pycache__/self_play_with_open_spiel.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__pycache__/two_step_game_with_grouped_agents.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__pycache__/two_step_game_with_grouped_agents.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..15e6ccee70d35a11ac78f532b04e9c1fb6988588 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__pycache__/two_step_game_with_grouped_agents.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/custom_heuristic_policy.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/custom_heuristic_policy.py new file mode 100644 index 0000000000000000000000000000000000000000..3f5d4c6a067d241c47d79c3b6c4114f142445cfb --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/custom_heuristic_policy.py @@ -0,0 +1,101 @@ +"""Example of running a custom heuristic (hand-coded) policy alongside trainable ones. + +This example has two RLModules (as action computing policies): + (1) one trained by a PPOLearner + (2) one hand-coded policy that acts at random in the env (doesn't learn). + +The environment is MultiAgentCartPole, in which there are n agents both policies + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack --num-agents=2` + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. 
+ +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- +In the console output, you can see the PPO policy ("learnable_policy") does much +better than "random": + ++-------------------+------------+----------+------+----------------+ +| Trial name | status | loc | iter | total time (s) | +| | | | | | +|-------------------+------------+----------+------+----------------+ +| PPO_multi_agen... | TERMINATED | 127. ... | 20 | 58.646 | ++-------------------+------------+----------+------+----------------+ + ++--------+-------------------+-----------------+--------------------+ +| ts | combined reward | reward random | reward | +| | | | learnable_policy | ++--------+-------------------+-----------------+--------------------| +| 80000 | 481.26 | 78.41 | 464.41 | ++--------+-------------------+-----------------+--------------------+ +""" + +from ray.rllib.algorithms.ppo import PPOConfig +from ray.rllib.core.rl_module.rl_module import RLModuleSpec +from ray.rllib.core.rl_module.multi_rl_module import MultiRLModuleSpec +from ray.rllib.examples.envs.classes.multi_agent import MultiAgentCartPole +from ray.rllib.examples.rl_modules.classes.random_rlm import RandomRLModule +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.tune.registry import register_env + + +parser = add_rllib_example_script_args( + default_iters=40, default_reward=500.0, default_timesteps=200000 +) +parser.set_defaults( + enable_new_api_stack=True, + num_agents=2, +) + + +if __name__ == "__main__": + args = parser.parse_args() + + assert args.num_agents == 2, "Must set --num-agents=2 when running this script!" + + # Simple environment with n independent cartpole entities. 
+ register_env( + "multi_agent_cartpole", + lambda _: MultiAgentCartPole({"num_agents": args.num_agents}), + ) + + base_config = ( + PPOConfig() + .environment("multi_agent_cartpole") + .multi_agent( + policies={"learnable_policy", "random"}, + # Map to either random behavior or PPO learning behavior based on + # the agent's ID. + policy_mapping_fn=lambda agent_id, *args, **kwargs: [ + "learnable_policy", + "random", + ][agent_id % 2], + # We need to specify this here, b/c the `forward_train` method of + # `RandomRLModule` (ModuleID="random") throws a not-implemented error. + policies_to_train=["learnable_policy"], + ) + .rl_module( + rl_module_spec=MultiRLModuleSpec( + rl_module_specs={ + "learnable_policy": RLModuleSpec(), + "random": RLModuleSpec(module_class=RandomRLModule), + } + ), + ) + ) + + run_rllib_example_script_experiment(base_config, args) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/different_spaces_for_agents.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/different_spaces_for_agents.py new file mode 100644 index 0000000000000000000000000000000000000000..7331a3e3aadc39b53fe045aac4fc1352e1b5f7b5 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/different_spaces_for_agents.py @@ -0,0 +1,112 @@ +""" +Example showing how to create a multi-agent env, in which the different agents +have different observation and action spaces. + +These spaces do NOT necessarily have to be specified manually by the user. Instead, +RLlib tries to automatically infer them from the env provided spaces dicts +(agentID -> obs/act space) and the policy mapping fn (mapping agent IDs to policy IDs). 
+ +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack --num-agents=2` + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` +""" + +import gymnasium as gym + +from ray.rllib.env.multi_agent_env import MultiAgentEnv +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.tune.registry import get_trainable_cls + + +class BasicMultiAgentMultiSpaces(MultiAgentEnv): + """A simple multi-agent example environment where agents have different spaces. + + agent0: obs=Box(10,), act=Discrete(2) + agent1: obs=Box(20,), act=Discrete(3) + + The logic of the env doesn't really matter for this example. The point of this env + is to show how to use multi-agent envs, in which the different agents utilize + different obs- and action spaces. + """ + + def __init__(self, config=None): + self.agents = ["agent0", "agent1"] + + self.terminateds = set() + self.truncateds = set() + + # Provide full (preferred format) observation- and action-spaces as Dicts + # mapping agent IDs to the individual agents' spaces. 
+ self.observation_spaces = { + "agent0": gym.spaces.Box(low=-1.0, high=1.0, shape=(10,)), + "agent1": gym.spaces.Box(low=-1.0, high=1.0, shape=(20,)), + } + self.action_spaces = { + "agent0": gym.spaces.Discrete(2), + "agent1": gym.spaces.Discrete(3), + } + + super().__init__() + + def reset(self, *, seed=None, options=None): + self.terminateds = set() + self.truncateds = set() + return {i: self.get_observation_space(i).sample() for i in self.agents}, {} + + def step(self, action_dict): + obs, rew, terminated, truncated, info = {}, {}, {}, {}, {} + for i, action in action_dict.items(): + obs[i] = self.get_observation_space(i).sample() + rew[i] = 0.0 + terminated[i] = False + truncated[i] = False + info[i] = {} + terminated["__all__"] = len(self.terminateds) == len(self.agents) + truncated["__all__"] = len(self.truncateds) == len(self.agents) + return obs, rew, terminated, truncated, info + + +parser = add_rllib_example_script_args( + default_iters=10, default_reward=80.0, default_timesteps=10000 +) + + +if __name__ == "__main__": + args = parser.parse_args() + + assert ( + args.enable_new_api_stack + ), "Must set --enable-new-api-stack when running this script!" + + base_config = ( + get_trainable_cls(args.algo) + .get_default_config() + .environment(env=BasicMultiAgentMultiSpaces) + .training(train_batch_size=1024) + .multi_agent( + # Use a simple set of policy IDs. Spaces for the individual policies + # are inferred automatically using reverse lookup via the + # `policy_mapping_fn` and the env provided spaces for the different + # agents. Alternatively, you could use: + # policies: {main0: PolicySpec(...), main1: PolicySpec} + policies={"main0", "main1"}, + # Simple mapping fn, mapping agent0 to main0 and agent1 to main1. + policy_mapping_fn=(lambda aid, episode, **kw: f"main{aid[-1]}"), + # Only train main0. 
+ policies_to_train=["main0"], + ) + ) + + run_rllib_example_script_experiment(base_config, args) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/multi_agent_cartpole.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/multi_agent_cartpole.py new file mode 100644 index 0000000000000000000000000000000000000000..4bdf019f10b17cc26bd31bd5c516d4de0fe443f5 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/multi_agent_cartpole.py @@ -0,0 +1,67 @@ +"""Simple example of setting up an agent-to-module mapping function. + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack --num-agents=2` + +Control the number of agents and policies (RLModules) via --num-agents and +--num-policies. + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` +""" + +from ray.rllib.examples.envs.classes.multi_agent import MultiAgentCartPole +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.tune.registry import get_trainable_cls, register_env + +parser = add_rllib_example_script_args( + default_iters=200, + default_timesteps=100000, + default_reward=600.0, +) +# TODO (sven): This arg is currently ignored (hard-set to 2). +parser.add_argument("--num-policies", type=int, default=2) + + +if __name__ == "__main__": + args = parser.parse_args() + + # Register our environment with tune. 
+ if args.num_agents > 0: + register_env( + "env", + lambda _: MultiAgentCartPole(config={"num_agents": args.num_agents}), + ) + + base_config = ( + get_trainable_cls(args.algo) + .get_default_config() + .environment("env" if args.num_agents > 0 else "CartPole-v1") + .env_runners( + # TODO (sven): MAEnvRunner does not support vectorized envs yet + # due to gym's env checkers and non-compatability with RLlib's + # MultiAgentEnv API. + num_envs_per_env_runner=1 + if args.num_agents > 0 + else 20, + ) + ) + + # Add a simple multi-agent setup. + if args.num_agents > 0: + base_config.multi_agent( + policies={f"p{i}" for i in range(args.num_agents)}, + policy_mapping_fn=lambda aid, *a, **kw: f"p{aid}", + ) + + run_rllib_example_script_experiment(base_config, args) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/multi_agent_pendulum.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/multi_agent_pendulum.py new file mode 100644 index 0000000000000000000000000000000000000000..985e55aada326bc68b1fa74f9595411611b8c12e --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/multi_agent_pendulum.py @@ -0,0 +1,73 @@ +"""Simple example of setting up an agent-to-module mapping function. + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack --num-agents=2` + +Control the number of agents and policies (RLModules) via --num-agents and +--num-policies. + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. 
+ +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` +""" + +from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig +from ray.rllib.examples.envs.classes.multi_agent import MultiAgentPendulum +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.tune.registry import get_trainable_cls, register_env + +parser = add_rllib_example_script_args( + default_iters=200, + default_timesteps=100000, + default_reward=-400.0, +) +# TODO (sven): This arg is currently ignored (hard-set to 2). +parser.add_argument("--num-policies", type=int, default=2) + + +if __name__ == "__main__": + args = parser.parse_args() + + # Register our environment with tune. + if args.num_agents > 0: + register_env( + "env", + lambda _: MultiAgentPendulum(config={"num_agents": args.num_agents}), + ) + + base_config = ( + get_trainable_cls(args.algo) + .get_default_config() + .environment("env" if args.num_agents > 0 else "Pendulum-v1") + .training( + train_batch_size_per_learner=512, + minibatch_size=64, + lambda_=0.1, + gamma=0.95, + lr=0.0003, + model={"fcnet_activation": "relu"}, + vf_clip_param=10.0, + ) + .rl_module( + model_config=DefaultModelConfig(fcnet_activation="relu"), + ) + ) + + # Add a simple multi-agent setup. 
+ if args.num_agents > 0: + base_config.multi_agent( + policies={f"p{i}" for i in range(args.num_agents)}, + policy_mapping_fn=lambda aid, *a, **kw: f"p{aid}", + ) + + # Augment + run_rllib_example_script_experiment(base_config, args) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/pettingzoo_independent_learning.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/pettingzoo_independent_learning.py new file mode 100644 index 0000000000000000000000000000000000000000..7b25115cb7a41aad15ef46f85380b6dcc83299a2 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/pettingzoo_independent_learning.py @@ -0,0 +1,108 @@ +"""Runs the PettingZoo Waterworld env in RLlib using independent multi-agent learning. + +See: https://pettingzoo.farama.org/environments/sisl/waterworld/ +for more details on the environment. + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack --num-agents=2` + +Control the number of agents and policies (RLModules) via --num-agents and +--num-policies. + +This works with hundreds of agents and policies, but note that initializing +many policies might take some time. + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- +The above options can reach a combined reward of 0.0 or more after about 500k env +timesteps. Keep in mind, though, that due to the separate value functions (and +learned policies in general), one agent's gain (in per-agent reward) might cause the +other agent's reward to decrease at the same time. 
However, over time, both agents +should simply improve. + ++---------------------+------------+-----------------+--------+------------------+ +| Trial name | status | loc | iter | total time (s) | +|---------------------+------------+-----------------+--------+------------------+ +| PPO_env_a82fc_00000 | TERMINATED | 127.0.0.1:28346 | 124 | 363.599 | ++---------------------+------------+-----------------+--------+------------------+ + ++--------+-------------------+--------------------+--------------------+ +| ts | combined reward | reward pursuer_1 | reward pursuer_0 | ++--------+-------------------+--------------------+--------------------| +| 496000 | 2.24542 | -34.6869 | 36.9324 | ++--------+-------------------+--------------------+--------------------+ + +Note that the two agents (`pursuer_0` and `pursuer_1`) are optimized on the exact same +objective and thus differences in the rewards can be attributed to weight initialization +(and sampling randomness) only. +""" + +from pettingzoo.sisl import waterworld_v4 + +from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig +from ray.rllib.core.rl_module.multi_rl_module import MultiRLModuleSpec +from ray.rllib.core.rl_module.rl_module import RLModuleSpec +from ray.rllib.env.wrappers.pettingzoo_env import PettingZooEnv +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.tune.registry import get_trainable_cls, register_env + + +parser = add_rllib_example_script_args( + default_iters=200, + default_timesteps=1000000, + default_reward=0.0, +) + + +if __name__ == "__main__": + args = parser.parse_args() + + assert args.num_agents > 0, "Must set --num-agents > 0 when running this script!" + assert ( + args.enable_new_api_stack + ), "Must set --enable-new-api-stack when running this script!" + + # Here, we use the "Agent Environment Cycle" (AEC) PettingZoo environment type. 
+ # For a "Parallel" environment example, see the rock paper scissors examples + # in this same repository folder. + register_env("env", lambda _: PettingZooEnv(waterworld_v4.env())) + + # Policies are called just like the agents (exact 1:1 mapping). + policies = {f"pursuer_{i}" for i in range(args.num_agents)} + + base_config = ( + get_trainable_cls(args.algo) + .get_default_config() + .environment("env") + .multi_agent( + policies=policies, + # Exact 1:1 mapping from AgentID to ModuleID. + policy_mapping_fn=(lambda aid, *args, **kwargs: aid), + ) + .training( + vf_loss_coeff=0.005, + ) + .rl_module( + rl_module_spec=MultiRLModuleSpec( + rl_module_specs={p: RLModuleSpec() for p in policies}, + ), + model_config=DefaultModelConfig(vf_share_layers=True), + ) + ) + + run_rllib_example_script_experiment(base_config, args) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/pettingzoo_parameter_sharing.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/pettingzoo_parameter_sharing.py new file mode 100644 index 0000000000000000000000000000000000000000..d6eb4bda732e7ba9f41ba3cc39aae7e91acf7c37 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/pettingzoo_parameter_sharing.py @@ -0,0 +1,105 @@ +"""Runs the PettingZoo Waterworld multi-agent env in RLlib using single policy learning. + +Other than the `pettingzoo_independent_learning.py` example (in this same folder), +this example simply trains a single policy (shared by all agents). + +See: https://pettingzoo.farama.org/environments/sisl/waterworld/ +for more details on the environment. + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack --num-agents=2` + +Control the number of agents and policies (RLModules) via --num-agents and +--num-policies. + +This works with hundreds of agents and policies, but note that initializing +many policies might take some time. 
+ +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- +The above options can reach a combined reward of roughly ~0.0 after about 500k-1M env +timesteps. Keep in mind, though, that in this setup, the agents do not have the +opportunity to benefit from or even out other agents' mistakes (and behavior in general) +as everyone is using the same policy. Hence, this example learns a more generic policy, +which might be less specialized to certain "niche exploitation opportunities" inside +the env: + ++---------------------+----------+-----------------+--------+-----------------+ +| Trial name | status | loc | iter | total time (s) | +|---------------------+----------+-----------------+--------+-----------------+ +| PPO_env_91f49_00000 | RUNNING | 127.0.0.1:63676 | 200 | 605.176 | ++---------------------+----------+-----------------+--------+-----------------+ + ++--------+-------------------+-------------+ +| ts | combined reward | reward p0 | ++--------+-------------------+-------------| +| 800000 | 0.323752 | 0.161876 | ++--------+-------------------+-------------+ +""" +from pettingzoo.sisl import waterworld_v4 + +from ray.rllib.core.rl_module.multi_rl_module import MultiRLModuleSpec +from ray.rllib.core.rl_module.rl_module import RLModuleSpec +from ray.rllib.env.wrappers.pettingzoo_env import PettingZooEnv +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.tune.registry import get_trainable_cls, register_env + + +parser = add_rllib_example_script_args( + default_iters=200, + 
default_timesteps=1000000, + default_reward=0.0, +) + + +if __name__ == "__main__": + args = parser.parse_args() + + assert args.num_agents > 0, "Must set --num-agents > 0 when running this script!" + assert ( + args.enable_new_api_stack + ), "Must set --enable-new-api-stack when running this script!" + + # Here, we use the "Agent Environment Cycle" (AEC) PettingZoo environment type. + # For a "Parallel" environment example, see the rock paper scissors examples + # in this same repository folder. + register_env("env", lambda _: PettingZooEnv(waterworld_v4.env())) + + base_config = ( + get_trainable_cls(args.algo) + .get_default_config() + .environment("env") + .multi_agent( + policies={"p0"}, + # All agents map to the exact same policy. + policy_mapping_fn=(lambda aid, *args, **kwargs: "p0"), + ) + .training( + model={ + "vf_share_layers": True, + }, + vf_loss_coeff=0.005, + ) + .rl_module( + rl_module_spec=MultiRLModuleSpec( + rl_module_specs={"p0": RLModuleSpec()}, + ), + ) + ) + + run_rllib_example_script_experiment(base_config, args) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/pettingzoo_shared_value_function.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/pettingzoo_shared_value_function.py new file mode 100644 index 0000000000000000000000000000000000000000..e2c8bb9a4ffb933f99f32c65eaa167e0d75e196d --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/pettingzoo_shared_value_function.py @@ -0,0 +1,7 @@ +msg = """ +This script is NOT yet ready, but will be available soon at this location. It will +feature a MultiRLModule with one shared value function and n policy heads for +cooperative multi-agent learning. 
+""" + +raise NotImplementedError(msg) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/rock_paper_scissors_heuristic_vs_learned.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/rock_paper_scissors_heuristic_vs_learned.py new file mode 100644 index 0000000000000000000000000000000000000000..6f474e8e3c69ad699b9e67890cecddb96446c068 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/rock_paper_scissors_heuristic_vs_learned.py @@ -0,0 +1,154 @@ +"""A simple multi-agent env with two agents play rock paper scissors. + +This demonstrates running the following policies in competition: + Agent 1: heuristic policy of repeating the same move + OR: heuristic policy of beating the last opponent move + Agent 2: Simple, feedforward PPO policy + OR: PPO Policy with an LSTM network + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack --num-agents=2 [--use-lstm]?` + +Without `--use-lstm`, Agent 2 should quickly reach a reward of ~7.0, always +beating the `always_same` policy, but only 50% of the time beating the `beat_last` +policy. + +With `--use-lstm`, Agent 2 should eventually(!) reach a reward of >9.0 (always +beating both the `always_same` policy and the `beat_last` policy). + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. 
+ +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` +""" + +import random + +import gymnasium as gym +from pettingzoo.classic import rps_v2 + +from ray.air.constants import TRAINING_ITERATION +from ray.rllib.connectors.env_to_module import FlattenObservations +from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig +from ray.rllib.core.rl_module.multi_rl_module import MultiRLModuleSpec +from ray.rllib.core.rl_module.rl_module import RLModuleSpec +from ray.rllib.env.wrappers.pettingzoo_env import ParallelPettingZooEnv +from ray.rllib.utils.metrics import ( + ENV_RUNNER_RESULTS, + NUM_ENV_STEPS_SAMPLED_LIFETIME, +) +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.rllib.examples.rl_modules.classes import ( + AlwaysSameHeuristicRLM, + BeatLastHeuristicRLM, +) +from ray.tune.registry import get_trainable_cls, register_env + + +parser = add_rllib_example_script_args( + default_iters=50, + default_timesteps=200000, + default_reward=6.0, +) +parser.set_defaults( + enable_new_api_stack=True, + num_agents=2, +) +parser.add_argument( + "--use-lstm", + action="store_true", + help="Whether to use an LSTM wrapped module instead of a simple MLP one. With LSTM " + "the reward diff can reach 7.0, without only 5.0.", +) + + +register_env( + "pettingzoo_rps", + lambda _: ParallelPettingZooEnv(rps_v2.parallel_env()), +) + + +if __name__ == "__main__": + args = parser.parse_args() + + assert args.num_agents == 2, "Must set --num-agents=2 when running this script!" + + base_config = ( + get_trainable_cls(args.algo) + .get_default_config() + .environment("pettingzoo_rps") + .env_runners( + env_to_module_connector=lambda env: ( + # `agent_ids=...`: Only flatten obs for the learning RLModule. 
+ FlattenObservations(multi_agent=True, agent_ids={"player_0"}), + ), + ) + .multi_agent( + policies={"always_same", "beat_last", "learned"}, + # Let learning Policy always play against either heuristic one: + # `always_same` or `beat_last`. + policy_mapping_fn=lambda aid, episode: ( + "learned" + if aid == "player_0" + else random.choice(["always_same", "beat_last"]) + ), + # Must define this as both heuristic RLMs will throw an error, if their + # `forward_train` is called. + policies_to_train=["learned"], + ) + .training( + vf_loss_coeff=0.005, + ) + .rl_module( + rl_module_spec=MultiRLModuleSpec( + rl_module_specs={ + "always_same": RLModuleSpec( + module_class=AlwaysSameHeuristicRLM, + observation_space=gym.spaces.Discrete(4), + action_space=gym.spaces.Discrete(3), + ), + "beat_last": RLModuleSpec( + module_class=BeatLastHeuristicRLM, + observation_space=gym.spaces.Discrete(4), + action_space=gym.spaces.Discrete(3), + ), + "learned": RLModuleSpec( + model_config=DefaultModelConfig( + use_lstm=args.use_lstm, + # Use a simpler FCNet when we also have an LSTM. + fcnet_hiddens=[32] if args.use_lstm else [256, 256], + lstm_cell_size=256, + max_seq_len=15, + vf_share_layers=True, + ), + ), + } + ), + ) + ) + + # Make `args.stop_reward` "point" to the reward of the learned policy. 
+ stop = { + TRAINING_ITERATION: args.stop_iters, + f"{ENV_RUNNER_RESULTS}/module_episode_returns_mean/learned": args.stop_reward, + NUM_ENV_STEPS_SAMPLED_LIFETIME: args.stop_timesteps, + } + + run_rllib_example_script_experiment( + base_config, + args, + stop=stop, + success_metric={ + f"{ENV_RUNNER_RESULTS}/module_episode_returns_mean/learned": ( + args.stop_reward + ), + }, + ) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/rock_paper_scissors_learned_vs_learned.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/rock_paper_scissors_learned_vs_learned.py new file mode 100644 index 0000000000000000000000000000000000000000..adf88dba985b4862767467764ff8e7f012fafb37 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/rock_paper_scissors_learned_vs_learned.py @@ -0,0 +1,91 @@ +"""A simple multi-agent env with two agents play rock paper scissors. + +This demonstrates running two learning policies in competition, both using the same +RLlib algorithm (PPO by default). + +The combined reward as well as individual rewards should roughly remain at 0.0 as no +policy should - in the long run - be able to learn a better strategy than chosing +actions at random. However, it could be possible that - for some time - one or the other +policy can exploit a "stochastic weakness" of the opponent policy. For example a policy +`A` learns that its opponent `B` has learnt to choose "paper" more often, which in +return makes `A` choose "scissors" more often as a countermeasure. 
+""" + +import re + +from pettingzoo.classic import rps_v2 + +from ray.rllib.connectors.env_to_module import FlattenObservations +from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig +from ray.rllib.core.rl_module.multi_rl_module import MultiRLModuleSpec +from ray.rllib.core.rl_module.rl_module import RLModuleSpec +from ray.rllib.env.wrappers.pettingzoo_env import ParallelPettingZooEnv +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.tune.registry import get_trainable_cls, register_env + + +parser = add_rllib_example_script_args( + default_iters=50, + default_timesteps=200000, + default_reward=6.0, +) +parser.set_defaults( + enable_new_api_stack=True, + num_agents=2, +) +parser.add_argument( + "--use-lstm", + action="store_true", + help="Whether to use an LSTM wrapped module instead of a simple MLP one. With LSTM " + "the reward diff can reach 7.0, without only 5.0.", +) + + +register_env( + "pettingzoo_rps", + lambda _: ParallelPettingZooEnv(rps_v2.parallel_env()), +) + + +if __name__ == "__main__": + args = parser.parse_args() + + assert args.num_agents == 2, "Must set --num-agents=2 when running this script!" + + base_config = ( + get_trainable_cls(args.algo) + .get_default_config() + .environment("pettingzoo_rps") + .env_runners( + env_to_module_connector=lambda env: FlattenObservations(multi_agent=True), + ) + .multi_agent( + policies={"p0", "p1"}, + # `player_0` uses `p0`, `player_1` uses `p1`. + policy_mapping_fn=lambda aid, episode: re.sub("^player_", "p", aid), + ) + .training( + vf_loss_coeff=0.005, + ) + .rl_module( + model_config=DefaultModelConfig( + use_lstm=args.use_lstm, + # Use a simpler FCNet when we also have an LSTM. 
+ fcnet_hiddens=[32] if args.use_lstm else [256, 256], + lstm_cell_size=256, + max_seq_len=15, + vf_share_layers=True, + ), + rl_module_spec=MultiRLModuleSpec( + rl_module_specs={ + "p0": RLModuleSpec(), + "p1": RLModuleSpec(), + } + ), + ) + ) + + run_rllib_example_script_experiment(base_config, args) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/self_play_league_based_with_open_spiel.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/self_play_league_based_with_open_spiel.py new file mode 100644 index 0000000000000000000000000000000000000000..859d4d9c01ddacef54f0d78be555529714005dbb --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/self_play_league_based_with_open_spiel.py @@ -0,0 +1,285 @@ +"""Example showing how to implement a league-based training workflow. + +Uses the open spiel adapter of RLlib with the "markov_soccer" game and +a simplified multi-agent, league-based setup: +https://deepmind.com/blog/article/AlphaStar-Grandmaster-level-in- \ +StarCraft-II-using-multi-agent-reinforcement-learning + +Our league consists of three groups of policies: +- main policies: The current main policy plus prior versions of it. +- main exploiters: Trained by playing only against different "main policies". +- league exploiters: Trained by playing against any policy in the league. + +We start with 1 policy from each group, setting all 3 of these to an initial +PPO policy and allowing all 3 policies to be trained. +After each train update - via our custom callback - we decide for each +trainable policy, whether to make a copy and freeze it. Frozen policies +will not be altered anymore. However, they remain in the league for +future matches against trainable policies. +Matchmaking happens via a policy_mapping_fn, which needs to be altered +after every change (addition) to the league. 
The mapping function +randomly maps agents in a way, such that: +- Frozen main exploiters play against the one (currently trainable) main + policy. +- Trainable main exploiters play against any main policy (including already + frozen main policies). +- Frozen league exploiters play against any trainable policy in the league. +- Trainable league exploiters play against any policy in the league. + +After training for n iterations, a configurable number of episodes can +be played by the user against the "main" agent on the command line. +""" +import functools + +import numpy as np +import torch + +import ray +from ray.air.constants import TRAINING_ITERATION +from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig +from ray.rllib.core.rl_module.multi_rl_module import MultiRLModuleSpec +from ray.rllib.core.rl_module.rl_module import RLModuleSpec +from ray.rllib.env.utils import try_import_pyspiel, try_import_open_spiel +from ray.rllib.env.wrappers.open_spiel import OpenSpielEnv +from ray.rllib.examples.multi_agent.utils import ( + ask_user_for_action, + SelfPlayLeagueBasedCallback, + SelfPlayLeagueBasedCallbackOldAPIStack, +) +from ray.rllib.examples._old_api_stack.policy.random_policy import RandomPolicy +from ray.rllib.examples.rl_modules.classes.random_rlm import RandomRLModule +from ray.rllib.policy.policy import PolicySpec +from ray.rllib.utils.metrics import NUM_ENV_STEPS_SAMPLED_LIFETIME +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.tune.registry import get_trainable_cls, register_env + +open_spiel = try_import_open_spiel(error=True) +pyspiel = try_import_pyspiel(error=True) + +# Import after try_import_open_spiel, so we can error out with hints +from open_spiel.python.rl_environment import Environment # noqa: E402 + + +parser = add_rllib_example_script_args(default_timesteps=2000000) +parser.set_defaults( + env="markov_soccer", + num_env_runners=2, + 
checkpoint_freq=1, + checkpoint_at_end=True, +) +parser.add_argument( + "--win-rate-threshold", + type=float, + default=0.85, + help="Win-rate at which we setup another opponent by freezing the " + "current main policy and playing against a uniform distribution " + "of previously frozen 'main's from here on.", +) +parser.add_argument( + "--min-league-size", + type=float, + default=8, + help="Minimum number of policies/RLModules to consider the test passed. " + "The initial league size is 2: `main` and `random`. " + "`--min-league-size=3` thus means that one new policy/RLModule has been " + "added so far (b/c the `main` one has reached the `--win-rate-threshold " + "against the `random` Policy/RLModule).", +) +parser.add_argument( + "--num-episodes-human-play", + type=int, + default=0, + help="How many episodes to play against the user on the command " + "line after training has finished.", +) +parser.add_argument( + "--from-checkpoint", + type=str, + default=None, + help="Full path to a checkpoint file for restoring a previously saved " + "Algorithm state.", +) + + +if __name__ == "__main__": + args = parser.parse_args() + + register_env( + "open_spiel_env", + lambda _: OpenSpielEnv(pyspiel.load_game(args.env)), + ) + + def policy_mapping_fn(agent_id, episode, worker=None, **kwargs): + # At first, only have main play against the random main exploiter. + return "main" if episode.episode_id % 2 == agent_id else "main_exploiter_0" + + def agent_to_module_mapping_fn(agent_id, episode, **kwargs): + # At first, only have main play against the random main exploiter. + return "main" if hash(episode.id_) % 2 == agent_id else "main_exploiter_0" + + def _get_multi_agent(): + names = { + # Our main policy, we'd like to optimize. + "main", + # First frozen version of main (after we reach n% win-rate). + "main_0", + # Initial main exploiters (one random, one trainable). + "main_exploiter_0", + "main_exploiter_1", + # Initial league exploiters (one random, one trainable). 
+ "league_exploiter_0", + "league_exploiter_1", + } + if args.enable_new_api_stack: + policies = names + spec = { + mid: RLModuleSpec( + module_class=( + RandomRLModule + if mid in ["main_exploiter_0", "league_exploiter_0"] + else None + ), + model_config=DefaultModelConfig( + fcnet_hiddens=[1024, 1024], + # fcnet_activation="tanh", + ), + ) + for mid in names + } + else: + policies = { + mid: PolicySpec( + policy_class=( + RandomPolicy + if mid in ["main_exploiter_0", "league_exploiter_0"] + else None + ) + ) + for mid in names + } + spec = None + return {"policies": policies, "spec": spec} + + config = ( + get_trainable_cls(args.algo) + .get_default_config() + .environment("open_spiel_env") + # Set up the main piece in this experiment: The league-bases self-play + # callback, which controls adding new policies/Modules to the league and + # properly matching the different policies in the league with each other. + .callbacks( + functools.partial( + SelfPlayLeagueBasedCallback + if args.enable_new_api_stack + else SelfPlayLeagueBasedCallbackOldAPIStack, + win_rate_threshold=args.win_rate_threshold, + ) + ) + .env_runners( + num_envs_per_env_runner=1 if args.enable_new_api_stack else 5, + ) + .training( + num_epochs=20, + ) + .multi_agent( + # Initial policy map: All PPO. This will be expanded + # to more policy snapshots. This is done in the + # custom callback defined above (`LeagueBasedSelfPlayCallback`). + policies=_get_multi_agent()["policies"], + policy_mapping_fn=( + agent_to_module_mapping_fn + if args.enable_new_api_stack + else policy_mapping_fn + ), + # At first, only train main_0 (until good enough to win against + # random). + policies_to_train=["main"], + ) + .rl_module( + rl_module_spec=MultiRLModuleSpec( + rl_module_specs=_get_multi_agent()["spec"] + ), + ) + ) + + # Run everything as configured. + # Train the "main" policy to play really well using self-play. 
+ results = None + if not args.from_checkpoint: + stop = { + NUM_ENV_STEPS_SAMPLED_LIFETIME: args.stop_timesteps, + TRAINING_ITERATION: args.stop_iters, + "league_size": args.min_league_size, + } + results = run_rllib_example_script_experiment( + config, args, stop=stop, keep_ray_up=True + ) + + # Restore trained Algorithm (set to non-explore behavior) and play against + # human on command line. + if args.num_episodes_human_play > 0: + num_episodes = 0 + # Switch off exploration for better inference performance. + config.explore = False + algo = config.build() + if args.from_checkpoint: + algo.restore(args.from_checkpoint) + else: + checkpoint = results.get_best_result().checkpoint + if not checkpoint: + raise ValueError("No last checkpoint found in results!") + algo.restore(checkpoint) + + if args.enable_new_api_stack: + rl_module = algo.get_module("main") + + # Play from the command line against the trained agent + # in an actual (non-RLlib-wrapped) open-spiel env. + human_player = 1 + env = Environment(args.env) + + while num_episodes < args.num_episodes_human_play: + print("You play as {}".format("o" if human_player else "x")) + time_step = env.reset() + while not time_step.last(): + player_id = time_step.observations["current_player"] + if player_id == human_player: + action = ask_user_for_action(time_step) + else: + obs = np.array(time_step.observations["info_state"][player_id]) + if args.enable_new_api_stack: + action = np.argmax( + rl_module.forward_inference( + {"obs": torch.from_numpy(obs).unsqueeze(0).float()} + )["action_dist_inputs"][0].numpy() + ) + else: + action = algo.compute_single_action(obs, policy_id="main") + # In case computer chooses an invalid action, pick a + # random one. 
+ legal = time_step.observations["legal_actions"][player_id] + if action not in legal: + action = np.random.choice(legal) + time_step = env.step([action]) + print(f"\n{env.get_state}") + + print(f"\n{env.get_state}") + + print("End of game!") + if time_step.rewards[human_player] > 0: + print("You win") + elif time_step.rewards[human_player] < 0: + print("You lose") + else: + print("Draw") + # Switch order of players + human_player = 1 - human_player + + num_episodes += 1 + + algo.stop() + + ray.shutdown() diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/self_play_with_open_spiel.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/self_play_with_open_spiel.py new file mode 100644 index 0000000000000000000000000000000000000000..37be03d53622c7ffece749d5f5a11b17f3c19753 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/self_play_with_open_spiel.py @@ -0,0 +1,245 @@ +"""Example showing how one can implement a simple self-play training workflow. + +Uses the open spiel adapter of RLlib with the "connect_four" game and +a multi-agent setup with a "main" policy and n "main_v[x]" policies +(x=version number), which are all at-some-point-frozen copies of +"main". At the very beginning, "main" plays against RandomPolicy. + +Checks for the training progress after each training update via a custom +callback. We simply measure the win rate of "main" vs the opponent +("main_v[x]" or RandomPolicy at the beginning) by looking through the +achieved rewards in the episodes in the train batch. If this win rate +reaches some configurable threshold, we add a new policy to +the policy map (a frozen copy of the current "main" one) and change the +policy_mapping_fn to make new matches of "main" vs any of the previous +versions of "main" (including the just added one). 
+ +After training for n iterations, a configurable number of episodes can +be played by the user against the "main" agent on the command line. +""" + +import functools + +import numpy as np +import torch + +from ray.air.constants import TRAINING_ITERATION +from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig +from ray.rllib.core.rl_module.multi_rl_module import MultiRLModuleSpec +from ray.rllib.core.rl_module.rl_module import RLModuleSpec +from ray.rllib.env.utils import try_import_pyspiel, try_import_open_spiel +from ray.rllib.env.wrappers.open_spiel import OpenSpielEnv +from ray.rllib.examples.rl_modules.classes.random_rlm import RandomRLModule +from ray.rllib.examples.multi_agent.utils import ( + ask_user_for_action, + SelfPlayCallback, + SelfPlayCallbackOldAPIStack, +) +from ray.rllib.examples._old_api_stack.policy.random_policy import RandomPolicy +from ray.rllib.policy.policy import PolicySpec +from ray.rllib.utils.metrics import NUM_ENV_STEPS_SAMPLED_LIFETIME +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.tune.registry import get_trainable_cls, register_env + +open_spiel = try_import_open_spiel(error=True) +pyspiel = try_import_pyspiel(error=True) + +# Import after try_import_open_spiel, so we can error out with hints. +from open_spiel.python.rl_environment import Environment # noqa: E402 + + +parser = add_rllib_example_script_args(default_timesteps=2000000) +parser.set_defaults( + env="connect_four", + checkpoint_freq=1, + checkpoint_at_end=True, +) +parser.add_argument( + "--win-rate-threshold", + type=float, + default=0.95, + help="Win-rate at which we setup another opponent by freezing the " + "current main policy and playing against a uniform distribution " + "of previously frozen 'main's from here on.", +) +parser.add_argument( + "--min-league-size", + type=float, + default=3, + help="Minimum number of policies/RLModules to consider the test passed. 
" + "The initial league size is 2: `main` and `random`. " + "`--min-league-size=3` thus means that one new policy/RLModule has been " + "added so far (b/c the `main` one has reached the `--win-rate-threshold " + "against the `random` Policy/RLModule).", +) +parser.add_argument( + "--num-episodes-human-play", + type=int, + default=10, + help="How many episodes to play against the user on the command " + "line after training has finished.", +) +parser.add_argument( + "--from-checkpoint", + type=str, + default=None, + help="Full path to a checkpoint file for restoring a previously saved " + "Algorithm state.", +) + + +if __name__ == "__main__": + args = parser.parse_args() + + register_env("open_spiel_env", lambda _: OpenSpielEnv(pyspiel.load_game(args.env))) + + def agent_to_module_mapping_fn(agent_id, episode, **kwargs): + # agent_id = [0|1] -> module depends on episode ID + # This way, we make sure that both modules sometimes play agent0 + # (start player) and sometimes agent1 (player to move 2nd). + return "main" if hash(episode.id_) % 2 == agent_id else "random" + + def policy_mapping_fn(agent_id, episode, worker, **kwargs): + return "main" if episode.episode_id % 2 == agent_id else "random" + + config = ( + get_trainable_cls(args.algo) + .get_default_config() + .environment("open_spiel_env") + # Set up the main piece in this experiment: The league-bases self-play + # callback, which controls adding new policies/Modules to the league and + # properly matching the different policies in the league with each other. + .callbacks( + functools.partial( + ( + SelfPlayCallback + if args.enable_new_api_stack + else SelfPlayCallbackOldAPIStack + ), + win_rate_threshold=args.win_rate_threshold, + ) + ) + .env_runners( + num_env_runners=(args.num_env_runners or 2), + num_envs_per_env_runner=1 if args.enable_new_api_stack else 5, + ) + .multi_agent( + # Initial policy map: Random and default algo one. 
This will be expanded + # to more policy snapshots taken from "main" against which "main" + # will then play (instead of "random"). This is done in the + # custom callback defined above (`SelfPlayCallback`). + policies=( + { + # Our main policy, we'd like to optimize. + "main": PolicySpec(), + # An initial random opponent to play against. + "random": PolicySpec(policy_class=RandomPolicy), + } + if not args.enable_new_api_stack + else {"main", "random"} + ), + # Assign agent 0 and 1 randomly to the "main" policy or + # to the opponent ("random" at first). Make sure (via episode_id) + # that "main" always plays against "random" (and not against + # another "main"). + policy_mapping_fn=( + agent_to_module_mapping_fn + if args.enable_new_api_stack + else policy_mapping_fn + ), + # Always just train the "main" policy. + policies_to_train=["main"], + ) + .rl_module( + model_config=DefaultModelConfig(fcnet_hiddens=[512, 512]), + rl_module_spec=MultiRLModuleSpec( + rl_module_specs={ + "main": RLModuleSpec(), + "random": RLModuleSpec(module_class=RandomRLModule), + } + ), + ) + ) + + # Only for PPO, change the `num_epochs` setting. + if args.algo == "PPO": + config.training(num_epochs=20) + + stop = { + NUM_ENV_STEPS_SAMPLED_LIFETIME: args.stop_timesteps, + TRAINING_ITERATION: args.stop_iters, + "league_size": args.min_league_size, + } + + # Train the "main" policy to play really well using self-play. + results = None + if not args.from_checkpoint: + results = run_rllib_example_script_experiment( + config, args, stop=stop, keep_ray_up=True + ) + + # Restore trained Algorithm (set to non-explore behavior) and play against + # human on command line. 
+ if args.num_episodes_human_play > 0: + num_episodes = 0 + config.explore = False + algo = config.build() + if args.from_checkpoint: + algo.restore(args.from_checkpoint) + else: + checkpoint = results.get_best_result().checkpoint + if not checkpoint: + raise ValueError("No last checkpoint found in results!") + algo.restore(checkpoint) + + if args.enable_new_api_stack: + rl_module = algo.get_module("main") + + # Play from the command line against the trained agent + # in an actual (non-RLlib-wrapped) open-spiel env. + human_player = 1 + env = Environment(args.env) + + while num_episodes < args.num_episodes_human_play: + print("You play as {}".format("o" if human_player else "x")) + time_step = env.reset() + while not time_step.last(): + player_id = time_step.observations["current_player"] + if player_id == human_player: + action = ask_user_for_action(time_step) + else: + obs = np.array(time_step.observations["info_state"][player_id]) + if args.enable_new_api_stack: + action = np.argmax( + rl_module.forward_inference( + {"obs": torch.from_numpy(obs).unsqueeze(0).float()} + )["action_dist_inputs"][0].numpy() + ) + else: + action = algo.compute_single_action(obs, policy_id="main") + # In case computer chooses an invalid action, pick a + # random one. + legal = time_step.observations["legal_actions"][player_id] + if action not in legal: + action = np.random.choice(legal) + time_step = env.step([action]) + print(f"\n{env.get_state}") + + print(f"\n{env.get_state}") + + print("End of game!") + if time_step.rewards[human_player] > 0: + print("You win") + elif time_step.rewards[human_player] < 0: + print("You lose") + else: + print("Draw") + # Switch order of players. 
+ human_player = 1 - human_player + + num_episodes += 1 + + algo.stop() diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/two_step_game_with_grouped_agents.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/two_step_game_with_grouped_agents.py new file mode 100644 index 0000000000000000000000000000000000000000..0981eb2575f10137e9e93edbc4b19dd3f6e57898 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/two_step_game_with_grouped_agents.py @@ -0,0 +1,92 @@ +"""The two-step game from the QMIX paper: +https://arxiv.org/pdf/1803.11485.pdf + +See also: rllib/examples/centralized_critic.py for centralized critic PPO on this game. + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack --num-agents=2` + +Note that in this script, we use an multi-agent environment in which both +agents that normally play this game have been merged into one agent with ID +"agents" and observation- and action-spaces being 2-tupled (1 item for each +agent). The "agents" agent is mapped to the policy with ID "p0". + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +Which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. 
+ +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- +You should expect a reward of 8.0 (the max to reach in thie game) eventually +being achieved by a simple PPO policy (no tuning, just using RLlib's default settings): + ++---------------------------------+------------+-----------------+--------+ +| Trial name | status | loc | iter | +|---------------------------------+------------+-----------------+--------+ +| PPO_grouped_twostep_4354b_00000 | TERMINATED | 127.0.0.1:42602 | 20 | ++---------------------------------+------------+-----------------+--------+ + ++------------------+-------+-------------------+-------------+ +| total time (s) | ts | combined reward | reward p0 | ++------------------+-------+-------------------+-------------| +| 87.5756 | 80000 | 8 | 8 | ++------------------+-------+-------------------+-------------+ +""" + +from ray.rllib.connectors.env_to_module import FlattenObservations +from ray.rllib.core.rl_module.multi_rl_module import MultiRLModuleSpec +from ray.rllib.core.rl_module.rl_module import RLModuleSpec +from ray.rllib.examples.envs.classes.multi_agent.two_step_game import ( + TwoStepGameWithGroupedAgents, +) +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.tune.registry import register_env, get_trainable_cls + + +parser = add_rllib_example_script_args(default_reward=7.0) + + +if __name__ == "__main__": + args = parser.parse_args() + + assert args.num_agents == 2, "Must set --num-agents=2 when running this script!" + assert ( + args.enable_new_api_stack + ), "Must set --enable-new-api-stack when running this script!" 
+ + register_env( + "grouped_twostep", + lambda config: TwoStepGameWithGroupedAgents(config), + ) + + base_config = ( + get_trainable_cls(args.algo) + .get_default_config() + .environment("grouped_twostep") + .env_runners( + env_to_module_connector=lambda env: FlattenObservations(multi_agent=True), + ) + .multi_agent( + policies={"p0"}, + policy_mapping_fn=lambda aid, *a, **kw: "p0", + ) + .rl_module( + rl_module_spec=MultiRLModuleSpec( + rl_module_specs={ + "p0": RLModuleSpec(), + }, + ) + ) + ) + + run_rllib_example_script_experiment(base_config, args) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/utils/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d2f61ce378bce38dcc3a1a07b33f0df450829168 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/utils/__init__.py @@ -0,0 +1,43 @@ +import sys + +from ray.rllib.examples.multi_agent.utils.self_play_callback import SelfPlayCallback +from ray.rllib.examples.multi_agent.utils.self_play_league_based_callback import ( + SelfPlayLeagueBasedCallback, +) +from ray.rllib.examples.multi_agent.utils.self_play_callback_old_api_stack import ( + SelfPlayCallbackOldAPIStack, +) +from ray.rllib.examples.multi_agent.utils.self_play_league_based_callback_old_api_stack import ( # noqa + SelfPlayLeagueBasedCallbackOldAPIStack, +) + + +def ask_user_for_action(time_step): + """Asks the user for a valid action on the command line and returns it. + + Re-queries the user until she picks a valid one. + + Args: + time_step: The open spiel Environment time-step object. 
+ """ + pid = time_step.observations["current_player"] + legal_moves = time_step.observations["legal_actions"][pid] + choice = -1 + while choice not in legal_moves: + print("Choose an action from {}:".format(legal_moves)) + sys.stdout.flush() + choice_str = input() + try: + choice = int(choice_str) + except ValueError: + continue + return choice + + +__all__ = [ + "ask_user_for_action", + "SelfPlayCallback", + "SelfPlayLeagueBasedCallback", + "SelfPlayCallbackOldAPIStack", + "SelfPlayLeagueBasedCallbackOldAPIStack", +] diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/utils/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/utils/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3b06fa95f029bade71419733b6664cd6295c98ef Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/utils/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/utils/__pycache__/self_play_callback.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/utils/__pycache__/self_play_callback.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f53b11698ed8954795fea1f603eb5ede88264002 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/utils/__pycache__/self_play_callback.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/utils/__pycache__/self_play_callback_old_api_stack.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/utils/__pycache__/self_play_callback_old_api_stack.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2870188b6274b3146cddf4e95f275e21a42a8994 Binary files /dev/null and 
b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/utils/__pycache__/self_play_callback_old_api_stack.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/utils/__pycache__/self_play_league_based_callback.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/utils/__pycache__/self_play_league_based_callback.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e8232c1be80a137bde641a02f55fa0b9b89a64c7 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/utils/__pycache__/self_play_league_based_callback.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/utils/__pycache__/self_play_league_based_callback_old_api_stack.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/utils/__pycache__/self_play_league_based_callback_old_api_stack.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f8398c4ccc930f38594ccbd42ed4185c982848e1 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/utils/__pycache__/self_play_league_based_callback_old_api_stack.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/utils/self_play_callback.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/utils/self_play_callback.py new file mode 100644 index 0000000000000000000000000000000000000000..436c3c2d198266b1e6faa1d283ffc67e622154e6 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/utils/self_play_callback.py @@ -0,0 +1,96 @@ +from collections import defaultdict + +import numpy as np + +from ray.rllib.callbacks.callbacks import RLlibCallback +from ray.rllib.core.rl_module.rl_module import RLModuleSpec +from ray.rllib.utils.metrics import ENV_RUNNER_RESULTS + + +class 
SelfPlayCallback(RLlibCallback): + def __init__(self, win_rate_threshold): + super().__init__() + # 0=RandomPolicy, 1=1st main policy snapshot, + # 2=2nd main policy snapshot, etc.. + self.current_opponent = 0 + + self.win_rate_threshold = win_rate_threshold + + # Report the matchup counters (who played against whom?). + self._matching_stats = defaultdict(int) + + def on_episode_end( + self, + *, + episode, + env_runner, + metrics_logger, + env, + env_index, + rl_module, + **kwargs, + ) -> None: + # Compute the win rate for this episode and log it with a window of 100. + main_agent = 0 if episode.module_for(0) == "main" else 1 + rewards = episode.get_rewards() + if main_agent in rewards: + main_won = rewards[main_agent][-1] == 1.0 + metrics_logger.log_value( + "win_rate", + main_won, + window=100, + ) + + def on_train_result(self, *, algorithm, metrics_logger=None, result, **kwargs): + win_rate = result[ENV_RUNNER_RESULTS]["win_rate"] + print(f"Iter={algorithm.iteration} win-rate={win_rate} -> ", end="") + # If win rate is good -> Snapshot current policy and play against + # it next, keeping the snapshot fixed and only improving the "main" + # policy. + if win_rate > self.win_rate_threshold: + self.current_opponent += 1 + new_module_id = f"main_v{self.current_opponent}" + print(f"adding new opponent to the mix ({new_module_id}).") + + # Re-define the mapping function, such that "main" is forced + # to play against any of the previously played modules + # (excluding "random"). + def agent_to_module_mapping_fn(agent_id, episode, **kwargs): + # agent_id = [0|1] -> policy depends on episode ID + # This way, we make sure that both modules sometimes play + # (start player) and sometimes agent1 (player to move 2nd). 
+ opponent = "main_v{}".format( + np.random.choice(list(range(1, self.current_opponent + 1))) + ) + if hash(episode.id_) % 2 == agent_id: + self._matching_stats[("main", opponent)] += 1 + return "main" + else: + return opponent + + main_module = algorithm.get_module("main") + algorithm.add_module( + module_id=new_module_id, + module_spec=RLModuleSpec.from_module(main_module), + new_agent_to_module_mapping_fn=agent_to_module_mapping_fn, + ) + # TODO (sven): Maybe we should move this convenience step back into + # `Algorithm.add_module()`? Would be less explicit, but also easier. + algorithm.set_state( + { + "learner_group": { + "learner": { + "rl_module": { + new_module_id: main_module.get_state(), + } + } + } + } + ) + else: + print("not good enough; will keep learning ...") + + # +2 = main + random + result["league_size"] = self.current_opponent + 2 + + print(f"Matchups:\n{self._matching_stats}") diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/utils/self_play_callback_old_api_stack.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/utils/self_play_callback_old_api_stack.py new file mode 100644 index 0000000000000000000000000000000000000000..42b05b94501708de34f3653150e384b3c95f4818 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/utils/self_play_callback_old_api_stack.py @@ -0,0 +1,78 @@ +import numpy as np + +from ray.rllib.callbacks.callbacks import RLlibCallback +from ray.rllib.utils.deprecation import Deprecated +from ray.rllib.utils.metrics import ENV_RUNNER_RESULTS + + +@Deprecated(help="Use the example for the new RLlib API stack.", error=False) +class SelfPlayCallbackOldAPIStack(RLlibCallback): + def __init__(self, win_rate_threshold): + super().__init__() + # 0=RandomPolicy, 1=1st main policy snapshot, + # 2=2nd main policy snapshot, etc.. 
+ self.current_opponent = 0 + + self.win_rate_threshold = win_rate_threshold + + def on_train_result(self, *, algorithm, result, **kwargs): + # Get the win rate for the train batch. + # Note that normally, you should set up a proper evaluation config, + # such that evaluation always happens on the already updated policy, + # instead of on the already used train_batch. + main_rew = result[ENV_RUNNER_RESULTS]["hist_stats"].pop("policy_main_reward") + opponent_rew = list(result[ENV_RUNNER_RESULTS]["hist_stats"].values())[0] + assert len(main_rew) == len(opponent_rew) + won = 0 + for r_main, r_opponent in zip(main_rew, opponent_rew): + if r_main > r_opponent: + won += 1 + win_rate = won / len(main_rew) + result["win_rate"] = win_rate + print(f"Iter={algorithm.iteration} win-rate={win_rate} -> ", end="") + # If win rate is good -> Snapshot current policy and play against + # it next, keeping the snapshot fixed and only improving the "main" + # policy. + if win_rate > self.win_rate_threshold: + self.current_opponent += 1 + new_pol_id = f"main_v{self.current_opponent}" + print(f"adding new opponent to the mix ({new_pol_id}).") + + # Re-define the mapping function, such that "main" is forced + # to play against any of the previously played policies + # (excluding "random"). + def policy_mapping_fn(agent_id, episode, worker, **kwargs): + # agent_id = [0|1] -> policy depends on episode ID + # This way, we make sure that both policies sometimes play + # (start player) and sometimes agent1 (player to move 2nd). 
+ return ( + "main" + if episode.episode_id % 2 == agent_id + else "main_v{}".format( + np.random.choice(list(range(1, self.current_opponent + 1))) + ) + ) + + main_policy = algorithm.get_policy("main") + new_policy = algorithm.add_policy( + policy_id=new_pol_id, + policy_cls=type(main_policy), + policy_mapping_fn=policy_mapping_fn, + config=main_policy.config, + observation_space=main_policy.observation_space, + action_space=main_policy.action_space, + ) + + # Set the weights of the new policy to the main policy. + # We'll keep training the main policy, whereas `new_pol_id` will + # remain fixed. + main_state = main_policy.get_state() + new_policy.set_state(main_state) + # We need to sync the just copied local weights (from main policy) + # to all the remote workers as well. + algorithm.env_runner_group.sync_weights() + else: + print("not good enough; will keep learning ...") + + # +2 = main + random + result["league_size"] = self.current_opponent + 2 diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/utils/self_play_league_based_callback.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/utils/self_play_league_based_callback.py new file mode 100644 index 0000000000000000000000000000000000000000..0b46dc29c5aee5ccc93a036d54cf5b65c304046f --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/utils/self_play_league_based_callback.py @@ -0,0 +1,275 @@ +from collections import defaultdict +from pprint import pprint +import re + +import numpy as np + +from ray.rllib.callbacks.callbacks import RLlibCallback +from ray.rllib.core.rl_module.rl_module import RLModuleSpec +from ray.rllib.utils.metrics import ENV_RUNNER_RESULTS + + +class SelfPlayLeagueBasedCallback(RLlibCallback): + def __init__(self, win_rate_threshold): + super().__init__() + # All policies in the league. 
+ self.main_policies = {"main", "main_0"} + self.main_exploiters = {"main_exploiter_0", "main_exploiter_1"} + self.league_exploiters = {"league_exploiter_0", "league_exploiter_1"} + # Set of currently trainable policies in the league. + self.trainable_policies = {"main"} + # Set of currently non-trainable (frozen) policies in the league. + self.non_trainable_policies = { + "main_0", + "league_exploiter_0", + "main_exploiter_0", + } + # The win-rate value reaching of which leads to a new module being added + # to the leage (frozen copy of main). + self.win_rate_threshold = win_rate_threshold + # Store the win rates for league overview printouts. + self.win_rates = {} + + # Report the matchup counters (who played against whom?). + self._matching_stats = defaultdict(int) + + def on_episode_end( + self, + *, + episode, + env_runner, + metrics_logger, + env, + env_index, + rl_module, + **kwargs, + ) -> None: + num_learning_policies = ( + episode.module_for(0) in env_runner.config.policies_to_train + ) + (episode.module_for(1) in env_runner.config.policies_to_train) + # Make sure the mapping function doesn't match two non-trainables together. + # This would be a waste of EnvRunner resources. + # assert num_learning_policies > 0 + # Ignore matches between two learning policies and don't count win-rates for + # these. + assert num_learning_policies > 0, ( + f"agent=0 -> mod={episode.module_for(0)}; " + f"agent=1 -> mod={episode.module_for(1)}; " + f"EnvRunner.config.policies_to_train={env_runner.config.policies_to_train}" + ) + if num_learning_policies == 1: + # Compute the win rate for this episode (only looking at non-trained + # opponents, such as random or frozen policies) and log it with some window. 
+ rewards_dict = episode.get_rewards() + for aid, rewards in rewards_dict.items(): + mid = episode.module_for(aid) + won = rewards[-1] == 1.0 + metrics_logger.log_value( + f"win_rate_{mid}", + won, + window=100, + ) + + def on_train_result(self, *, algorithm, metrics_logger=None, result, **kwargs): + local_worker = algorithm.env_runner + + # Avoid `self` being pickled into the remote function below. + _trainable_policies = self.trainable_policies + + # Get the win rate for the train batch. + # Note that normally, one should set up a proper evaluation config, + # such that evaluation always happens on the already updated policy, + # instead of on the already used train_batch. + league_changed = False + keys = [ + k for k in result[ENV_RUNNER_RESULTS].keys() if k.startswith("win_rate_") + ] + for key in keys: + module_id = key[9:] + self.win_rates[module_id] = result[ENV_RUNNER_RESULTS][key] + + # Policy is frozen; ignore. + if module_id in self.non_trainable_policies: + continue + + print( + f"Iter={algorithm.iteration} {module_id}'s " + f"win-rate={self.win_rates[module_id]} -> ", + end="", + ) + + # If win rate is good -> Snapshot current policy and decide, + # whether to freeze the copy or not. + if self.win_rates[module_id] > self.win_rate_threshold: + is_main = re.match("^main(_\\d+)?$", module_id) + initializing_exploiters = False + + # First time, main manages a decent win-rate against random: + # Add league_exploiter_1 and main_exploiter_1 as trainables to the mix. + if is_main and len(self.trainable_policies) == 1: + initializing_exploiters = True + self.trainable_policies.add("league_exploiter_1") + self.trainable_policies.add("main_exploiter_1") + # If main manages to win (above threshold) against the entire league + # -> increase the league by another frozen copy of main, + # main-exploiters or league-exploiters. 
# NOTE(review): reconstructed from a whitespace-collapsed diff dump. This chunk
# is the tail of `SelfPlayLeagueBasedCallback.on_train_result` — the method's
# `def` line and the `if win_rate > self.win_rate_threshold:` /
# `if is_main and ...:` branch this `else:` pairs with lie above this chunk.
                else:
                    # Freeze the snapshot with p=0.7, keep it trainable with
                    # p=0.3; snapshots of `main` itself are always frozen.
                    keep_training = (
                        False
                        if is_main
                        else np.random.choice([True, False], p=[0.3, 0.7])
                    )
                    if module_id in self.main_policies:
                        # "(main)(_\d+)?$" also matches the bare id "main", so
                        # this produces main -> main_0, main_0 -> main_1, ...
                        # (minus 1 because the un-suffixed "main" is itself
                        # counted in `self.main_policies`).
                        new_mod_id = re.sub(
                            "(main)(_\\d+)?$",
                            f"\\1_{len(self.main_policies) - 1}",
                            module_id,
                        )
                        self.main_policies.add(new_mod_id)
                    elif module_id in self.main_exploiters:
                        new_mod_id = re.sub(
                            "_\\d+$", f"_{len(self.main_exploiters)}", module_id
                        )
                        self.main_exploiters.add(new_mod_id)
                    else:
                        new_mod_id = re.sub(
                            "_\\d+$", f"_{len(self.league_exploiters)}", module_id
                        )
                        self.league_exploiters.add(new_mod_id)

                    if keep_training:
                        self.trainable_policies.add(new_mod_id)
                    else:
                        self.non_trainable_policies.add(new_mod_id)

                    print(f"adding new opponents to the mix ({new_mod_id}).")

                # Update our mapping function accordingly.
                def agent_to_module_mapping_fn(agent_id, episode, **kwargs):
                    # Pick, whether this is ...
                    type_ = np.random.choice([1, 2])

                    # 1) League exploiter vs any other.
                    if type_ == 1:
                        league_exploiter = "league_exploiter_" + str(
                            np.random.choice(list(range(len(self.league_exploiters))))
                        )
                        # This league exploiter is frozen: Play against a
                        # trainable policy.
                        if league_exploiter not in self.trainable_policies:
                            opponent = np.random.choice(list(self.trainable_policies))
                        # League exploiter is trainable: Play against any other
                        # non-trainable policy.
                        else:
                            opponent = np.random.choice(
                                list(self.non_trainable_policies)
                            )

                        # Only record match stats once per match.
                        # NOTE(review): str hashes are salted per process
                        # (PYTHONHASHSEED), so this parity split is not stable
                        # across processes/restarts — confirm that is intended.
                        if hash(episode.id_) % 2 == agent_id:
                            self._matching_stats[(league_exploiter, opponent)] += 1
                            return league_exploiter
                        else:
                            return opponent

                    # 2) Main exploiter vs main.
                    else:
                        main_exploiter = "main_exploiter_" + str(
                            np.random.choice(list(range(len(self.main_exploiters))))
                        )
                        # Main exploiter is frozen: Play against the main
                        # policy.
                        if main_exploiter not in self.trainable_policies:
                            main = "main"
                        # Main exploiter is trainable: Play against any
                        # frozen main.
                        else:
                            # Assumes at least one frozen "main_N" snapshot
                            # exists (true once initialization added "main_0").
                            main = np.random.choice(list(self.main_policies - {"main"}))

                        # Only record match stats once per match.
                        if hash(episode.id_) % 2 == agent_id:
                            self._matching_stats[(main_exploiter, main)] += 1
                            return main_exploiter
                        else:
                            return main

                multi_rl_module = local_worker.module
                main_module = multi_rl_module["main"]

                # Set the weights of the new polic(y/ies).
                if initializing_exploiters:
                    main_state = main_module.get_state()
                    multi_rl_module["main_0"].set_state(main_state)
                    multi_rl_module["league_exploiter_1"].set_state(main_state)
                    multi_rl_module["main_exploiter_1"].set_state(main_state)
                    # We need to sync the just copied local weights to all the
                    # remote workers and remote Learner workers as well.
                    algorithm.env_runner_group.sync_weights(
                        policies=["main_0", "league_exploiter_1", "main_exploiter_1"]
                    )
                    algorithm.learner_group.set_weights(multi_rl_module.get_state())
                else:
                    # Clone the architecture of "main" under the new module id;
                    # the weights are copied over via `set_state` right below.
                    algorithm.add_module(
                        module_id=new_mod_id,
                        module_spec=RLModuleSpec.from_module(main_module),
                    )
                    # TODO (sven): Maybe we should move this convenience step back into
                    # `Algorithm.add_module()`? Would be less explicit, but also
                    # easier.
                    algorithm.set_state(
                        {
                            "learner_group": {
                                "learner": {
                                    "rl_module": {
                                        new_mod_id: multi_rl_module[
                                            module_id
                                        ].get_state(),
                                    }
                                }
                            }
                        }
                    )

                algorithm.env_runner_group.foreach_env_runner(
                    lambda env_runner: env_runner.config.multi_agent(
                        policy_mapping_fn=agent_to_module_mapping_fn,
                        # This setting doesn't really matter for EnvRunners (no
                        # training going on there, but we'll update this as well
                        # here for good measure).
                        policies_to_train=_trainable_policies,
                    ),
                    local_env_runner=True,
                )
                # Set all Learner workers' should_module_be_updated to the new
                # value.
                algorithm.learner_group.foreach_learner(
                    func=lambda learner: learner.config.multi_agent(
                        policies_to_train=_trainable_policies,
                    ),
                    timeout_seconds=0.0,  # fire-and-forget
                )
                league_changed = True
            else:
                print("not good enough; will keep learning ...")

        # Add current league size to results dict.
        result["league_size"] = len(self.non_trainable_policies) + len(
            self.trainable_policies
        )

        if league_changed:
            self._print_league()

    def _print_league(self):
        """Pretty-prints the current league: matchup counts and win-rates."""
        print("--- League ---")
        print("Matchups:")
        pprint(self._matching_stats)
        print("Trainable policies (win-rates):")
        for p in sorted(self.trainable_policies):
            wr = self.win_rates[p] if p in self.win_rates else 0.0
            print(f"\t{p}: {wr}")
        print("Frozen policies:")
        for p in sorted(self.non_trainable_policies):
            wr = self.win_rates[p] if p in self.win_rates else 0.0
            print(f"\t{p}: {wr}")
        print()
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/utils/self_play_league_based_callback_old_api_stack.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/utils/self_play_league_based_callback_old_api_stack.py
new file mode 100644
index 0000000000000000000000000000000000000000..dc39fa8fac9a7ba658aefdd2edeb4199097aaa2b
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/utils/self_play_league_based_callback_old_api_stack.py
@@ -0,0 +1,201 @@
import re

import numpy as np

from ray.rllib.callbacks.callbacks import RLlibCallback
from ray.rllib.utils.deprecation import Deprecated
from ray.rllib.utils.metrics import ENV_RUNNER_RESULTS


@Deprecated(help="Use the example for the new RLlib API stack", error=False)
class SelfPlayLeagueBasedCallbackOldAPIStack(RLlibCallback):
    def __init__(self, win_rate_threshold):
        super().__init__()
        # All policies in the league.
# NOTE(review): reconstructed from a whitespace-collapsed diff dump. The class
# header and `def __init__` line of SelfPlayLeagueBasedCallbackOldAPIStack lie
# just above this chunk.
        self.main_policies = {"main", "main_0"}
        self.main_exploiters = {"main_exploiter_0", "main_exploiter_1"}
        self.league_exploiters = {"league_exploiter_0", "league_exploiter_1"}
        # Set of currently trainable policies in the league.
        self.trainable_policies = {"main"}
        # Set of currently non-trainable (frozen) policies in the league.
        self.non_trainable_policies = {
            "main_0",
            "league_exploiter_0",
            "main_exploiter_0",
        }
        # Win-rate threshold; once a trainable policy's win-rate exceeds it, a
        # new (usually frozen) snapshot of that policy is added to the league.
        self.win_rate_threshold = win_rate_threshold
        # Store the win rates for league overview printouts.
        self.win_rates = {}

    def on_train_result(self, *, algorithm, result, **kwargs):
        """Checks all trainable policies' win-rates and grows the league.

        Any trainable policy whose win-rate on the last train batch exceeds
        `self.win_rate_threshold` gets snapshotted into the league (frozen
        with p=0.7 for exploiters; always frozen for main copies), and the
        policy-mapping function on all EnvRunners is replaced accordingly.
        """
        # Avoid `self` being pickled into the remote function below.
        _trainable_policies = self.trainable_policies

        # Get the win rate for the train batch.
        # Note that normally, you should set up a proper evaluation config,
        # such that evaluation always happens on the already updated policy,
        # instead of on the already used train_batch.
        for policy_id, rew in result[ENV_RUNNER_RESULTS]["hist_stats"].items():
            # hist_stats keys look like "policy_<id>_reward"; skip all others.
            mo = re.match("^policy_(.+)_reward$", policy_id)
            if mo is None:
                continue
            policy_id = mo.group(1)

            # Calculate this policy's win rate.
            won = 0
            for r in rew:
                if r > 0.0:  # win = 1.0; loss = -1.0
                    won += 1
            # NOTE(review): raises ZeroDivisionError when `rew` is empty —
            # presumably hist_stats always holds >= 1 entry; confirm.
            win_rate = won / len(rew)
            self.win_rates[policy_id] = win_rate

            # Policy is frozen; ignore.
            if policy_id in self.non_trainable_policies:
                continue

            print(
                f"Iter={algorithm.iteration} {policy_id}'s " f"win-rate={win_rate} -> ",
                end="",
            )

            # If win rate is good -> Snapshot current policy and decide,
            # whether to freeze the copy or not.
            if win_rate > self.win_rate_threshold:
                is_main = re.match("^main(_\\d+)?$", policy_id)
                initializing_exploiters = False

                # First time, main manages a decent win-rate against random:
                # Add league_exploiter_0 and main_exploiter_0 to the mix.
                if is_main and len(self.trainable_policies) == 1:
                    initializing_exploiters = True
                    self.trainable_policies.add("league_exploiter_0")
                    self.trainable_policies.add("main_exploiter_0")
                else:
                    # Freeze the snapshot with p=0.7, keep it trainable with
                    # p=0.3; snapshots of `main` itself are always frozen.
                    keep_training = (
                        False
                        if is_main
                        else np.random.choice([True, False], p=[0.3, 0.7])
                    )
                    if policy_id in self.main_policies:
                        # NOTE(review): "_\d+$" does not match the bare id
                        # "main", so for policy_id == "main" this re.sub
                        # returns "main" unchanged (no fresh snapshot id; the
                        # adds below then mutate "main"'s own league status).
                        # The new-API-stack sibling uses "(main)(_\d+)?$"
                        # instead — confirm whether that fix should be ported.
                        new_pol_id = re.sub(
                            "_\\d+$", f"_{len(self.main_policies) - 1}", policy_id
                        )
                        self.main_policies.add(new_pol_id)
                    elif policy_id in self.main_exploiters:
                        new_pol_id = re.sub(
                            "_\\d+$", f"_{len(self.main_exploiters)}", policy_id
                        )
                        self.main_exploiters.add(new_pol_id)
                    else:
                        new_pol_id = re.sub(
                            "_\\d+$", f"_{len(self.league_exploiters)}", policy_id
                        )
                        self.league_exploiters.add(new_pol_id)

                    if keep_training:
                        self.trainable_policies.add(new_pol_id)
                    else:
                        self.non_trainable_policies.add(new_pol_id)

                    print(f"adding new opponents to the mix ({new_pol_id}).")

                # Update our mapping function accordingly.
                def policy_mapping_fn(agent_id, episode, worker=None, **kwargs):
                    # Pick, whether this is ...
                    type_ = np.random.choice([1, 2])

                    # 1) League exploiter vs any other.
                    if type_ == 1:
                        league_exploiter = "league_exploiter_" + str(
                            np.random.choice(list(range(len(self.league_exploiters))))
                        )
                        # This league exploiter is frozen: Play against a
                        # trainable policy.
                        if league_exploiter not in self.trainable_policies:
                            opponent = np.random.choice(list(self.trainable_policies))
                        # League exploiter is trainable: Play against any other
                        # non-trainable policy.
                        else:
                            opponent = np.random.choice(
                                list(self.non_trainable_policies)
                            )
                        print(f"{league_exploiter} vs {opponent}")
                        # Episode-id parity decides which agent becomes the
                        # exploiter (assumes two agents with int ids 0/1 —
                        # TODO confirm).
                        return (
                            league_exploiter
                            if episode.episode_id % 2 == agent_id
                            else opponent
                        )

                    # 2) Main exploiter vs main.
                    else:
                        main_exploiter = "main_exploiter_" + str(
                            np.random.choice(list(range(len(self.main_exploiters))))
                        )
                        # Main exploiter is frozen: Play against the main
                        # policy.
                        if main_exploiter not in self.trainable_policies:
                            main = "main"
                        # Main exploiter is trainable: Play against any
                        # frozen main.
                        else:
                            # Assumes at least one frozen "main_N" snapshot
                            # exists (true once "main_0" was added).
                            main = np.random.choice(list(self.main_policies - {"main"}))
                        # print(f"{main_exploiter} vs {main}")
                        return (
                            main_exploiter
                            if episode.episode_id % 2 == agent_id
                            else main
                        )

                # Set the weights of the new polic(y/ies).
                if initializing_exploiters:
                    main_state = algorithm.get_policy("main").get_state()
                    pol_map = algorithm.env_runner.policy_map
                    pol_map["main_0"].set_state(main_state)
                    pol_map["league_exploiter_1"].set_state(main_state)
                    pol_map["main_exploiter_1"].set_state(main_state)
                    # We need to sync the just copied local weights to all the
                    # remote workers as well.
                    algorithm.env_runner_group.sync_weights(
                        policies=["main_0", "league_exploiter_1", "main_exploiter_1"]
                    )

                    # Push the new mapping fn and trainable set to a worker.
                    def _set(worker):
                        worker.set_policy_mapping_fn(policy_mapping_fn)
                        worker.set_is_policy_to_train(_trainable_policies)

                    algorithm.env_runner_group.foreach_env_runner(_set)
                else:
                    # Clone the over-threshold policy under the new id and
                    # copy its weights via `set_state` right below.
                    base_pol = algorithm.get_policy(policy_id)
                    new_policy = algorithm.add_policy(
                        policy_id=new_pol_id,
                        policy_cls=type(base_pol),
                        policy_mapping_fn=policy_mapping_fn,
                        policies_to_train=self.trainable_policies,
                        config=base_pol.config,
                        observation_space=base_pol.observation_space,
                        action_space=base_pol.action_space,
                    )
                    main_state = base_pol.get_state()
                    new_policy.set_state(main_state)
                    # We need to sync the just copied local weights to all the
                    # remote workers as well.
                    algorithm.env_runner_group.sync_weights(policies=[new_pol_id])

                self._print_league()

            else:
                print("not good enough; will keep learning ...")

    def _print_league(self):
        """Pretty-prints the current league and all known win-rates."""
        print("--- League ---")
        print("Trainable policies (win-rates):")
        for p in sorted(self.trainable_policies):
            wr = self.win_rates[p] if p in self.win_rates else 0.0
            print(f"\t{p}: {wr}")
        print("Frozen policies:")
        for p in sorted(self.non_trainable_policies):
            wr = self.win_rates[p] if p in self.win_rates else 0.0
            print(f"\t{p}: {wr}")
        print()
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/classes/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/classes/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..084ee7f6c3197a12922b22b23d82fa304eac2f65
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/classes/__pycache__/__init__.cpython-311.pyc differ
diff --git a/llm_tutorial/llm_recipes/models/hf-model-eval/llm-jp-v3-3.7b_ja-en2en-ja_3M-pairs_3.5e-5/iter_0000698/model-00004-of-00004.safetensors b/llm_tutorial/llm_recipes/models/hf-model-eval/llm-jp-v3-3.7b_ja-en2en-ja_3M-pairs_3.5e-5/iter_0000698/model-00004-of-00004.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..430b574cafc8878c463b93236d0d80cf97902bd6
--- /dev/null
+++ b/llm_tutorial/llm_recipes/models/hf-model-eval/llm-jp-v3-3.7b_ja-en2en-ja_3M-pairs_3.5e-5/iter_0000698/model-00004-of-00004.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7115b612a01a71d23727b49a7377bafa5f567802b8024bdd5f5a227af192a3ab
+size 1223688320