diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/core/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/core/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9920b5ee9f060416d42a419e5374f077c9db147c Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/core/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/core/__pycache__/columns.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/core/__pycache__/columns.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..123e07b609c5f6bc3153f0d01caf5e6007d4d7df Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/core/__pycache__/columns.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/core/columns.py b/.venv/lib/python3.11/site-packages/ray/rllib/core/columns.py new file mode 100644 index 0000000000000000000000000000000000000000..98cb8646913e2b333eab3243308b82419f05d9bc --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/core/columns.py @@ -0,0 +1,73 @@ +from ray.util.annotations import DeveloperAPI + + +@DeveloperAPI +class Columns: + """Definitions of common column names for RL data, e.g. 'obs', 'rewards', etc.. + + Note that this replaces the `SampleBatch` and `Postprocessing` columns (of the same + name). + """ + + # Observation received from an environment after `reset()` or `step()`. + OBS = "obs" + # Infos received from an environment after `reset()` or `step()`. + INFOS = "infos" + + # Action computed/sampled by an RLModule. + ACTIONS = "actions" + # Action actually sent to the (gymnasium) `Env.step()` method. + ACTIONS_FOR_ENV = "actions_for_env" + # Reward returned by `env.step()`. + REWARDS = "rewards" + # Termination signal received from an environment after `step()`. 
+ TERMINATEDS = "terminateds" + # Truncation signal received from an environment after `step()` (e.g. because + # of a reached time limit). + TRUNCATEDS = "truncateds" + + # Next observation: Only used by algorithms that need to look at TD-data for + # training, such as off-policy/DQN algos. + NEXT_OBS = "new_obs" + + # Uniquely identifies an episode + EPS_ID = "eps_id" + AGENT_ID = "agent_id" + MODULE_ID = "module_id" + + # The size of non-zero-padded data within a (e.g. LSTM) zero-padded + # (B, T, ...)-style train batch. + SEQ_LENS = "seq_lens" + # Episode timestep counter. + T = "t" + + # Common extra RLModule output keys. + STATE_IN = "state_in" + NEXT_STATE_IN = "next_state_in" + STATE_OUT = "state_out" + NEXT_STATE_OUT = "next_state_out" + EMBEDDINGS = "embeddings" + ACTION_DIST_INPUTS = "action_dist_inputs" + ACTION_PROB = "action_prob" + ACTION_LOGP = "action_logp" + + # Value function predictions. + VF_PREDS = "vf_preds" + # Values, predicted at one timestep beyond the last timestep taken. + # These are usually calculated via the value function network using the final + # observation (and in case of an RNN: the last returned internal state). + VALUES_BOOTSTRAPPED = "values_bootstrapped" + + # Postprocessing columns. + ADVANTAGES = "advantages" + VALUE_TARGETS = "value_targets" + + # Intrinsic rewards (learning with curiosity). + INTRINSIC_REWARDS = "intrinsic_rewards" + # Discounted sum of rewards till the end of the episode (or chunk). + RETURNS_TO_GO = "returns_to_go" + + # Loss mask. If provided in a train batch, a Learner's compute_loss_for_module + # method should respect the False-set value in here and mask out the respective + # items form the loss. 
+ LOSS_MASK = "loss_mask" diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/core/models/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/core/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/core/models/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/core/models/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1327a6d9267bcd621333ccef420e2c6e1e55b0b1 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/core/models/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/core/models/__pycache__/base.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/core/models/__pycache__/base.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..033eeea46a075dbe19cb5c81322c5cd455c2bdc3 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/core/models/__pycache__/base.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/core/models/__pycache__/catalog.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/core/models/__pycache__/catalog.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..598627d0ca8725373f89c89ba934cc06f278d236 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/core/models/__pycache__/catalog.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/core/models/__pycache__/configs.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/core/models/__pycache__/configs.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..093baff45d066d42d689b32280590f805b7a05d6 Binary files /dev/null and 
b/.venv/lib/python3.11/site-packages/ray/rllib/core/models/__pycache__/configs.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/core/models/base.py b/.venv/lib/python3.11/site-packages/ray/rllib/core/models/base.py new file mode 100644 index 0000000000000000000000000000000000000000..3bb6304449a577a767085e2743a322d70b1af124 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/core/models/base.py @@ -0,0 +1,444 @@ +import abc +from typing import List, Optional, Tuple, Union + + +from ray.rllib.core.columns import Columns +from ray.rllib.core.models.configs import ModelConfig +from ray.rllib.core.models.specs.specs_base import Spec +from ray.rllib.policy.rnn_sequencing import get_fold_unfold_fns +from ray.rllib.utils.annotations import ExperimentalAPI, override +from ray.rllib.utils.typing import TensorType +from ray.util.annotations import DeveloperAPI + +# Top level keys that unify model i/o. +ENCODER_OUT: str = "encoder_out" +# For Actor-Critic algorithms, these signify data related to the actor and critic +ACTOR: str = "actor" +CRITIC: str = "critic" + + +@ExperimentalAPI +class Model(abc.ABC): + """Framework-agnostic base class for RLlib models. + + Models are low-level neural network components that offer input- and + output-specification, a forward method, and a get_initial_state method. Models + are composed in RLModules. + + Usage Example together with ModelConfig: + + .. 
testcode:: + + from ray.rllib.core.models.base import Model + from ray.rllib.core.models.configs import ModelConfig + from dataclasses import dataclass + + class MyModel(Model): + def __init__(self, config): + super().__init__(config) + self.my_param = config.my_param * 2 + + def _forward(self, input_dict): + return input_dict["obs"] * self.my_param + + + @dataclass + class MyModelConfig(ModelConfig): + my_param: int = 42 + + def build(self, framework: str): + if framework == "bork": + return MyModel(self) + + + config = MyModelConfig(my_param=3) + model = config.build(framework="bork") + print(model._forward({"obs": 1})) + + .. testoutput:: + + 6 + + """ + + def __init__(self, config: ModelConfig): + self.config = config + + def __init_subclass__(cls, **kwargs): + # Automatically add a __post_init__ method to all subclasses of Model. + # This method is called after the __init__ method of the subclass. + def init_decorator(previous_init): + def new_init(self, *args, **kwargs): + previous_init(self, *args, **kwargs) + if type(self) is cls: + self.__post_init__() + + return new_init + + cls.__init__ = init_decorator(cls.__init__) + + def __post_init__(self): + """Called automatically after the __init__ method of the subclasses. + + The module first calls the __init__ method of the subclass, With in the + __init__ you should call the super().__init__ method. Then after the __init__ + method of the subclass is called, the __post_init__ method is called. + + This is a good place to do any initialization that requires access to the + subclass's attributes. + """ + self._input_specs = self.get_input_specs() + self._output_specs = self.get_output_specs() + + def get_input_specs(self) -> Optional[Spec]: + """Returns the input specs of this model. + + Override `get_input_specs` to define your own input specs. + This method should not be called often, e.g. every forward pass. + Instead, it should be called once at instantiation to define Model.input_specs. 
+ + Returns: + Spec: The input specs. + """ + return None + + def get_output_specs(self) -> Optional[Spec]: + """Returns the output specs of this model. + + Override `get_output_specs` to define your own output specs. + This method should not be called often, e.g. every forward pass. + Instead, it should be called once at instantiation to define Model.output_specs. + + Returns: + Spec: The output specs. + """ + return None + + @property + def input_specs(self) -> Spec: + """Returns the input spec of this model.""" + return self._input_specs + + @input_specs.setter + def input_specs(self, spec: Spec) -> None: + raise ValueError( + "`input_specs` cannot be set directly. Override " + "Model.get_input_specs() instead. Set Model._input_specs if " + "you want to override this behavior." + ) + + @property + def output_specs(self) -> Spec: + """Returns the output specs of this model.""" + return self._output_specs + + @output_specs.setter + def output_specs(self, spec: Spec) -> None: + raise ValueError( + "`output_specs` cannot be set directly. Override " + "Model.get_output_specs() instead. Set Model._output_specs if " + "you want to override this behavior." + ) + + def get_initial_state(self) -> Union[dict, List[TensorType]]: + """Returns the initial state of the Model. + + It can be left empty if this Model is not stateful. + """ + return dict() + + @abc.abstractmethod + def _forward(self, input_dict: dict, **kwargs) -> dict: + """Returns the output of this model for the given input. + + This method is called by the forwarding method of the respective framework + that is itself wrapped by RLlib in order to check model inputs and outputs. + + Args: + input_dict: The input tensors. + **kwargs: Forward compatibility kwargs. + + Returns: + dict: The output tensors. 
+ """ + + @abc.abstractmethod + def get_num_parameters(self) -> Tuple[int, int]: + """Returns a tuple of (num trainable params, num non-trainable params).""" + + @abc.abstractmethod + def _set_to_dummy_weights(self, value_sequence=(-0.02, -0.01, 0.01, 0.02)) -> None: + """Helper method to set all weights to deterministic dummy values. + + Calling this method on two `Models` that have the same architecture using + the exact same `value_sequence` arg should make both models output the exact + same values on arbitrary inputs. This will work, even if the two `Models` + are of different DL frameworks. + + Args: + value_sequence: Looping through the list of all parameters (weight matrices, + bias tensors, etc..) of this model, in each iteration i, we set all + values in this parameter to `value_sequence[i % len(value_sequence)]` + (round robin). + + Example: + TODO: + """ + + +@ExperimentalAPI +class Encoder(Model, abc.ABC): + """The framework-agnostic base class for all RLlib encoders. + + Encoders are used to transform observations to a latent space. + Therefore, their `input_specs` contains the observation space dimensions. + Similarly, their `output_specs` contains the latent space dimensions. + Encoders can be recurrent, in which case the state should be part of input- and + output_specs. The latent vectors produced by an encoder are fed into subsequent + "heads". Any implementation of Encoder should also be callable. This should be done + by also inheriting from a framework-specific model base-class, s.a. TorchModel or + TfModel. + + Abstract illustration of typical flow of tensors: + + Inputs + | + Encoder + | \ + Head1 Head2 + | / + Outputs + + Outputs of encoders are generally of shape (B, latent_dim) or (B, T, latent_dim). + That is, for time-series data, we encode into the latent space for each time step. + This should be reflected in the `output_specs`. + + Usage example together with a ModelConfig: + + .. 
testcode:: + + from dataclasses import dataclass + import numpy as np + + from ray.rllib.core.columns import Columns + from ray.rllib.core.models.base import Encoder, ENCODER_OUT + from ray.rllib.core.models.configs import ModelConfig + from ray.rllib.policy.sample_batch import SampleBatch + + class NumpyEncoder(Encoder): + def __init__(self, config): + super().__init__(config) + self.factor = config.factor + + def __call__(self, *args, **kwargs): + # This is a dummy method to do checked forward passes. + return self._forward(*args, **kwargs) + + def _forward(self, input_dict, **kwargs): + obs = input_dict[Columns.OBS] + return { + ENCODER_OUT: np.array(obs) * self.factor, + Columns.STATE_OUT: ( + np.array(input_dict[Columns.STATE_IN]) + * self.factor + ), + } + + @dataclass + class NumpyEncoderConfig(ModelConfig): + factor: int = None + + def build(self, framework: str): + return NumpyEncoder(self) + + config = NumpyEncoderConfig(factor=2) + encoder = NumpyEncoder(config) + print(encoder({Columns.OBS: 1, Columns.STATE_IN: 2})) + + .. testoutput:: + + {'encoder_out': 2, 'state_out': 4} + + """ + + @abc.abstractmethod + def _forward(self, input_dict: dict, **kwargs) -> dict: + """Returns the latent of the encoder for the given inputs. + + This method is called by the forwarding method of the respective framework + that is itself wrapped by RLlib in order to check model inputs and outputs. + + The input dict contains at minimum the observation and the state of the encoder + (None for stateless encoders). + The output dict contains at minimum the latent and the state of the encoder + (None for stateless encoders). + To establish an agreement between the encoder and RLModules, these values + have the fixed keys `Columns.OBS` for the `input_dict`, + and `ACTOR` and `CRITIC` for the returned dict. + + Args: + input_dict: The input tensors. Must contain at a minimum the keys + Columns.OBS and Columns.STATE_IN (which might be None for stateless + encoders). 
+ **kwargs: Forward compatibility kwargs. + + Returns: + The output tensors. Must contain at a minimum the key ENCODER_OUT. + """ + + +@ExperimentalAPI +class ActorCriticEncoder(Encoder): + """An encoder that potentially holds two stateless encoders. + + This is a special case of Encoder that can either enclose a single, + shared encoder or two separate encoders: One for the actor and one for the + critic. The two encoders are of the same type, and we can therefore make the + assumption that they have the same input and output specs. + """ + + framework = None + + def __init__(self, config: ModelConfig) -> None: + super().__init__(config) + + if config.shared: + self.encoder = config.base_encoder_config.build(framework=self.framework) + else: + self.actor_encoder = config.base_encoder_config.build( + framework=self.framework + ) + self.critic_encoder = None + if not config.inference_only: + self.critic_encoder = config.base_encoder_config.build( + framework=self.framework + ) + + @override(Model) + def _forward(self, inputs: dict, **kwargs) -> dict: + if self.config.shared: + encoder_outs = self.encoder(inputs, **kwargs) + return { + ENCODER_OUT: { + ACTOR: encoder_outs[ENCODER_OUT], + **( + {} + if self.config.inference_only + else {CRITIC: encoder_outs[ENCODER_OUT]} + ), + } + } + else: + # Encoders should not modify inputs, so we can pass the same inputs + actor_out = self.actor_encoder(inputs, **kwargs) + if self.critic_encoder: + critic_out = self.critic_encoder(inputs, **kwargs) + + return { + ENCODER_OUT: { + ACTOR: actor_out[ENCODER_OUT], + **( + {} + if self.config.inference_only + else {CRITIC: critic_out[ENCODER_OUT]} + ), + } + } + + +@ExperimentalAPI +class StatefulActorCriticEncoder(Encoder): + """An encoder that potentially holds two potentially stateful encoders. + + This is a special case of Encoder that can either enclose a single, + shared encoder or two separate encoders: One for the actor and one for the + critic. 
The two encoders are of the same type, and we can therefore make the + assumption that they have the same input and output specs. + + If this encoder wraps a single encoder, state in input- and output dicts + is simply stored under the key `STATE_IN` and `STATE_OUT`, respectively. + If this encoder wraps two encoders, state in input- and output dicts is + stored under the keys `(STATE_IN, ACTOR)` and `(STATE_IN, CRITIC)` and + `(STATE_OUT, ACTOR)` and `(STATE_OUT, CRITIC)`, respectively. + """ + + framework = None + + def __init__(self, config: ModelConfig) -> None: + super().__init__(config) + + if config.shared: + self.encoder = config.base_encoder_config.build(framework=self.framework) + else: + self.actor_encoder = config.base_encoder_config.build( + framework=self.framework + ) + self.critic_encoder = config.base_encoder_config.build( + framework=self.framework + ) + + @override(Model) + def get_initial_state(self): + if self.config.shared: + return self.encoder.get_initial_state() + else: + return { + ACTOR: self.actor_encoder.get_initial_state(), + CRITIC: self.critic_encoder.get_initial_state(), + } + + @override(Model) + def _forward(self, inputs: dict, **kwargs) -> dict: + outputs = {} + + if self.config.shared: + outs = self.encoder(inputs, **kwargs) + encoder_out = outs.pop(ENCODER_OUT) + outputs[ENCODER_OUT] = {ACTOR: encoder_out, CRITIC: encoder_out} + outputs[Columns.STATE_OUT] = outs[Columns.STATE_OUT] + else: + # Shallow copy inputs so that we can add states without modifying + # original dict. 
+ actor_inputs = inputs.copy() + critic_inputs = inputs.copy() + actor_inputs[Columns.STATE_IN] = inputs[Columns.STATE_IN][ACTOR] + critic_inputs[Columns.STATE_IN] = inputs[Columns.STATE_IN][CRITIC] + + actor_out = self.actor_encoder(actor_inputs, **kwargs) + critic_out = self.critic_encoder(critic_inputs, **kwargs) + + outputs[ENCODER_OUT] = { + ACTOR: actor_out[ENCODER_OUT], + CRITIC: critic_out[ENCODER_OUT], + } + + outputs[Columns.STATE_OUT] = { + ACTOR: actor_out[Columns.STATE_OUT], + CRITIC: critic_out[Columns.STATE_OUT], + } + + return outputs + + +@DeveloperAPI +def tokenize(tokenizer: Encoder, inputs: dict, framework: str) -> dict: + """Tokenizes the observations from the input dict. + + Args: + tokenizer: The tokenizer to use. + inputs: The input dict. + + Returns: + The output dict. + """ + # Tokenizer may depend solely on observations. + obs = inputs[Columns.OBS] + tokenizer_inputs = {Columns.OBS: obs} + size = list(obs.size() if framework == "torch" else obs.shape) + b_dim, t_dim = size[:2] + fold, unfold = get_fold_unfold_fns(b_dim, t_dim, framework=framework) + # Push through the tokenizer encoder. + out = tokenizer(fold(tokenizer_inputs)) + out = out[ENCODER_OUT] + # Then unfold batch- and time-dimensions again. 
+ return unfold(out) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/core/models/catalog.py b/.venv/lib/python3.11/site-packages/ray/rllib/core/models/catalog.py new file mode 100644 index 0000000000000000000000000000000000000000..136dd713e01aff7183ba8dee308a2474ac8c6a9f --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/core/models/catalog.py @@ -0,0 +1,667 @@ +import dataclasses +import enum +import functools +from typing import Optional + +import gymnasium as gym +import numpy as np +import tree +from gymnasium.spaces import Box, Dict, Discrete, MultiDiscrete, Tuple + +from ray.rllib.core.models.base import Encoder +from ray.rllib.core.models.configs import ( + CNNEncoderConfig, + MLPEncoderConfig, + RecurrentEncoderConfig, +) +from ray.rllib.core.models.configs import ModelConfig +from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig +from ray.rllib.models.distributions import Distribution +from ray.rllib.models.preprocessors import get_preprocessor, Preprocessor +from ray.rllib.models.utils import get_filter_config +from ray.rllib.utils.deprecation import deprecation_warning, DEPRECATED_VALUE +from ray.rllib.utils.error import UnsupportedSpaceException +from ray.rllib.utils.spaces.simplex import Simplex +from ray.rllib.utils.spaces.space_utils import flatten_space +from ray.rllib.utils.spaces.space_utils import get_base_struct_from_space +from ray.rllib.utils.annotations import ( + OverrideToImplementCustomLogic, + OverrideToImplementCustomLogic_CallToSuperRecommended, +) + + +class Catalog: + """Describes the sub-module-architectures to be used in RLModules. + + RLlib's native RLModules get their Models from a Catalog object. + By default, that Catalog builds the configs it has as attributes. + This component was build to be hackable and extensible. You can inject custom + components into RL Modules by overriding the `build_xxx` methods of this class. 
+ Note that it is recommended to write a custom RL Module for a single use-case. + Modifications to Catalogs mostly make sense if you want to reuse the same + Catalog for different RL Modules. For example if you have written a custom + encoder and want to inject it into different RL Modules (e.g. for PPO, DQN, etc.). + You can influence the decision tree that determines the sub-components by modifying + `Catalog._determine_components_hook`. + + Usage example: + + # Define a custom catalog + + .. testcode:: + + import torch + import gymnasium as gym + from ray.rllib.core.models.configs import MLPHeadConfig + from ray.rllib.core.models.catalog import Catalog + + class MyCatalog(Catalog): + def __init__( + self, + observation_space: gym.Space, + action_space: gym.Space, + model_config_dict: dict, + ): + super().__init__(observation_space, action_space, model_config_dict) + self.my_model_config = MLPHeadConfig( + hidden_layer_dims=[64, 32], + input_dims=[self.observation_space.shape[0]], + ) + + def build_my_head(self, framework: str): + return self.my_model_config.build(framework=framework) + + # With that, RLlib can build and use models from this catalog like this: + catalog = MyCatalog(gym.spaces.Box(0, 1), gym.spaces.Box(0, 1), {}) + my_head = catalog.build_my_head(framework="torch") + + # Make a call to the built model. + out = my_head(torch.Tensor([[1]])) + """ + + # TODO (Sven): Add `framework` arg to c'tor and remove this arg from `build` + # methods. This way, we can already know in the c'tor of Catalog, what the exact + # action distibution objects are and thus what the output dims for e.g. a pi-head + # will be. + def __init__( + self, + observation_space: gym.Space, + action_space: gym.Space, + model_config_dict: dict, + # deprecated args. + view_requirements=DEPRECATED_VALUE, + ): + """Initializes a Catalog with a default encoder config. + + Args: + observation_space: The observation space of the environment. 
+ action_space: The action space of the environment. + model_config_dict: The model config that specifies things like hidden + dimensions and activations functions to use in this Catalog. + """ + if view_requirements != DEPRECATED_VALUE: + deprecation_warning(old="Catalog(view_requirements=..)", error=True) + + # TODO (sven): The following logic won't be needed anymore, once we get rid of + # Catalogs entirely. We will assert directly inside the algo's DefaultRLModule + # class that the `model_config` is a DefaultModelConfig. Thus users won't be + # able to pass in partial config dicts into a default model (alternatively, we + # could automatically augment the user provided dict by the default config + # dataclass object only(!) for default modules). + if dataclasses.is_dataclass(model_config_dict): + model_config_dict = dataclasses.asdict(model_config_dict) + default_config = dataclasses.asdict(DefaultModelConfig()) + # end: TODO + + self.observation_space = observation_space + self.action_space = action_space + + self._model_config_dict = default_config | model_config_dict + self._latent_dims = None + + self._determine_components_hook() + + @OverrideToImplementCustomLogic_CallToSuperRecommended + def _determine_components_hook(self): + """Decision tree hook for subclasses to override. + + By default, this method executes the decision tree that determines the + components that a Catalog builds. You can extend the components by overriding + this or by adding to the constructor of your subclass. + + Override this method if you don't want to use the default components + determined here. If you want to use them but add additional components, you + should call `super()._determine_components()` at the beginning of your + implementation. + + This makes it so that subclasses are not forced to create an encoder config + if the rest of their catalog is not dependent on it or if it breaks. 
+ At the end of this method, an attribute `Catalog.latent_dims` + should be set so that heads can be built using that information. + """ + self._encoder_config = self._get_encoder_config( + observation_space=self.observation_space, + action_space=self.action_space, + model_config_dict=self._model_config_dict, + ) + + # Create a function that can be called when framework is known to retrieve the + # class type for action distributions + self._action_dist_class_fn = functools.partial( + self._get_dist_cls_from_action_space, action_space=self.action_space + ) + + # The dimensions of the latent vector that is output by the encoder and fed + # to the heads. + self.latent_dims = self._encoder_config.output_dims + + @property + def latent_dims(self): + """Returns the latent dimensions of the encoder. + + This establishes an agreement between encoder and heads about the latent + dimensions. Encoders can be built to output a latent tensor with + `latent_dims` dimensions, and heads can be built with tensors of + `latent_dims` dimensions as inputs. This can be safely ignored if this + agreement is not needed in case of modifications to the Catalog. + + Returns: + The latent dimensions of the encoder. + """ + return self._latent_dims + + @latent_dims.setter + def latent_dims(self, value): + self._latent_dims = value + + @OverrideToImplementCustomLogic + def build_encoder(self, framework: str) -> Encoder: + """Builds the encoder. + + By default, this method builds an encoder instance from Catalog._encoder_config. + + You should override this if you want to use RLlib's default RL Modules but + only want to change the encoder. For example, if you want to use a custom + encoder, but want to use RLlib's default heads, action distribution and how + tensors are routed between them. If you want to have full control over the + RL Module, we recommend writing your own RL Module by inheriting from one of + RLlib's RL Modules instead. + + Args: + framework: The framework to use. 
Either "torch" or "tf2". + + Returns: + The encoder. + """ + assert hasattr(self, "_encoder_config"), ( + "You must define a `Catalog._encoder_config` attribute in your Catalog " + "subclass or override the `Catalog.build_encoder` method. By default, " + "an encoder_config is created in the __post_init__ method." + ) + return self._encoder_config.build(framework=framework) + + @OverrideToImplementCustomLogic + def get_action_dist_cls(self, framework: str): + """Get the action distribution class. + + The default behavior is to get the action distribution from the + `Catalog._action_dist_class_fn`. + + You should override this to have RLlib build your custom action + distribution instead of the default one. For example, if you don't want to + use RLlib's default RLModules with their default models, but only want to + change the distribution that Catalog returns. + + Args: + framework: The framework to use. Either "torch" or "tf2". + + Returns: + The action distribution. + """ + assert hasattr(self, "_action_dist_class_fn"), ( + "You must define a `Catalog._action_dist_class_fn` attribute in your " + "Catalog subclass or override the `Catalog.action_dist_class_fn` method. " + "By default, an action_dist_class_fn is created in the __post_init__ " + "method." + ) + return self._action_dist_class_fn(framework=framework) + + @classmethod + def _get_encoder_config( + cls, + observation_space: gym.Space, + model_config_dict: dict, + action_space: gym.Space = None, + ) -> ModelConfig: + """Returns an EncoderConfig for the given input_space and model_config_dict. + + Encoders are usually used in RLModules to transform the input space into a + latent space that is then fed to the heads. The returned EncoderConfig + objects correspond to the built-in Encoder classes in RLlib. + For example, for a simple 1D-Box input_space, RLlib offers an + MLPEncoder, hence this method returns the MLPEncoderConfig. 
You can overwrite + this method to produce specific EncoderConfigs for your custom Models. + + The following input spaces lead to the following configs: + - 1D-Box: MLPEncoderConfig + - 3D-Box: CNNEncoderConfig + # TODO (Artur): Support more spaces here + # ... + + Args: + observation_space: The observation space to use. + model_config_dict: The model config to use. + action_space: The action space to use if actions are to be encoded. This + is commonly the case for LSTM models. + + Returns: + The encoder config. + """ + activation = model_config_dict["fcnet_activation"] + output_activation = model_config_dict["fcnet_activation"] + use_lstm = model_config_dict["use_lstm"] + + if use_lstm: + encoder_config = RecurrentEncoderConfig( + input_dims=observation_space.shape, + recurrent_layer_type="lstm", + hidden_dim=model_config_dict["lstm_cell_size"], + hidden_weights_initializer=model_config_dict["lstm_kernel_initializer"], + hidden_weights_initializer_config=model_config_dict[ + "lstm_kernel_initializer_kwargs" + ], + hidden_bias_initializer=model_config_dict["lstm_bias_initializer"], + hidden_bias_initializer_config=model_config_dict[ + "lstm_bias_initializer_kwargs" + ], + batch_major=True, + num_layers=1, + tokenizer_config=cls.get_tokenizer_config( + observation_space, + model_config_dict, + ), + ) + else: + # TODO (Artur): Maybe check for original spaces here + # input_space is a 1D Box + if isinstance(observation_space, Box) and len(observation_space.shape) == 1: + # In order to guarantee backward compatability with old configs, + # we need to check if no latent dim was set and simply reuse the last + # fcnet hidden dim for that purpose. 
+ hidden_layer_dims = model_config_dict["fcnet_hiddens"][:-1] + encoder_latent_dim = model_config_dict["fcnet_hiddens"][-1] + encoder_config = MLPEncoderConfig( + input_dims=observation_space.shape, + hidden_layer_dims=hidden_layer_dims, + hidden_layer_activation=activation, + hidden_layer_weights_initializer=model_config_dict[ + "fcnet_kernel_initializer" + ], + hidden_layer_weights_initializer_config=model_config_dict[ + "fcnet_kernel_initializer_kwargs" + ], + hidden_layer_bias_initializer=model_config_dict[ + "fcnet_bias_initializer" + ], + hidden_layer_bias_initializer_config=model_config_dict[ + "fcnet_bias_initializer_kwargs" + ], + output_layer_dim=encoder_latent_dim, + output_layer_activation=output_activation, + output_layer_weights_initializer=model_config_dict[ + "fcnet_kernel_initializer" + ], + output_layer_weights_initializer_config=model_config_dict[ + "fcnet_kernel_initializer_kwargs" + ], + output_layer_bias_initializer=model_config_dict[ + "fcnet_bias_initializer" + ], + output_layer_bias_initializer_config=model_config_dict[ + "fcnet_bias_initializer_kwargs" + ], + ) + + # input_space is a 3D Box + elif ( + isinstance(observation_space, Box) and len(observation_space.shape) == 3 + ): + if not model_config_dict.get("conv_filters"): + model_config_dict["conv_filters"] = get_filter_config( + observation_space.shape + ) + + encoder_config = CNNEncoderConfig( + input_dims=observation_space.shape, + cnn_filter_specifiers=model_config_dict["conv_filters"], + cnn_activation=model_config_dict["conv_activation"], + cnn_kernel_initializer=model_config_dict["conv_kernel_initializer"], + cnn_kernel_initializer_config=model_config_dict[ + "conv_kernel_initializer_kwargs" + ], + cnn_bias_initializer=model_config_dict["conv_bias_initializer"], + cnn_bias_initializer_config=model_config_dict[ + "conv_bias_initializer_kwargs" + ], + ) + # input_space is a 2D Box + elif ( + isinstance(observation_space, Box) and len(observation_space.shape) == 2 + ): + # RLlib 
used to support 2D Box spaces by silently flattening them + raise ValueError( + f"No default encoder config for obs space={observation_space}," + f" lstm={use_lstm} found. 2D Box " + f"spaces are not supported. They should be either flattened to a " + f"1D Box space or enhanced to be a 3D box space." + ) + # input_space is a possibly nested structure of spaces. + else: + # NestedModelConfig + raise ValueError( + f"No default encoder config for obs space={observation_space}," + f" lstm={use_lstm} found." + ) + + return encoder_config + + @classmethod + @OverrideToImplementCustomLogic + def get_tokenizer_config( + cls, + observation_space: gym.Space, + model_config_dict: dict, + # deprecated args. + view_requirements=DEPRECATED_VALUE, + ) -> ModelConfig: + """Returns a tokenizer config for the given space. + + This is useful for recurrent / transformer models that need to tokenize their + inputs. By default, RLlib uses the models supported by Catalog out of the box to + tokenize. + + You should override this method if you want to change the custom tokenizer + inside current encoders that Catalog returns without providing the recurrent + network as a whole. For example, if you want to define some custom CNN layers + as a tokenizer for a recurrent encoder that already includes the recurrent + layers and handles the state. + + Args: + observation_space: The observation space to use. + model_config_dict: The model config to use. 
+ """ + if view_requirements != DEPRECATED_VALUE: + deprecation_warning(old="Catalog(view_requirements=..)", error=True) + + return cls._get_encoder_config( + observation_space=observation_space, + # Use model_config_dict without flags that would end up in complex models + model_config_dict={ + **model_config_dict, + **{"use_lstm": False, "use_attention": False}, + }, + ) + + @classmethod + def _get_dist_cls_from_action_space( + cls, + action_space: gym.Space, + *, + framework: Optional[str] = None, + ) -> Distribution: + """Returns a distribution class for the given action space. + + You can get the required input dimension for the distribution by calling + `action_dict_cls.required_input_dim(action_space)` + on the retrieved class. This is useful, because the Catalog needs to find out + about the required input dimension for the distribution before the model that + outputs these inputs is configured. + + Args: + action_space: Action space of the target gym env. + framework: The framework to use. + + Returns: + The distribution class for the given action space. + """ + # If no framework provided, return no action distribution class (None). + if framework is None: + return None + # This method is structured in two steps: + # Firstly, construct a dictionary containing the available distribution classes. + # Secondly, return the correct distribution class for the given action space. + + # Step 1: Construct the dictionary. 
+ + class DistEnum(enum.Enum): + Categorical = "Categorical" + DiagGaussian = "Gaussian" + Deterministic = "Deterministic" + MultiDistribution = "MultiDistribution" + MultiCategorical = "MultiCategorical" + + if framework == "torch": + from ray.rllib.models.torch.torch_distributions import ( + TorchCategorical, + TorchDeterministic, + TorchDiagGaussian, + ) + + distribution_dicts = { + DistEnum.Deterministic: TorchDeterministic, + DistEnum.DiagGaussian: TorchDiagGaussian, + DistEnum.Categorical: TorchCategorical, + } + elif framework == "tf2": + from ray.rllib.models.tf.tf_distributions import ( + TfCategorical, + TfDeterministic, + TfDiagGaussian, + ) + + distribution_dicts = { + DistEnum.Deterministic: TfDeterministic, + DistEnum.DiagGaussian: TfDiagGaussian, + DistEnum.Categorical: TfCategorical, + } + else: + raise ValueError( + f"Unknown framework: {framework}. Only 'torch' and 'tf2' are " + "supported for RLModule Catalogs." + ) + + # Only add a MultiAction distribution class to the dict if we can compute its + # components (we need a Tuple/Dict space for this). + if isinstance(action_space, (Tuple, Dict)): + partial_multi_action_distribution_cls = _multi_action_dist_partial_helper( + catalog_cls=cls, + action_space=action_space, + framework=framework, + ) + + distribution_dicts[ + DistEnum.MultiDistribution + ] = partial_multi_action_distribution_cls + + # Only add a MultiCategorical distribution class to the dict if we can compute + # its components (we need a MultiDiscrete space for this). + if isinstance(action_space, MultiDiscrete): + partial_multi_categorical_distribution_cls = ( + _multi_categorical_dist_partial_helper( + action_space=action_space, + framework=framework, + ) + ) + + distribution_dicts[ + DistEnum.MultiCategorical + ] = partial_multi_categorical_distribution_cls + + # Step 2: Return the correct distribution class for the given action space. + + # Box space -> DiagGaussian OR Deterministic. 
+ if isinstance(action_space, Box): + if action_space.dtype.char in np.typecodes["AllInteger"]: + raise ValueError( + "Box(..., `int`) action spaces are not supported. " + "Use MultiDiscrete or Box(..., `float`)." + ) + else: + if len(action_space.shape) > 1: + raise UnsupportedSpaceException( + f"Action space has multiple dimensions {action_space.shape}. " + f"Consider reshaping this into a single dimension, using a " + f"custom action distribution, using a Tuple action space, " + f"or the multi-agent API." + ) + return distribution_dicts[DistEnum.DiagGaussian] + + # Discrete Space -> Categorical. + elif isinstance(action_space, Discrete): + return distribution_dicts[DistEnum.Categorical] + + # Tuple/Dict Spaces -> MultiAction. + elif isinstance(action_space, (Tuple, Dict)): + return distribution_dicts[DistEnum.MultiDistribution] + + # Simplex -> Dirichlet. + elif isinstance(action_space, Simplex): + # TODO(Artur): Supported Simplex (in torch). + raise NotImplementedError("Simplex action space not yet supported.") + + # MultiDiscrete -> MultiCategorical. + elif isinstance(action_space, MultiDiscrete): + return distribution_dicts[DistEnum.MultiCategorical] + + # Unknown type -> Error. + else: + raise NotImplementedError(f"Unsupported action space: `{action_space}`") + + @staticmethod + def get_preprocessor(observation_space: gym.Space, **kwargs) -> Preprocessor: + """Returns a suitable preprocessor for the given observation space. + + Args: + observation_space: The input observation space. + **kwargs: Forward-compatible kwargs. + + Returns: + preprocessor: Preprocessor for the observations. + """ + # TODO(Artur): Since preprocessors have long been @PublicAPI with the options + # kwarg as part of their constructor, we fade out support for this, + # beginning with this entrypoint. + # Next, we should deprecate the `options` kwarg from the Preprocessor itself, + # after deprecating the old catalog and other components that still pass this. 
+ options = kwargs.get("options", {}) + if options: + deprecation_warning( + old="get_preprocessor_for_space(..., options={...})", + help="Override `Catalog.get_preprocessor()` " + "in order to implement custom behaviour.", + error=False, + ) + + if options.get("custom_preprocessor"): + deprecation_warning( + old="model_config['custom_preprocessor']", + help="Custom preprocessors are deprecated, " + "since they sometimes conflict with the built-in " + "preprocessors for handling complex observation spaces. " + "Please use wrapper classes around your environment " + "instead.", + error=True, + ) + else: + # TODO(Artur): Inline the get_preprocessor() call here once we have + # deprecated the old model catalog. + cls = get_preprocessor(observation_space) + prep = cls(observation_space, options) + return prep + + +def _multi_action_dist_partial_helper( + catalog_cls: "Catalog", action_space: gym.Space, framework: str +) -> Distribution: + """Helper method to get a partial of a MultiActionDistribution. + + This is useful for when we want to create MultiActionDistributions from + logits only (!) later, but know the action space now already. + + Args: + catalog_cls: The ModelCatalog class to use. + action_space: The action space to get the child distribution classes for. + framework: The framework to use. + + Returns: + A partial of the TorchMultiActionDistribution class. 
+ """ + action_space_struct = get_base_struct_from_space(action_space) + flat_action_space = flatten_space(action_space) + child_distribution_cls_struct = tree.map_structure( + lambda s: catalog_cls._get_dist_cls_from_action_space( + action_space=s, + framework=framework, + ), + action_space_struct, + ) + flat_distribution_clses = tree.flatten(child_distribution_cls_struct) + + logit_lens = [ + int(dist_cls.required_input_dim(space)) + for dist_cls, space in zip(flat_distribution_clses, flat_action_space) + ] + + if framework == "torch": + from ray.rllib.models.torch.torch_distributions import ( + TorchMultiDistribution, + ) + + multi_action_dist_cls = TorchMultiDistribution + elif framework == "tf2": + from ray.rllib.models.tf.tf_distributions import TfMultiDistribution + + multi_action_dist_cls = TfMultiDistribution + else: + raise ValueError(f"Unsupported framework: {framework}") + + partial_dist_cls = multi_action_dist_cls.get_partial_dist_cls( + space=action_space, + child_distribution_cls_struct=child_distribution_cls_struct, + input_lens=logit_lens, + ) + return partial_dist_cls + + +def _multi_categorical_dist_partial_helper( + action_space: gym.Space, framework: str +) -> Distribution: + """Helper method to get a partial of a MultiCategorical Distribution. + + This is useful for when we want to create MultiCategorical Distribution from + logits only (!) later, but know the action space now already. + + Args: + action_space: The action space to get the child distribution classes for. + framework: The framework to use. + + Returns: + A partial of the MultiCategorical class. 
+ """ + + if framework == "torch": + from ray.rllib.models.torch.torch_distributions import TorchMultiCategorical + + multi_categorical_dist_cls = TorchMultiCategorical + elif framework == "tf2": + from ray.rllib.models.tf.tf_distributions import TfMultiCategorical + + multi_categorical_dist_cls = TfMultiCategorical + else: + raise ValueError(f"Unsupported framework: {framework}") + + partial_dist_cls = multi_categorical_dist_cls.get_partial_dist_cls( + space=action_space, input_lens=list(action_space.nvec) + ) + + return partial_dist_cls diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/core/models/configs.py b/.venv/lib/python3.11/site-packages/ray/rllib/core/models/configs.py new file mode 100644 index 0000000000000000000000000000000000000000..60a0758bbd76055e67e89a66da5185d5239d986e --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/core/models/configs.py @@ -0,0 +1,1095 @@ +import abc +from dataclasses import dataclass, field +import functools +from typing import Callable, Dict, List, Optional, Tuple, TYPE_CHECKING, Union + +import numpy as np + +from ray.rllib.models.torch.misc import ( + same_padding, + same_padding_transpose_after_stride, + valid_padding, +) +from ray.rllib.models.utils import get_activation_fn, get_initializer_fn +from ray.rllib.utils.annotations import ExperimentalAPI + +if TYPE_CHECKING: + from ray.rllib.core.models.base import Model, Encoder + + +@ExperimentalAPI +def _framework_implemented(torch: bool = True, tf2: bool = True): + """Decorator to check if a model was implemented in a framework. + + Args: + torch: Whether we can build this model with torch. + tf2: Whether we can build this model with tf2. + + Returns: + The decorated function. + + Raises: + ValueError: If the framework is not available to build. 
+ """ + accepted = [] + if torch: + accepted.append("torch") + if tf2: + accepted.append("tf2") + + def decorator(fn: Callable) -> Callable: + @functools.wraps(fn) + def checked_build(self, framework, **kwargs): + if framework not in accepted: + raise ValueError( + f"This config does not support framework " + f"{framework}. Only frameworks in {accepted} are " + f"supported." + ) + return fn(self, framework, **kwargs) + + return checked_build + + return decorator + + +@ExperimentalAPI +@dataclass +class ModelConfig(abc.ABC): + """Base class for configuring a `Model` instance. + + ModelConfigs are DL framework-agnostic. + A `Model` (as a sub-component of an `RLModule`) is built via calling the + respective ModelConfig's `build()` method. + RLModules build their sub-components this way after receiving one or more + `ModelConfig` instances from a Catalog object. + + However, `ModelConfig` is not restricted to be used only with Catalog or RLModules. + Usage examples can be found in the individual Model classes', e.g. + see `ray.rllib.core.models.configs::MLPHeadConfig`. + + Attributes: + input_dims: The input dimensions of the network + always_check_shapes: Whether to always check the inputs and outputs of the + model for the specifications. Input specifications are checked on failed + forward passes of the model regardless of this flag. If this flag is set + to `True`, inputs and outputs are checked on every call. This leads to + a slow-down and should only be used for debugging. + """ + + input_dims: Union[List[int], Tuple[int]] = None + always_check_shapes: bool = False + + @abc.abstractmethod + def build(self, framework: str): + """Builds the model. + + Args: + framework: The framework to use for building the model. 
+ """ + raise NotImplementedError + + @property + def output_dims(self) -> Optional[Tuple[int]]: + """Read-only `output_dims` are inferred automatically from other settings.""" + return None + + +@ExperimentalAPI +@dataclass +class _MLPConfig(ModelConfig): + """Generic configuration class for multi-layer-perceptron based Model classes. + + `output_dims` is reached by either the provided `output_layer_dim` setting (int) OR + by the last entry of `hidden_layer_dims`. In the latter case, no special output + layer is added and all layers in the stack behave exactly the same. If + `output_layer_dim` is provided, users might also change this last layer's + activation (`output_layer_activation`) and its bias setting + (`output_layer_use_bias`). + + This is a private class as users should not configure their models directly + through this class, but use one of the sub-classes, e.g. `MLPHeadConfig` or + `MLPEncoderConfig`. + + Attributes: + input_dims: A 1D tensor indicating the input dimension, e.g. `[32]`. + hidden_layer_dims: The sizes of the hidden layers. If an empty list, + `output_layer_dim` must be provided (int) and only a single layer will be + built. + hidden_layer_use_bias: Whether to use bias on all dense layers in the network + (excluding a possible separate output layer defined by `output_layer_dim`). + hidden_layer_activation: The activation function to use after each layer ( + except for the output). The default activation for hidden layers is "relu". + hidden_layer_use_layernorm: Whether to insert a LayerNorm functionality + in between each hidden layer's output and its activation. + hidden_layer_weights_initializer: The initializer function or class to use for + weight initialization in the hidden layers. If `None` the default + initializer of the respective dense layer of a framework (`"torch"` or + `"tf2"`) is used. Note, all initializers defined in the framework `"tf2`) + are allowed. For `"torch"` only the in-place initializers, i.e. 
ending with + an underscore "_" are allowed. + hidden_layer_weights_initializer_config: Configuration to pass into the + initializer defined in `hidden_layer_weights_initializer`. + hidden_layer_bias_initializer: The initializer function or class to use for + bias initialization in the hidden layers. If `None` the default initializer + of the respective dense layer of a framework (`"torch"` or `"tf2"`) is used. + Note, all initializers defined in the framework `"tf2`) are allowed. For + `"torch"` only the in-place initializers, i.e. ending with an underscore "_" + are allowed. + hidden_layer_bias_initializer_config: Configuration to pass into the + initializer defined in `hidden_layer_bias_initializer`. + output_layer_dim: An int indicating the size of the output layer. This may be + set to `None` in case no extra output layer should be built and only the + layers specified by `hidden_layer_dims` will be part of the network. + output_layer_use_bias: Whether to use bias on the separate output layer, if any. + output_layer_activation: The activation function to use for the output layer, + if any. The default activation for the output layer, if any, is "linear", + meaning no activation. + output_layer_weights_initializer: The initializer function or class to use for + weight initialization in the output layers. If `None` the default + initializer of the respective dense layer of a framework (`"torch"` or ` + "tf2"`) is used. Note, all initializers defined in the framework `"tf2`) are + allowed. For `"torch"` only the in-place initializers, i.e. ending with an + underscore "_" are allowed. + output_layer_weights_initializer_config: Configuration to pass into the + initializer defined in `output_layer_weights_initializer`. + output_layer_bias_initializer: The initializer function or class to use for + bias initialization in the output layers. If `None` the default initializer + of the respective dense layer of a framework (`"torch"` or `"tf2"`) is used. 
+ For `"torch"` only the in-place initializers, i.e. ending with an underscore + "_" are allowed. + output_layer_bias_initializer_config: Configuration to pass into the + initializer defined in `output_layer_bias_initializer`. + clip_log_std: If log std should be clipped by `log_std_clip_param`. This applies + only to the action distribution parameters that encode the log standard + deviation of a `DiagGaussian` distribution. + log_std_clip_param: The clipping parameter for the log std, if clipping should + be applied - i.e. `clip_log_std=True`. The default value is 20, i.e. log + stds are clipped in between -20 and 20. + """ + + hidden_layer_dims: Union[List[int], Tuple[int]] = (256, 256) + hidden_layer_use_bias: bool = True + hidden_layer_activation: str = "relu" + hidden_layer_use_layernorm: bool = False + hidden_layer_weights_initializer: Optional[Union[str, Callable]] = None + hidden_layer_weights_initializer_config: Optional[Dict] = None + hidden_layer_bias_initializer: Optional[Union[str, Callable]] = None + hidden_layer_bias_initializer_config: Optional[Dict] = None + + # Optional last output layer with - possibly - different activation and use_bias + # settings. + output_layer_dim: Optional[int] = None + output_layer_use_bias: bool = True + output_layer_activation: str = "linear" + output_layer_weights_initializer: Optional[Union[str, Callable]] = None + output_layer_weights_initializer_config: Optional[Dict] = None + output_layer_bias_initializer: Optional[Union[str, Callable]] = None + output_layer_bias_initializer_config: Optional[Dict] = None + + # Optional clipping of log standard deviation. + clip_log_std: bool = False + # Optional clip parameter for the log standard deviation. + log_std_clip_param: float = 20.0 + + @property + def output_dims(self): + if self.output_layer_dim is None and not self.hidden_layer_dims: + raise ValueError( + "If `output_layer_dim` is None, you must specify at least one hidden " + "layer dim, e.g. 
`hidden_layer_dims=[32]`!" + ) + + # Infer `output_dims` automatically. + return (int(self.output_layer_dim or self.hidden_layer_dims[-1]),) + + def _validate(self, framework: str = "torch"): + """Makes sure that settings are valid.""" + if self.input_dims is not None and len(self.input_dims) != 1: + raise ValueError( + f"`input_dims` ({self.input_dims}) of MLPConfig must be 1D, " + "e.g. `[32]`!" + ) + if len(self.output_dims) != 1: + raise ValueError( + f"`output_dims` ({self.output_dims}) of _MLPConfig must be " + "1D, e.g. `[32]`! This is an inferred value, hence other settings might" + " be wrong." + ) + if self.log_std_clip_param is None: + raise ValueError( + "`log_std_clip_param` of _MLPConfig must be a float value, but is " + "`None`." + ) + + # Call these already here to catch errors early on. + get_activation_fn(self.hidden_layer_activation, framework=framework) + get_activation_fn(self.output_layer_activation, framework=framework) + get_initializer_fn(self.hidden_layer_weights_initializer, framework=framework) + get_initializer_fn(self.hidden_layer_bias_initializer, framework=framework) + get_initializer_fn(self.output_layer_weights_initializer, framework=framework) + get_initializer_fn(self.output_layer_bias_initializer, framework=framework) + + +@ExperimentalAPI +@dataclass +class MLPHeadConfig(_MLPConfig): + """Configuration for an MLP head. + + See _MLPConfig for usage details. + + Example: + + .. testcode:: + + # Configuration: + config = MLPHeadConfig( + input_dims=[4], # must be 1D tensor + hidden_layer_dims=[8, 8], + hidden_layer_activation="relu", + hidden_layer_use_layernorm=False, + # final output layer with no activation (linear) + output_layer_dim=2, + output_layer_activation="linear", + ) + model = config.build(framework="tf2") + + # Resulting stack in pseudocode: + # Linear(4, 8, bias=True) + # ReLU() + # Linear(8, 8, bias=True) + # ReLU() + # Linear(8, 2, bias=True) + + Example: + + .. 
testcode:: + + # Configuration: + config = MLPHeadConfig( + input_dims=[2], + hidden_layer_dims=[10, 4], + hidden_layer_activation="silu", + hidden_layer_use_layernorm=True, + hidden_layer_use_bias=False, + # Initializer for `framework="torch"`. + hidden_layer_weights_initializer="xavier_normal_", + hidden_layer_weights_initializer_config={"gain": 0.8}, + # No final output layer (use last dim in `hidden_layer_dims` + # as the size of the last layer in the stack). + output_layer_dim=None, + ) + model = config.build(framework="torch") + + # Resulting stack in pseudocode: + # Linear(2, 10, bias=False) + # LayerNorm((10,)) # layer norm always before activation + # SiLU() + # Linear(10, 4, bias=False) + # LayerNorm((4,)) # layer norm always before activation + # SiLU() + """ + + @_framework_implemented() + def build(self, framework: str = "torch") -> "Model": + self._validate(framework=framework) + + if framework == "torch": + from ray.rllib.core.models.torch.heads import TorchMLPHead + + return TorchMLPHead(self) + else: + from ray.rllib.core.models.tf.heads import TfMLPHead + + return TfMLPHead(self) + + +@ExperimentalAPI +@dataclass +class FreeLogStdMLPHeadConfig(_MLPConfig): + """Configuration for an MLPHead with a floating second half of outputs. + + This model can be useful together with Gaussian Distributions. + This gaussian distribution would be conditioned as follows: + - The first half of outputs from this model can be used as + state-dependent means when conditioning a gaussian distribution + - The second half are floating free biases that can be used as + state-independent standard deviations to condition a gaussian distribution. + The mean values are produced by an MLPHead, while the standard + deviations are added as floating free biases from a single 1D trainable variable + (not dependent on the net's inputs). 
+ + The output dimensions of the configured MLPHeadConfig must be even and are + divided by two to gain the output dimensions of each the mean-net and the + free std-variable. + + Example: + .. testcode:: + :skipif: True + + # Configuration: + config = FreeLogStdMLPHeadConfig( + input_dims=[2], + hidden_layer_dims=[16], + hidden_layer_activation=None, + hidden_layer_use_layernorm=False, + hidden_layer_use_bias=True, + output_layer_dim=8, # <- this must be an even size + output_layer_use_bias=True, + ) + model = config.build(framework="tf2") + + # Resulting stack in pseudocode: + # Linear(2, 16, bias=True) + # Linear(8, 8, bias=True) # 16 / 2 = 8 -> 8 nodes for the mean + # Extra variable: + # Tensor((8,), float32) # for the free (observation independent) std outputs + + Example: + .. testcode:: + :skipif: True + + # Configuration: + config = FreeLogStdMLPHeadConfig( + input_dims=[2], + hidden_layer_dims=[31, 100], # <- last idx must be an even size + hidden_layer_activation="relu", + hidden_layer_use_layernorm=False, + hidden_layer_use_bias=False, + output_layer_dim=None, # use the last hidden layer as output layer + ) + model = config.build(framework="torch") + + # Resulting stack in pseudocode: + # Linear(2, 31, bias=False) + # ReLu() + # Linear(31, 50, bias=False) # 100 / 2 = 50 -> 50 nodes for the mean + # ReLu() + # Extra variable: + # Tensor((50,), float32) # for the free (observation independent) std outputs + """ + + def _validate(self, framework: str = "torch"): + if len(self.output_dims) > 1 or self.output_dims[0] % 2 == 1: + raise ValueError( + f"`output_layer_dim` ({self.ouput_layer_dim}) or the last value in " + f"`hidden_layer_dims` ({self.hidden_layer_dims}) of a " + "FreeLogStdMLPHeadConfig must be an even int (dividable by 2), " + "e.g. `output_layer_dim=8` or `hidden_layer_dims=[133, 128]`!" 
+ ) + + @_framework_implemented() + def build(self, framework: str = "torch") -> "Model": + self._validate(framework=framework) + + if framework == "torch": + from ray.rllib.core.models.torch.heads import TorchFreeLogStdMLPHead + + return TorchFreeLogStdMLPHead(self) + else: + from ray.rllib.core.models.tf.heads import TfFreeLogStdMLPHead + + return TfFreeLogStdMLPHead(self) + + +@ExperimentalAPI +@dataclass +class CNNTransposeHeadConfig(ModelConfig): + """Configuration for a convolutional transpose head (decoder) network. + + The configured Model transforms 1D-observations into an image space. + The stack of layers is composed of an initial Dense layer, followed by a sequence + of Conv2DTranspose layers. + `input_dims` describes the shape of the (1D) input tensor, + `initial_image_dims` describes the input into the first Conv2DTranspose + layer, where the translation from `input_dim` to `initial_image_dims` is done + via the initial Dense layer (w/o activation, w/o layer-norm, and w/ bias). + + Beyond that, each layer specified by `cnn_transpose_filter_specifiers` + is followed by an activation function according to `cnn_transpose_activation`. + + `output_dims` is reached after the final Conv2DTranspose layer. + Not that the last Conv2DTranspose layer is never activated and never layer-norm'd + regardless of the other settings. + + An example for a single conv2d operation is as follows: + Input "image" is (4, 4, 24) (not yet strided), padding is "same", stride=2, + kernel=5. 
+ + First, the input "image" is strided (with stride=2): + + Input image (4x4 (x24)): + A B C D + E F G H + I J K L + M N O P + + Stride with stride=2 -> (7x7 (x24)) + A 0 B 0 C 0 D + 0 0 0 0 0 0 0 + E 0 F 0 G 0 H + 0 0 0 0 0 0 0 + I 0 J 0 K 0 L + 0 0 0 0 0 0 0 + M 0 N 0 O 0 P + + Then this strided "image" (strided_size=7x7) is padded (exact padding values will be + computed by the model): + + Padding -> (left=3, right=2, top=3, bottom=2) + + 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 A 0 B 0 C 0 D 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 E 0 F 0 G 0 H 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 I 0 J 0 K 0 L 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 M 0 N 0 O 0 P 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 + + Then deconvolution with kernel=5 yields an output "image" of 8x8 (x num output + filters). + + Attributes: + input_dims: The input dimensions of the network. This must be a 1D tensor. + initial_image_dims: The shape of the input to the first + Conv2DTranspose layer. We will make sure the input is transformed to + these dims via a preceding initial Dense layer, followed by a reshape, + before entering the Conv2DTranspose stack. + initial_dense_weights_initializer: The initializer function or class to use for + weight initialization in the initial dense layer. If `None` the default + initializer of the respective dense layer of a framework (`"torch"` or + `"tf2"`) is used. Note, all initializers defined in the framework `"tf2`) + are allowed. For `"torch"` only the in-place initializers, i.e. ending with + an underscore "_" are allowed. + initial_dense_weights_initializer_config: Configuration to pass into the + initializer defined in `initial_dense_weights_initializer`. + initial_dense_bias_initializer: The initializer function or class to use for + bias initialization in the initial dense layer. If `None` the default + initializer of the respective CNN layer of a framework (`"torch"` or `"tf2"` + ) is used. 
For `"torch"` only the in-place initializers, i.e. ending with an + underscore "_" are allowed. + initial_dense_bias_initializer_config: Configuration to pass into the + initializer defined in `initial_dense_bias_initializer`. + cnn_transpose_filter_specifiers: A list of lists, where each element of an inner + list contains elements of the form + `[number of channels/filters, [kernel width, kernel height], stride]` to + specify a convolutional layer stacked in order of the outer list. + cnn_transpose_use_bias: Whether to use bias on all Conv2DTranspose layers. + cnn_transpose_activation: The activation function to use after each layer + (except for the output). + cnn_transpose_use_layernorm: Whether to insert a LayerNorm functionality + in between each Conv2DTranspose layer's output and its activation. + cnn_transpose_kernel_initializer: The initializer function or class to use for + kernel initialization in the CNN layers. If `None` the default initializer + of the respective CNN layer of a framework (`"torch"` or `"tf2"`) is used. + Note, all initializers defined in the framework `"tf2`) are allowed. For + `"torch"` only the in-place initializers, i.e. ending with an underscore "_" + are allowed. + cnn_transpose_kernel_initializer_config: Configuration to pass into the + initializer defined in `cnn_transpose_kernel_initializer`. + cnn_transpose_bias_initializer: The initializer function or class to use for + bias initialization in the CNN layers. If `None` the default initializer of + the respective CNN layer of a framework (`"torch"` or `"tf2"`) is used. + For `"torch"` only the in-place initializers, i.e. ending with an underscore + "_" are allowed. + cnn_transpose_bias_initializer_config: Configuration to pass into the + initializer defined in `cnn_transpose_bias_initializer`. + + Example: + .. 
testcode:: + :skipif: True + + # Configuration: + config = CNNTransposeHeadConfig( + input_dims=[10], # 1D input vector (possibly coming from another NN) + initial_image_dims=[4, 4, 96], # first image input to deconv stack + # Initializer for TensorFlow. + initial_dense_weights_initializer="HeNormal", + initial_dense_weights_initializer={"seed": 334}, + cnn_transpose_filter_specifiers=[ + [48, [4, 4], 2], + [24, [4, 4], 2], + [3, [4, 4], 2], + ], + cnn_transpose_activation="silu", # or "swish", which is the same + cnn_transpose_use_layernorm=False, + cnn_use_bias=True, + ) + model = config.build(framework="torch) + + # Resulting stack in pseudocode: + # Linear(10, 4*4*24) + # Conv2DTranspose( + # in_channels=96, out_channels=48, + # kernel_size=[4, 4], stride=2, bias=True, + # ) + # Swish() + # Conv2DTranspose( + # in_channels=48, out_channels=24, + # kernel_size=[4, 4], stride=2, bias=True, + # ) + # Swish() + # Conv2DTranspose( + # in_channels=24, out_channels=3, + # kernel_size=[4, 4], stride=2, bias=True, + # ) + + Example: + .. testcode:: + :skipif: True + + # Configuration: + config = CNNTransposeHeadConfig( + input_dims=[128], # 1D input vector (possibly coming from another NN) + initial_image_dims=[4, 4, 32], # first image input to deconv stack + cnn_transpose_filter_specifiers=[ + [16, 4, 2], + [3, 4, 2], + ], + cnn_transpose_activation="relu", + cnn_transpose_use_layernorm=True, + cnn_use_bias=False, + # Initializer for `framework="tf2"`. + # Note, for Torch only in-place initializers are allowed. 
+ cnn_transpose_kernel_initializer="xavier_normal_", + cnn_transpose_kernel_initializer_config={"gain": 0.8}, + ) + model = config.build(framework="torch) + + # Resulting stack in pseudocode: + # Linear(128, 4*4*32, bias=True) # bias always True for initial dense layer + # Conv2DTranspose( + # in_channels=32, out_channels=16, + # kernel_size=[4, 4], stride=2, bias=False, + # ) + # LayerNorm((-3, -2, -1)) # layer normalize over last 3 axes + # ReLU() + # Conv2DTranspose( + # in_channels=16, out_channels=3, + # kernel_size=[4, 4], stride=2, bias=False, + # ) + """ + + input_dims: Union[List[int], Tuple[int]] = None + initial_image_dims: Union[List[int], Tuple[int]] = field( + default_factory=lambda: [4, 4, 96] + ) + initial_dense_weights_initializer: Optional[Union[str, Callable]] = None + initial_dense_weights_initializer_config: Optional[Dict] = None + initial_dense_bias_initializer: Optional[Union[str, Callable]] = None + initial_dense_bias_initializer_config: Optional[Dict] = None + cnn_transpose_filter_specifiers: List[List[Union[int, List[int]]]] = field( + default_factory=lambda: [[48, [4, 4], 2], [24, [4, 4], 2], [3, [4, 4], 2]] + ) + cnn_transpose_use_bias: bool = True + cnn_transpose_activation: str = "relu" + cnn_transpose_use_layernorm: bool = False + cnn_transpose_kernel_initializer: Optional[Union[str, Callable]] = None + cnn_transpose_kernel_initializer_config: Optional[Dict] = None + cnn_transpose_bias_initializer: Optional[Union[str, Callable]] = None + cnn_transpose_bias_initializer_config: Optional[Dict] = None + + @property + def output_dims(self): + # Infer output dims, layer by layer. + dims = self.initial_image_dims + for filter_spec in self.cnn_transpose_filter_specifiers: + # Same padding. + num_filters, kernel, stride = filter_spec + # Compute stride output size first (striding is performed first in a + # conv transpose layer. 
+ stride_w, stride_h = (stride, stride) if isinstance(stride, int) else stride + dims = [ + dims[0] * stride_w - (stride_w - 1), + dims[1] * stride_h - (stride_h - 1), + num_filters, + ] + # TODO (Sven): Support "valid" padding for Conv2DTranspose layers, too. + # Analogous to Conv2D Layers in a CNNEncoder. + # Apply the correct padding. Note that this might be asymetrical, meaning + # left padding might be != right padding, same for top/bottom. + _, padding_out_size = same_padding_transpose_after_stride( + (dims[0], dims[1]), kernel, stride + ) + # Perform conv transpose operation with the kernel. + kernel_w, kernel_h = (kernel, kernel) if isinstance(kernel, int) else kernel + dims = [ + padding_out_size[0] - (kernel_w - 1), + padding_out_size[1] - (kernel_h - 1), + num_filters, + ] + return tuple(dims) + + def _validate(self, framework: str = "torch"): + if len(self.input_dims) != 1: + raise ValueError( + f"`input_dims` ({self.input_dims}) of CNNTransposeHeadConfig must be a " + "3D tensor (image-like) with the dimensions meaning: width x height x " + "num_filters, e.g. `[4, 4, 92]`!" + ) + + @_framework_implemented() + def build(self, framework: str = "torch") -> "Model": + self._validate(framework) + + if framework == "torch": + from ray.rllib.core.models.torch.heads import TorchCNNTransposeHead + + return TorchCNNTransposeHead(self) + + elif framework == "tf2": + from ray.rllib.core.models.tf.heads import TfCNNTransposeHead + + return TfCNNTransposeHead(self) + + +@ExperimentalAPI +@dataclass +class CNNEncoderConfig(ModelConfig): + """Configuration for a convolutional (encoder) network. + + The configured CNN encodes 3D-observations into a latent space. + The stack of layers is composed of a sequence of convolutional layers. + `input_dims` describes the shape of the input tensor. Beyond that, each layer + specified by `filter_specifiers` is followed by an activation function according + to `filter_activation`. 
+ + `output_dims` is reached by either the final convolutional layer's output directly + OR by flatten this output. + + See ModelConfig for usage details. + + Example: + + .. testcode:: + + # Configuration: + config = CNNEncoderConfig( + input_dims=[84, 84, 3], # must be 3D tensor (image: w x h x C) + cnn_filter_specifiers=[ + [16, [8, 8], 4], + [32, [4, 4], 2], + ], + cnn_activation="relu", + cnn_use_layernorm=False, + cnn_use_bias=True, + ) + model = config.build(framework="torch") + + # Resulting stack in pseudocode: + # Conv2D( + # in_channels=3, out_channels=16, + # kernel_size=[8, 8], stride=[4, 4], bias=True, + # ) + # ReLU() + # Conv2D( + # in_channels=16, out_channels=32, + # kernel_size=[4, 4], stride=[2, 2], bias=True, + # ) + # ReLU() + # Conv2D( + # in_channels=32, out_channels=1, + # kernel_size=[1, 1], stride=[1, 1], bias=True, + # ) + # Flatten() + + Attributes: + input_dims: The input dimension of the network. These must be given in the + form of `(width, height, channels)`. + cnn_filter_specifiers: A list in which each element is another (inner) list + of either the following forms: + `[number of channels/filters, kernel, stride]` + OR: + `[number of channels/filters, kernel, stride, padding]`, where `padding` + can either be "same" or "valid". + When using the first format w/o the `padding` specifier, `padding` is "same" + by default. Also, `kernel` and `stride` may be provided either as single + ints (square) or as a tuple/list of two ints (width- and height dimensions) + for non-squared kernel/stride shapes. + A good rule of thumb for constructing CNN stacks is: + When using padding="same", the input "image" will be reduced in size by + the factor `stride`, e.g. input=(84, 84, 3) stride=2 kernel=x padding="same" + filters=16 -> output=(42, 42, 16). 
+ For example, if you would like to reduce an Atari image from its original + (84, 84, 3) dimensions down to (6, 6, F), you can construct the following + stack and reduce the w x h dimension of the image by 2 in each layer: + [[16, 4, 2], [32, 4, 2], [64, 4, 2], [128, 4, 2]] -> output=(6, 6, 128) + cnn_use_bias: Whether to use bias on all Conv2D layers. + cnn_activation: The activation function to use after each layer ( + except for the output). The default activation for Conv2d layers is "relu". + cnn_use_layernorm: Whether to insert a LayerNorm functionality + in between each CNN layer's output and its activation. Note that + the output layer. + cnn_kernel_initializer: The initializer function or class to use for kernel + initialization in the CNN layers. If `None` the default initializer of the + respective CNN layer of a framework (`"torch"` or `"tf2"`) is used. Note, + all initializers defined in the framework `"tf2`) are allowed. For `"torch"` + only the in-place initializers, i.e. ending with an underscore "_" are + allowed. + cnn_kernel_initializer_config: Configuration to pass into the initializer + defined in `cnn_kernel_initializer`. + cnn_bias_initializer: The initializer function or class to use for bias + initialization in the CNN layers. If `None` the default initializer of + the respective CNN layer of a framework (`"torch"` or `"tf2"`) is used. + For `"torch"` only the in-place initializers, i.e. ending with an underscore + "_" are allowed. + cnn_bias_initializer_config: Configuration to pass into the initializer defined + in `cnn_bias_initializer`. + flatten_at_end: Whether to flatten the output of the last conv 2D layer into + a 1D tensor. By default, this is True. Note that if you set this to False, + you might simply stack another CNNEncoder on top of this one (maybe with + different activation and bias settings). 
+ """ + + input_dims: Union[List[int], Tuple[int]] = None + cnn_filter_specifiers: List[List[Union[int, List[int]]]] = field( + default_factory=lambda: [[16, [4, 4], 2], [32, [4, 4], 2], [64, [8, 8], 2]] + ) + cnn_use_bias: bool = True + cnn_activation: str = "relu" + cnn_use_layernorm: bool = False + cnn_kernel_initializer: Optional[Union[str, Callable]] = None + cnn_kernel_initializer_config: Optional[Dict] = None + cnn_bias_initializer: Optional[Union[str, Callable]] = None + cnn_bias_initializer_config: Optional[Dict] = None + flatten_at_end: bool = True + + @property + def output_dims(self): + if not self.input_dims: + return None + + # Infer output dims, layer by layer. + dims = self.input_dims # Creates a copy (works for tuple/list). + for filter_spec in self.cnn_filter_specifiers: + # Padding not provided, "same" by default. + if len(filter_spec) == 3: + num_filters, kernel, stride = filter_spec + padding = "same" + # Padding option provided, use given value. + else: + num_filters, kernel, stride, padding = filter_spec + + # Same padding. + if padding == "same": + _, dims = same_padding(dims[:2], kernel, stride) + # Valid padding. + else: + dims = valid_padding(dims[:2], kernel, stride) + + # Add depth (num_filters) to the end (our utility functions for same/valid + # only return the image width/height). + dims = [dims[0], dims[1], num_filters] + + # Flatten everything. + if self.flatten_at_end: + return (int(np.prod(dims)),) + + return tuple(dims) + + def _validate(self, framework: str = "torch"): + if len(self.input_dims) != 3: + raise ValueError( + f"`input_dims` ({self.input_dims}) of CNNEncoderConfig must be a 3D " + "tensor (image) with the dimensions meaning: width x height x " + "channels, e.g. `[64, 64, 3]`!" + ) + if not self.flatten_at_end and len(self.output_dims) != 3: + raise ValueError( + f"`output_dims` ({self.output_dims}) of CNNEncoderConfig must be " + "3D, e.g. `[4, 4, 128]`, b/c your `flatten_at_end` setting is False! 
" + "`output_dims` is an inferred value, hence other settings might be " + "wrong." + ) + elif self.flatten_at_end and len(self.output_dims) != 1: + raise ValueError( + f"`output_dims` ({self.output_dims}) of CNNEncoderConfig must be " + "1D, e.g. `[32]`, b/c your `flatten_at_end` setting is True! " + "`output_dims` is an inferred value, hence other settings might be " + "wrong." + ) + + @_framework_implemented() + def build(self, framework: str = "torch") -> "Model": + self._validate(framework) + + if framework == "torch": + from ray.rllib.core.models.torch.encoder import TorchCNNEncoder + + return TorchCNNEncoder(self) + + elif framework == "tf2": + from ray.rllib.core.models.tf.encoder import TfCNNEncoder + + return TfCNNEncoder(self) + + +@ExperimentalAPI +@dataclass +class MLPEncoderConfig(_MLPConfig): + """Configuration for an MLP that acts as an encoder. + + See _MLPConfig for usage details. + + Example: + .. testcode:: + + # Configuration: + config = MLPEncoderConfig( + input_dims=[4], # must be 1D tensor + hidden_layer_dims=[16], + hidden_layer_activation="relu", + hidden_layer_use_layernorm=False, + output_layer_dim=None, # maybe None or an int + ) + model = config.build(framework="torch") + + # Resulting stack in pseudocode: + # Linear(4, 16, bias=True) + # ReLU() + + Example: + .. 
testcode:: + + # Configuration: + config = MLPEncoderConfig( + input_dims=[2], + hidden_layer_dims=[8, 8], + hidden_layer_activation="silu", + hidden_layer_use_layernorm=True, + hidden_layer_use_bias=False, + output_layer_dim=4, + output_layer_activation="tanh", + output_layer_use_bias=False, + ) + model = config.build(framework="tf2") + + # Resulting stack in pseudocode: + # Linear(2, 8, bias=False) + # LayerNorm((8,)) # layernorm always before activation + # SiLU() + # Linear(8, 8, bias=False) + # LayerNorm((8,)) # layernorm always before activation + # SiLU() + # Linear(8, 4, bias=False) + # Tanh() + """ + + @_framework_implemented() + def build(self, framework: str = "torch") -> "Encoder": + self._validate(framework) + + if framework == "torch": + from ray.rllib.core.models.torch.encoder import TorchMLPEncoder + + return TorchMLPEncoder(self) + else: + from ray.rllib.core.models.tf.encoder import TfMLPEncoder + + return TfMLPEncoder(self) + + +@ExperimentalAPI +@dataclass +class RecurrentEncoderConfig(ModelConfig): + """Configuration for an LSTM-based or a GRU-based encoder. + + The encoder consists of... + - Zero or one tokenizers + - N LSTM/GRU layers stacked on top of each other and feeding + their outputs as inputs to the respective next layer. + + This makes for the following flow of tensors: + + Inputs + | + [Tokenizer if present] + | + LSTM layer 1 + | + (...) + | + LSTM layer n + | + Outputs + + The internal state is structued as (num_layers, B, hidden-size) for all hidden + state components, e.g. + h- and c-states of the LSTM layer(s) or h-state of the GRU layer(s). + For example, the hidden states of an LSTMEncoder with num_layers=2 and hidden_dim=8 + would be: {"h": (2, B, 8), "c": (2, B, 8)}. + + `output_dims` is reached by the last recurrent layer's dimension, which is always + the `hidden_dims` value. + + Example: + .. 
testcode:: + + # Configuration: + config = RecurrentEncoderConfig( + recurrent_layer_type="lstm", + input_dims=[16], # must be 1D tensor + hidden_dim=128, + num_layers=2, + use_bias=True, + ) + model = config.build(framework="torch") + + # Resulting stack in pseudocode: + # LSTM(16, 128, bias=True) + # LSTM(128, 128, bias=True) + + # Resulting shape of the internal states (c- and h-states): + # (2, B, 128) for each c- and h-states. + + Example: + .. testcode:: + + # Configuration: + config = RecurrentEncoderConfig( + recurrent_layer_type="gru", + input_dims=[32], # must be 1D tensor + hidden_dim=64, + num_layers=1, + use_bias=False, + ) + model = config.build(framework="torch") + + # Resulting stack in pseudocode: + # GRU(32, 64, bias=False) + + # Resulting shape of the internal state: + # (1, B, 64) + + Attributes: + input_dims: The input dimensions. Must be 1D. This is the 1D shape of the tensor + that goes into the first recurrent layer. + recurrent_layer_type: The type of the recurrent layer(s). + Either "lstm" or "gru". + hidden_dim: The size of the hidden internal state(s) of the recurrent layer(s). + For example, for an LSTM, this would be the size of the c- and h-tensors. + num_layers: The number of recurrent (LSTM or GRU) layers to stack. + batch_major: Wether the input is batch major (B, T, ..) or + time major (T, B, ..). + hidden_weights_initializer: The initializer function or class to use for + kernel initialization in the hidden layers. If `None` the default + initializer of the respective recurrent layer of a framework (`"torch"` or + `"tf2"`) is used. Note, all initializers defined in the frameworks ( + `"torch"` or `"tf2`) are allowed. For `"torch"` only the in-place + initializers, i.e. ending with an underscore "_" are allowed. + hidden_weights_initializer_config: Configuration to pass into the + initializer defined in `hidden_weights_initializer`. + use_bias: Whether to use bias on the recurrent layers in the network. 
+ hidden_bias_initializer: The initializer function or class to use for bias + initialization in the hidden layers. If `None` the default initializer of + the respective recurrent layer of a framework (`"torch"` or `"tf2"`) is + used. For `"torch"` only the in-place initializers, i.e. ending with an + underscore "_" are allowed. + hidden_bias_initializer_config: Configuration to pass into the initializer + defined in `hidden_bias_initializer`. + tokenizer_config: A ModelConfig to build tokenizers for observations, + actions and other spaces. + """ + + recurrent_layer_type: str = "lstm" + hidden_dim: int = None + num_layers: int = None + batch_major: bool = True + hidden_weights_initializer: Optional[Union[str, Callable]] = None + hidden_weights_initializer_config: Optional[Dict] = None + use_bias: bool = True + hidden_bias_initializer: Optional[Union[str, Callable]] = None + hidden_bias_initializer_config: Optional[Dict] = None + tokenizer_config: ModelConfig = None + + @property + def output_dims(self): + return (self.hidden_dim,) + + def _validate(self, framework: str = "torch"): + """Makes sure that settings are valid.""" + if self.recurrent_layer_type not in ["gru", "lstm"]: + raise ValueError( + f"`recurrent_layer_type` ({self.recurrent_layer_type}) of " + "RecurrentEncoderConfig must be 'gru' or 'lstm'!" + ) + if self.input_dims is not None and len(self.input_dims) != 1: + raise ValueError( + f"`input_dims` ({self.input_dims}) of RecurrentEncoderConfig must be " + "1D, e.g. `[32]`!" + ) + if len(self.output_dims) != 1: + raise ValueError( + f"`output_dims` ({self.output_dims}) of RecurrentEncoderConfig must be " + "1D, e.g. `[32]`! This is an inferred value, hence other settings might" + " be wrong." 
+ ) + + @_framework_implemented() + def build(self, framework: str = "torch") -> "Encoder": + if framework == "torch": + from ray.rllib.core.models.torch.encoder import ( + TorchGRUEncoder as GRU, + TorchLSTMEncoder as LSTM, + ) + else: + from ray.rllib.core.models.tf.encoder import ( + TfGRUEncoder as GRU, + TfLSTMEncoder as LSTM, + ) + + if self.recurrent_layer_type == "lstm": + return LSTM(self) + else: + return GRU(self) + + +@ExperimentalAPI +@dataclass +class ActorCriticEncoderConfig(ModelConfig): + """Configuration for an ActorCriticEncoder. + + The base encoder functions like other encoders in RLlib. It is wrapped by the + ActorCriticEncoder to provides a shared encoder Model to use in RLModules that + provides twofold outputs: one for the actor and one for the critic. See + ModelConfig for usage details. + + Attributes: + base_encoder_config: The configuration for the wrapped encoder(s). + shared: Whether the base encoder is shared between the actor and critic. + inference_only: Whether the configured encoder will only ever be used as an + actor-encoder, never as a value-function encoder. Thus, if True and `shared` + is False, will only build the actor-related components. 
+ """ + + base_encoder_config: ModelConfig = None + shared: bool = True + inference_only: bool = False + + @_framework_implemented() + def build(self, framework: str = "torch") -> "Encoder": + if framework == "torch": + from ray.rllib.core.models.torch.encoder import ( + TorchActorCriticEncoder, + TorchStatefulActorCriticEncoder, + ) + + if isinstance(self.base_encoder_config, RecurrentEncoderConfig): + return TorchStatefulActorCriticEncoder(self) + else: + return TorchActorCriticEncoder(self) + else: + from ray.rllib.core.models.tf.encoder import ( + TfActorCriticEncoder, + TfStatefulActorCriticEncoder, + ) + + if isinstance(self.base_encoder_config, RecurrentEncoderConfig): + return TfStatefulActorCriticEncoder(self) + else: + return TfActorCriticEncoder(self) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/core/models/specs/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/core/models/specs/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/core/models/specs/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/core/models/specs/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..87b37f3578a6e087d02e841295057720253cef54 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/core/models/specs/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/core/models/specs/__pycache__/specs_base.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/core/models/specs/__pycache__/specs_base.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..03b9239a3d533d3f17b43579602e2551600268da Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/core/models/specs/__pycache__/specs_base.cpython-311.pyc differ diff --git 
a/.venv/lib/python3.11/site-packages/ray/rllib/core/models/specs/__pycache__/specs_dict.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/core/models/specs/__pycache__/specs_dict.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7f636a3a70d3cb27502e12d0b4521ecdf2aaaa5a Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/core/models/specs/__pycache__/specs_dict.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/core/models/specs/__pycache__/typing.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/core/models/specs/__pycache__/typing.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..034c80350498e09e827375595ec31ef156b40e6a Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/core/models/specs/__pycache__/typing.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/core/models/specs/specs_base.py b/.venv/lib/python3.11/site-packages/ray/rllib/core/models/specs/specs_base.py new file mode 100644 index 0000000000000000000000000000000000000000..9099da94100237cc6dca66391e6d6bfa772f5544 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/core/models/specs/specs_base.py @@ -0,0 +1,226 @@ +import abc +from copy import deepcopy +import numpy as np +from typing import Any, Optional, Dict, List, Tuple, Union, Type +from ray.rllib.utils import try_import_jax, try_import_tf, try_import_torch +from ray.rllib.utils.deprecation import Deprecated +from ray.rllib.utils.typing import TensorType + +torch, _ = try_import_torch() +_, tf, _ = try_import_tf() +jax, _ = try_import_jax() + +_INVALID_INPUT_DUP_DIM = "Duplicate dimension names in shape ({})" +_INVALID_INPUT_UNKNOWN_DIM = "Unknown dimension name {} in shape ({})" +_INVALID_INPUT_POSITIVE = "Dimension {} in ({}) must be positive, got {}" +_INVALID_INPUT_INT_DIM = "Dimension {} in ({}) must be integer, got {}" +_INVALID_SHAPE 
= "Expected shape {} but found {}" +_INVALID_TYPE = "Expected data type {} but found {}" + + +@Deprecated( + help="The Spec checking APIs have been deprecated and cancelled without " + "replacement.", + error=False, +) +class Spec(abc.ABC): + @staticmethod + @abc.abstractmethod + def validate(self, data: Any) -> None: + pass + + +@Deprecated( + help="The Spec checking APIs have been deprecated and cancelled without " + "replacement.", + error=False, +) +class TypeSpec(Spec): + def __init__(self, dtype: Type) -> None: + self.dtype = dtype + + def __repr__(self): + return f"TypeSpec({str(self.dtype)})" + + def validate(self, data: Any) -> None: + if not isinstance(data, self.dtype): + raise ValueError(_INVALID_TYPE.format(self.dtype, type(data))) + + def __eq__(self, other: "TypeSpec") -> bool: + if not isinstance(other, TypeSpec): + return False + return self.dtype == other.dtype + + def __ne__(self, other: "TypeSpec") -> bool: + return not self == other + + +@Deprecated( + help="The Spec checking APIs have been deprecated and cancelled without " + "replacement.", + error=False, +) +class TensorSpec(Spec): + def __init__( + self, + shape: str, + *, + dtype: Optional[Any] = None, + framework: Optional[str] = None, + **shape_vals: int, + ) -> None: + self._expected_shape = self._parse_expected_shape(shape, shape_vals) + self._full_shape = self._get_full_shape() + self._dtype = dtype + self._framework = framework + + if framework not in ("tf2", "torch", "np", "jax", None): + raise ValueError(f"Unknown framework {self._framework}") + + self._type = self._get_expected_type() + + def _get_expected_type(self) -> Type: + if self._framework == "torch": + return torch.Tensor + elif self._framework == "tf2": + return tf.Tensor + elif self._framework == "np": + return np.ndarray + elif self._framework == "jax": + jax, _ = try_import_jax() + return jax.numpy.ndarray + elif self._framework is None: + # Don't restrict the type of the tensor if no framework is specified. 
+ return object + + def get_shape(self, tensor: TensorType) -> Tuple[int]: + if self._framework == "tf2": + return tuple( + int(i) if i is not None else None for i in tensor.shape.as_list() + ) + return tuple(tensor.shape) + + def get_dtype(self, tensor: TensorType) -> Any: + return tensor.dtype + + @property + def dtype(self) -> Any: + return self._dtype + + @property + def shape(self) -> Tuple[Union[int, str]]: + return self._expected_shape + + @property + def type(self) -> Type: + return self._type + + @property + def full_shape(self) -> Tuple[int]: + return self._full_shape + + def rdrop(self, n: int) -> "TensorSpec": + assert isinstance(n, int) and n >= 0, "n must be a positive integer or zero" + copy_ = deepcopy(self) + copy_._expected_shape = copy_.shape[:-n] + copy_._full_shape = self._get_full_shape() + return copy_ + + def append(self, spec: "TensorSpec") -> "TensorSpec": + copy_ = deepcopy(self) + copy_._expected_shape = (*copy_.shape, *spec.shape) + copy_._full_shape = self._get_full_shape() + return copy_ + + def validate(self, tensor: TensorType) -> None: + if not isinstance(tensor, self.type): + raise ValueError(_INVALID_TYPE.format(self.type, type(tensor).__name__)) + + shape = self.get_shape(tensor) + if len(shape) != len(self._expected_shape): + raise ValueError(_INVALID_SHAPE.format(self._expected_shape, shape)) + + for expected_d, actual_d in zip(self._expected_shape, shape): + if isinstance(expected_d, int) and expected_d != actual_d: + raise ValueError(_INVALID_SHAPE.format(self._expected_shape, shape)) + + dtype = tensor.dtype + if self.dtype and dtype != self.dtype: + raise ValueError(_INVALID_TYPE.format(self.dtype, tensor.dtype)) + + def fill(self, fill_value: Union[float, int] = 0) -> TensorType: + if self._framework == "torch": + return torch.full(self.full_shape, fill_value, dtype=self.dtype) + + elif self._framework == "tf2": + if self.dtype: + return tf.ones(self.full_shape, dtype=self.dtype) * fill_value + return 
tf.fill(self.full_shape, fill_value) + + elif self._framework == "np": + return np.full(self.full_shape, fill_value, dtype=self.dtype) + + elif self._framework == "jax": + return jax.numpy.full(self.full_shape, fill_value, dtype=self.dtype) + + elif self._framework is None: + raise ValueError( + "Cannot fill tensor without providing `framework` to TensorSpec. " + "This TensorSpec was instantiated without `framework`." + ) + + def _get_full_shape(self) -> Tuple[int]: + sampled_shape = tuple() + for d in self._expected_shape: + if isinstance(d, int): + sampled_shape += (d,) + else: + sampled_shape += (1,) + return sampled_shape + + def _parse_expected_shape(self, shape: str, shape_vals: Dict[str, int]) -> tuple: + d_names = shape.replace(" ", "").split(",") + self._validate_shape_vals(d_names, shape_vals) + + expected_shape = tuple(shape_vals.get(d, d) for d in d_names) + + return expected_shape + + def _validate_shape_vals( + self, d_names: List[str], shape_vals: Dict[str, int] + ) -> None: + d_names_set = set(d_names) + if len(d_names_set) != len(d_names): + raise ValueError(_INVALID_INPUT_DUP_DIM.format(",".join(d_names))) + + for d_name in shape_vals: + if d_name not in d_names_set: + raise ValueError( + _INVALID_INPUT_UNKNOWN_DIM.format(d_name, ",".join(d_names)) + ) + + d_value = shape_vals.get(d_name, None) + if d_value is not None: + if not isinstance(d_value, int): + raise ValueError( + _INVALID_INPUT_INT_DIM.format( + d_name, ",".join(d_names), type(d_value) + ) + ) + if d_value <= 0: + raise ValueError( + _INVALID_INPUT_POSITIVE.format( + d_name, ",".join(d_names), d_value + ) + ) + + def __repr__(self) -> str: + return f"TensorSpec(shape={tuple(self.shape)}, dtype={self.dtype})" + + def __eq__(self, other: "TensorSpec") -> bool: + if not isinstance(other, TensorSpec): + return False + return self.shape == other.shape and self.dtype == other.dtype + + def __ne__(self, other: "TensorSpec") -> bool: + return not self == other diff --git 
a/.venv/lib/python3.11/site-packages/ray/rllib/core/models/specs/specs_dict.py b/.venv/lib/python3.11/site-packages/ray/rllib/core/models/specs/specs_dict.py new file mode 100644 index 0000000000000000000000000000000000000000..adc2c46a94122202ac6d371b40fe412560fa257d --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/core/models/specs/specs_dict.py @@ -0,0 +1,84 @@ +from typing import Any, Dict + +import tree +from ray.rllib.core.models.specs.specs_base import Spec +from ray.rllib.utils import force_tuple + + +_MISSING_KEYS_FROM_DATA = ( + "The data dict does not match the model specs. Keys {} are " + "in the spec dict but not on the data dict. Data keys are {}" +) +_TYPE_MISMATCH = ( + "The data does not match the spec. The data element " + "{} has type {} (expected type {})." +) + +DATA_TYPE = Dict[str, Any] + +IS_NOT_PROPERTY = "Spec {} must be a property of the class {}." + + +class SpecDict(dict, Spec): + def validate( + self, + data: DATA_TYPE, + exact_match: bool = False, + ) -> None: + check = self.is_subset(self, data, exact_match) + if not check[0]: + data_keys_set = set() + + def _map(path, s): + data_keys_set.add(force_tuple(path)) + + tree.map_structure_with_path(_map, data) + + raise ValueError(_MISSING_KEYS_FROM_DATA.format(check[1], data_keys_set)) + + @staticmethod + def is_subset(spec_dict, data_dict, exact_match=False): + if exact_match: + tree.assert_same_structure(data_dict, spec_dict, check_types=False) + + for key in spec_dict: + if key not in data_dict: + return False, key + if spec_dict[key] is None: + continue + + elif isinstance(data_dict[key], dict): + if not isinstance(spec_dict[key], dict): + return False, key + + res = SpecDict.is_subset(spec_dict[key], data_dict[key], exact_match) + if not res[0]: + return res + + elif isinstance(spec_dict[key], dict): + return False, key + + elif isinstance(spec_dict[key], Spec): + try: + spec_dict[key].validate(data_dict[key]) + except ValueError as e: + raise ValueError( + f"Mismatch 
found in data element {key}, " + f"which is a TensorSpec: {e}" + ) + elif isinstance(spec_dict[key], (type, tuple)): + if not isinstance(data_dict[key], spec_dict[key]): + raise ValueError( + _TYPE_MISMATCH.format( + key, + type(data_dict[key]).__name__, + spec_dict[key].__name__, + ) + ) + else: + raise ValueError( + f"The spec type has to be either TensorSpec or Type. " + f"got {type(spec_dict[key])}" + ) + + return True, None diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/core/models/specs/typing.py b/.venv/lib/python3.11/site-packages/ray/rllib/core/models/specs/typing.py new file mode 100644 index 0000000000000000000000000000000000000000..3975aae27d8c3ef40e1e4be7c85c9fb7828be260 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/core/models/specs/typing.py @@ -0,0 +1,10 @@ +from typing import Union, Type, Tuple, List, TYPE_CHECKING + +if TYPE_CHECKING: + from ray.rllib.core.models.specs.specs_base import Spec + + +NestedKeys = List[Union[str, Tuple[str, ...]]] +Constraint = Union[Type, Tuple[Type, ...], "Spec"] +# Either a flat list of nested keys or a tree of constraints +SpecType = Union[NestedKeys] diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/core/models/tf/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/core/models/tf/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/core/models/tf/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/core/models/tf/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bbc40ceb03859fb67d9cccf65378203f78d36d58 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/core/models/tf/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/core/models/tf/__pycache__/base.cpython-311.pyc 
b/.venv/lib/python3.11/site-packages/ray/rllib/core/models/tf/__pycache__/base.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b30597e0183b48aa1588815798952b42babbff82 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/core/models/tf/__pycache__/base.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/core/models/tf/__pycache__/encoder.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/core/models/tf/__pycache__/encoder.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..41c45cddba3c566d8a58d25b3b9dd1be536ec534 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/core/models/tf/__pycache__/encoder.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/core/models/tf/__pycache__/heads.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/core/models/tf/__pycache__/heads.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fe5bd5d7c47e6f100a86ca70714a88704ed98b58 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/core/models/tf/__pycache__/heads.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/core/models/tf/__pycache__/primitives.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/core/models/tf/__pycache__/primitives.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ac04c7269a16c38c47f383034e10a88fb03cbb99 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/core/models/tf/__pycache__/primitives.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/core/models/tf/base.py b/.venv/lib/python3.11/site-packages/ray/rllib/core/models/tf/base.py new file mode 100644 index 0000000000000000000000000000000000000000..48e346812c42049812555fe5706e638c7519d650 --- /dev/null +++ 
b/.venv/lib/python3.11/site-packages/ray/rllib/core/models/tf/base.py @@ -0,0 +1,53 @@ +import abc +import logging +from typing import Tuple + +import numpy as np + +from ray.rllib.core.models.base import Model +from ray.rllib.core.models.configs import ModelConfig +from ray.rllib.utils.annotations import override +from ray.rllib.utils.framework import try_import_tf + +logger = logging.getLogger(__name__) +_, tf, _ = try_import_tf() + + +class TfModel(Model, tf.keras.Model, abc.ABC): + """Base class for RLlib's TensorFlow models. + + This class defines the interface for RLlib's TensorFlow models and checks + whether inputs and outputs of __call__ are checked with `check_input_specs()` and + `check_output_specs()` respectively. + """ + + def __init__(self, config: ModelConfig): + tf.keras.Model.__init__(self) + Model.__init__(self, config) + + def call(self, input_dict: dict, **kwargs) -> dict: + """Returns the output of this model for the given input. + + This method only makes sure that we have a spec-checked _forward() method. + + Args: + input_dict: The input tensors. + **kwargs: Forward compatibility kwargs. + + Returns: + dict: The output tensors. 
+ """ + return self._forward(input_dict, **kwargs) + + @override(Model) + def get_num_parameters(self) -> Tuple[int, int]: + return ( + sum(int(np.prod(w.shape)) for w in self.trainable_weights), + sum(int(np.prod(w.shape)) for w in self.non_trainable_weights), + ) + + @override(Model) + def _set_to_dummy_weights(self, value_sequence=(-0.02, -0.01, 0.01, 0.02)): + for i, w in enumerate(self.trainable_weights + self.non_trainable_weights): + fill_val = value_sequence[i % len(value_sequence)] + w.assign(tf.fill(w.shape, fill_val)) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/core/models/tf/encoder.py b/.venv/lib/python3.11/site-packages/ray/rllib/core/models/tf/encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..3d280e23cda741098a3231403a4e5991cc548e83 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/core/models/tf/encoder.py @@ -0,0 +1,315 @@ +from typing import Dict + +import tree # pip install dm_tree + +from ray.rllib.core.columns import Columns +from ray.rllib.core.models.base import ( + Encoder, + ActorCriticEncoder, + StatefulActorCriticEncoder, + ENCODER_OUT, + tokenize, +) +from ray.rllib.core.models.base import Model +from ray.rllib.core.models.configs import ( + ActorCriticEncoderConfig, + CNNEncoderConfig, + MLPEncoderConfig, + RecurrentEncoderConfig, +) +from ray.rllib.core.models.tf.base import TfModel +from ray.rllib.core.models.tf.primitives import TfMLP, TfCNN +from ray.rllib.models.utils import get_initializer_fn +from ray.rllib.utils.annotations import override +from ray.rllib.utils.framework import try_import_tf + +_, tf, _ = try_import_tf() + + +class TfActorCriticEncoder(TfModel, ActorCriticEncoder): + """An encoder that can hold two encoders.""" + + framework = "tf2" + + def __init__(self, config: ActorCriticEncoderConfig) -> None: + # We have to call TfModel.__init__ first, because it calls the constructor of + # tf.keras.Model, which is required to be called before models are created. 
+ TfModel.__init__(self, config) + ActorCriticEncoder.__init__(self, config) + + +class TfStatefulActorCriticEncoder(TfModel, StatefulActorCriticEncoder): + """A stateful actor-critic encoder for torch.""" + + framework = "tf2" + + def __init__(self, config: ActorCriticEncoderConfig) -> None: + # We have to call TfModel.__init__ first, because it calls the constructor of + # tf.keras.Model, which is required to be called before models are created. + TfModel.__init__(self, config) + StatefulActorCriticEncoder.__init__(self, config) + + +class TfCNNEncoder(TfModel, Encoder): + def __init__(self, config: CNNEncoderConfig) -> None: + TfModel.__init__(self, config) + Encoder.__init__(self, config) + + # Add an input layer for the Sequential, created below. This is really + # important to be able to derive the model's trainable_variables early on + # (inside our Learners). + layers = [tf.keras.layers.Input(shape=config.input_dims)] + # The bare-bones CNN (no flatten, no succeeding dense). + cnn = TfCNN( + input_dims=config.input_dims, + cnn_filter_specifiers=config.cnn_filter_specifiers, + cnn_activation=config.cnn_activation, + cnn_use_layernorm=config.cnn_use_layernorm, + cnn_use_bias=config.cnn_use_bias, + cnn_kernel_initializer=config.cnn_kernel_initializer, + cnn_kernel_initializer_config=config.cnn_kernel_initializer_config, + cnn_bias_initializer=config.cnn_bias_initializer, + cnn_bias_initializer_config=config.cnn_bias_initializer_config, + ) + layers.append(cnn) + + # Add a flatten operation to move from 2/3D into 1D space. + if config.flatten_at_end: + layers.append(tf.keras.layers.Flatten()) + + # Create the network from gathered layers. 
+ self.net = tf.keras.Sequential(layers) + + @override(Model) + def _forward(self, inputs: dict, **kwargs) -> dict: + return {ENCODER_OUT: self.net(inputs[Columns.OBS])} + + +class TfMLPEncoder(Encoder, TfModel): + def __init__(self, config: MLPEncoderConfig) -> None: + TfModel.__init__(self, config) + Encoder.__init__(self, config) + + # Create the neural network. + self.net = TfMLP( + input_dim=config.input_dims[0], + hidden_layer_dims=config.hidden_layer_dims, + hidden_layer_activation=config.hidden_layer_activation, + hidden_layer_use_layernorm=config.hidden_layer_use_layernorm, + hidden_layer_use_bias=config.hidden_layer_use_bias, + hidden_layer_weights_initializer=config.hidden_layer_weights_initializer, + hidden_layer_weights_initializer_config=( + config.hidden_layer_weights_initializer_config + ), + hidden_layer_bias_initializer=config.hidden_layer_bias_initializer, + hidden_layer_bias_initializer_config=( + config.hidden_layer_bias_initializer_config + ), + output_dim=config.output_layer_dim, + output_activation=config.output_layer_activation, + output_use_bias=config.output_layer_use_bias, + output_weights_initializer=config.output_layer_weights_initializer, + output_weights_initializer_config=( + config.output_layer_weights_initializer_config + ), + output_bias_initializer=config.output_layer_bias_initializer, + output_bias_initializer_config=config.output_layer_bias_initializer_config, + ) + + @override(Model) + def _forward(self, inputs: Dict, **kwargs) -> Dict: + return {ENCODER_OUT: self.net(inputs[Columns.OBS])} + + +class TfGRUEncoder(TfModel, Encoder): + """A recurrent GRU encoder. + + This encoder has... + - Zero or one tokenizers. + - One or more GRU layers. 
+ """ + + def __init__(self, config: RecurrentEncoderConfig) -> None: + TfModel.__init__(self, config) + + # Maybe create a tokenizer + if config.tokenizer_config is not None: + self.tokenizer = config.tokenizer_config.build(framework="tf2") + # For our first input dim, we infer from the tokenizer. + # This is necessary because we need to build the layers in order to be + # able to get/set weights directly after instantiation. + input_dims = (1,) + tuple( + self.tokenizer.output_specs[ENCODER_OUT].full_shape + ) + else: + self.tokenizer = None + input_dims = ( + 1, + 1, + ) + tuple(config.input_dims) + + gru_weights_initializer = get_initializer_fn( + config.hidden_weights_initializer, framework="tf2" + ) + gru_bias_initializer = get_initializer_fn( + config.hidden_bias_initializer, framework="tf2" + ) + + # Create the tf GRU layers. + self.grus = [] + for _ in range(config.num_layers): + layer = tf.keras.layers.GRU( + config.hidden_dim, + time_major=not config.batch_major, + # Note, if the initializer is `None`, we want TensorFlow + # to use its default one. So we pass in `None`. + kernel_initializer=( + gru_weights_initializer(**config.hidden_weights_initializer_config) + if config.hidden_weights_initializer_config + else gru_weights_initializer + ), + use_bias=config.use_bias, + bias_initializer=( + gru_bias_initializer(**config.hidden_bias_initializer_config) + if config.hidden_bias_initializer_config + else gru_bias_initializer + ), + return_sequences=True, + return_state=True, + ) + layer.build(input_dims) + input_dims = (1, 1, config.hidden_dim) + self.grus.append(layer) + + @override(Model) + def get_initial_state(self): + return { + "h": tf.zeros((self.config.num_layers, self.config.hidden_dim)), + } + + @override(Model) + def _forward(self, inputs: Dict, **kwargs) -> Dict: + outputs = {} + + if self.tokenizer is not None: + # Push observations through the tokenizer encoder if we built one. 
+ out = tokenize(self.tokenizer, inputs, framework="tf2") + else: + # Otherwise, just use the raw observations. + out = tf.cast(inputs[Columns.OBS], tf.float32) + + # States are batch-first when coming in. Make them layers-first. + states_in = tree.map_structure( + lambda s: tf.transpose(s, perm=[1, 0] + list(range(2, len(s.shape)))), + inputs[Columns.STATE_IN], + ) + + states_out = [] + for i, layer in enumerate(self.grus): + out, h = layer(out, states_in["h"][i]) + states_out.append(h) + + # Insert them into the output dict. + outputs[ENCODER_OUT] = out + outputs[Columns.STATE_OUT] = {"h": tf.stack(states_out, 1)} + return outputs + + +class TfLSTMEncoder(TfModel, Encoder): + """A recurrent LSTM encoder. + + This encoder has... + - Zero or one tokenizers. + - One or more LSTM layers. + """ + + def __init__(self, config: RecurrentEncoderConfig) -> None: + TfModel.__init__(self, config) + + # Maybe create a tokenizer + if config.tokenizer_config is not None: + self.tokenizer = config.tokenizer_config.build(framework="tf2") + # For our first input dim, we infer from the tokenizer. + # This is necessary because we need to build the layers in order to be + # able to get/set weights directly after instantiation. + input_dims = (1,) + tuple( + self.tokenizer.output_specs[ENCODER_OUT].full_shape + ) + else: + self.tokenizer = None + input_dims = ( + 1, + 1, + ) + tuple(config.input_dims) + + lstm_weights_initializer = get_initializer_fn( + config.hidden_weights_initializer, framework="tf2" + ) + lstm_bias_initializer = get_initializer_fn( + config.hidden_bias_initializer, framework="tf2" + ) + + # Create the tf LSTM layers. + self.lstms = [] + for _ in range(config.num_layers): + layer = tf.keras.layers.LSTM( + config.hidden_dim, + time_major=not config.batch_major, + # Note, if the initializer is `None`, we want TensorFlow + # to use its default one. So we pass in `None`. 
+ kernel_initializer=( + lstm_weights_initializer(**config.hidden_weights_initializer_config) + if config.hidden_weights_initializer_config + else lstm_weights_initializer + ), + use_bias=config.use_bias, + bias_initializer=( + lstm_bias_initializer(**config.hidden_bias_initializer_config) + if config.hidden_bias_initializer_config + else "zeros" + ), + return_sequences=True, + return_state=True, + ) + layer.build(input_dims) + input_dims = (1, 1, config.hidden_dim) + self.lstms.append(layer) + + @override(Model) + def get_initial_state(self): + return { + "h": tf.zeros((self.config.num_layers, self.config.hidden_dim)), + "c": tf.zeros((self.config.num_layers, self.config.hidden_dim)), + } + + @override(Model) + def _forward(self, inputs: Dict, **kwargs) -> Dict: + outputs = {} + + if self.tokenizer is not None: + # Push observations through the tokenizer encoder if we built one. + out = tokenize(self.tokenizer, inputs, framework="tf2") + else: + # Otherwise, just use the raw observations. + out = tf.cast(inputs[Columns.OBS], tf.float32) + + # States are batch-first when coming in. Make them layers-first. + states_in = tree.map_structure( + lambda s: tf.transpose(s, perm=[1, 0, 2]), + inputs[Columns.STATE_IN], + ) + + states_out_h = [] + states_out_c = [] + for i, layer in enumerate(self.lstms): + out, h, c = layer(out, (states_in["h"][i], states_in["c"][i])) + states_out_h.append(h) + states_out_c.append(c) + + # Insert them into the output dict. 
+ outputs[ENCODER_OUT] = out + outputs[Columns.STATE_OUT] = { + "h": tf.stack(states_out_h, 1), + "c": tf.stack(states_out_c, 1), + } + return outputs diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/core/models/tf/heads.py b/.venv/lib/python3.11/site-packages/ray/rllib/core/models/tf/heads.py new file mode 100644 index 0000000000000000000000000000000000000000..e92ee5e0577eca5de10e12584338e88b94dbde06 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/core/models/tf/heads.py @@ -0,0 +1,198 @@ +import numpy as np + +from ray.rllib.core.models.base import Model +from ray.rllib.core.models.configs import ( + CNNTransposeHeadConfig, + FreeLogStdMLPHeadConfig, + MLPHeadConfig, +) +from ray.rllib.core.models.tf.base import TfModel +from ray.rllib.core.models.tf.primitives import TfCNNTranspose, TfMLP +from ray.rllib.models.utils import get_initializer_fn +from ray.rllib.utils import try_import_tf +from ray.rllib.utils.annotations import override + +tf1, tf, tfv = try_import_tf() + + +class TfMLPHead(TfModel): + def __init__(self, config: MLPHeadConfig) -> None: + TfModel.__init__(self, config) + + self.net = TfMLP( + input_dim=config.input_dims[0], + hidden_layer_dims=config.hidden_layer_dims, + hidden_layer_activation=config.hidden_layer_activation, + hidden_layer_use_layernorm=config.hidden_layer_use_layernorm, + hidden_layer_use_bias=config.hidden_layer_use_bias, + hidden_layer_weights_initializer=config.hidden_layer_weights_initializer, + hidden_layer_weights_initializer_config=( + config.hidden_layer_weights_initializer_config + ), + hidden_layer_bias_initializer=config.hidden_layer_bias_initializer, + hidden_layer_bias_initializer_config=( + config.hidden_layer_bias_initializer_config + ), + output_dim=config.output_layer_dim, + output_activation=config.output_layer_activation, + output_use_bias=config.output_layer_use_bias, + output_weights_initializer=config.output_layer_weights_initializer, + output_weights_initializer_config=( + 
config.output_layer_weights_initializer_config + ), + output_bias_initializer=config.output_layer_bias_initializer, + output_bias_initializer_config=config.output_layer_bias_initializer_config, + ) + # If log standard deviations should be clipped. This should be only true for + # policy heads. Value heads should never be clipped. + self.clip_log_std = config.clip_log_std + # The clipping parameter for the log standard deviation. + self.log_std_clip_param = tf.constant([config.log_std_clip_param]) + + @override(Model) + def _forward(self, inputs: tf.Tensor, **kwargs) -> tf.Tensor: + # Only clip the log standard deviations, if the user wants to clip. This + # avoids also clipping value heads. + if self.clip_log_std: + # Forward pass. + means, log_stds = tf.split(self.net(inputs), num_or_size_splits=2, axis=-1) + # Clip the log standard deviations. + log_stds = tf.clip_by_value( + log_stds, -self.log_std_clip_param, self.log_std_clip_param + ) + return tf.concat([means, log_stds], axis=-1) + # Otherwise just return the logits. + else: + return self.net(inputs) + + +class TfFreeLogStdMLPHead(TfModel): + """An MLPHead that implements floating log stds for Gaussian distributions.""" + + def __init__(self, config: FreeLogStdMLPHeadConfig) -> None: + TfModel.__init__(self, config) + + assert config.output_dims[0] % 2 == 0, "output_dims must be even for free std!" 
+ self._half_output_dim = config.output_dims[0] // 2 + + self.net = TfMLP( + input_dim=config.input_dims[0], + hidden_layer_dims=config.hidden_layer_dims, + hidden_layer_activation=config.hidden_layer_activation, + hidden_layer_use_layernorm=config.hidden_layer_use_layernorm, + hidden_layer_use_bias=config.hidden_layer_use_bias, + hidden_layer_weights_initializer=config.hidden_layer_weights_initializer, + hidden_layer_weights_initializer_config=( + config.hidden_layer_weights_initializer_config + ), + hidden_layer_bias_initializer=config.hidden_layer_bias_initializer, + hidden_layer_bias_initializer_config=( + config.hidden_layer_bias_initializer_config + ), + output_dim=self._half_output_dim, + output_activation=config.output_layer_activation, + output_use_bias=config.output_layer_use_bias, + output_weights_initializer=config.output_layer_weights_initializer, + output_weights_initializer_config=( + config.output_layer_weights_initializer_config + ), + output_bias_initializer=config.output_layer_bias_initializer, + output_bias_initializer_config=config.output_layer_bias_initializer_config, + ) + + self.log_std = tf.Variable( + tf.zeros(self._half_output_dim), + name="log_std", + dtype=tf.float32, + trainable=True, + ) + # If log standard deviations should be clipped. This should be only true for + # policy heads. Value heads should never be clipped. + self.clip_log_std = config.clip_log_std + # The clipping parameter for the log standard deviation. + self.log_std_clip_param = tf.constant([config.log_std_clip_param]) + + @override(Model) + def _forward(self, inputs: tf.Tensor, **kwargs) -> tf.Tensor: + # Compute the mean first, then append the log_std. + mean = self.net(inputs) + # If log standard deviation should be clipped. + if self.clip_log_std: + # Clip log standard deviations to stabilize training. Note, the + # default clip value is `inf`, i.e. no clipping. 
+ log_std = tf.clip_by_value( + self.log_std, -self.log_std_clip_param, self.log_std_clip_param + ) + else: + log_std = self.log_std + log_std_out = tf.tile(tf.expand_dims(log_std, 0), [tf.shape(inputs)[0], 1]) + logits_out = tf.concat([mean, log_std_out], axis=1) + return logits_out + + +class TfCNNTransposeHead(TfModel): + def __init__(self, config: CNNTransposeHeadConfig) -> None: + super().__init__(config) + + # Initial, inactivated Dense layer (always w/ bias). Use the + # hidden layer initializer for this layer. + initial_dense_weights_initializer = get_initializer_fn( + config.initial_dense_weights_initializer, framework="tf2" + ) + initial_dense_bias_initializer = get_initializer_fn( + config.initial_dense_bias_initializer, framework="tf2" + ) + + # This layer is responsible for getting the incoming tensor into a proper + # initial image shape (w x h x filters) for the suceeding Conv2DTranspose stack. + self.initial_dense = tf.keras.layers.Dense( + units=int(np.prod(config.initial_image_dims)), + activation=None, + kernel_initializer=( + initial_dense_weights_initializer( + **config.initial_dense_weights_initializer_config + ) + if config.initial_dense_weights_initializer_config + else initial_dense_weights_initializer + ), + use_bias=True, + bias_initializer=( + initial_dense_bias_initializer( + **config.initial_dense_bias_initializer_config + ) + if config.initial_dense_bias_initializer_config + else initial_dense_bias_initializer + ), + ) + + # The main CNNTranspose stack. 
+ self.cnn_transpose_net = TfCNNTranspose( + input_dims=config.initial_image_dims, + cnn_transpose_filter_specifiers=config.cnn_transpose_filter_specifiers, + cnn_transpose_activation=config.cnn_transpose_activation, + cnn_transpose_use_layernorm=config.cnn_transpose_use_layernorm, + cnn_transpose_use_bias=config.cnn_transpose_use_bias, + cnn_transpose_kernel_initializer=config.cnn_transpose_kernel_initializer, + cnn_transpose_kernel_initializer_config=( + config.cnn_transpose_kernel_initializer_config + ), + cnn_transpose_bias_initializer=config.cnn_transpose_bias_initializer, + cnn_transpose_bias_initializer_config=( + config.cnn_transpose_bias_initializer_config + ), + ) + + @override(Model) + def _forward(self, inputs: tf.Tensor, **kwargs) -> tf.Tensor: + # Push through initial dense layer to get dimensions of first "image". + out = self.initial_dense(inputs) + # Reshape to initial 3D (image-like) format to enter CNN transpose stack. + out = tf.reshape( + out, + shape=(-1,) + tuple(self.config.initial_image_dims), + ) + # Push through CNN transpose stack. + out = self.cnn_transpose_net(out) + # Add 0.5 to center the (always non-activated, non-normalized) outputs more + # around 0.0. + return out + 0.5 diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/core/models/tf/primitives.py b/.venv/lib/python3.11/site-packages/ray/rllib/core/models/tf/primitives.py new file mode 100644 index 0000000000000000000000000000000000000000..1c5d61bf4f4964f648a4d9ae09f72e64f49072b7 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/core/models/tf/primitives.py @@ -0,0 +1,429 @@ +from typing import Callable, Dict, List, Optional, Tuple, Union + +from ray.rllib.models.utils import get_activation_fn, get_initializer_fn +from ray.rllib.utils.framework import try_import_tf + +_, tf, _ = try_import_tf() + + +class TfMLP(tf.keras.Model): + """A multi-layer perceptron with N dense layers. 
+ + All layers (except for an optional additional extra output layer) share the same + activation function, bias setup (use bias or not), and LayerNorm setup + (use layer normalization or not). + + If `output_dim` (int) is not None, an additional, extra output dense layer is added, + which might have its own activation function (e.g. "linear"). However, the output + layer does NOT use layer normalization. + """ + + def __init__( + self, + *, + input_dim: int, + hidden_layer_dims: List[int], + hidden_layer_use_layernorm: bool = False, + hidden_layer_use_bias: bool = True, + hidden_layer_activation: Optional[Union[str, Callable]] = "relu", + hidden_layer_weights_initializer: Optional[Union[str, Callable]] = None, + hidden_layer_weights_initializer_config: Optional[Dict] = None, + hidden_layer_bias_initializer: Optional[Union[str, Callable]] = None, + hidden_layer_bias_initializer_config: Optional[Dict] = None, + output_dim: Optional[int] = None, + output_use_bias: bool = True, + output_activation: Optional[Union[str, Callable]] = "linear", + output_weights_initializer: Optional[Union[str, Callable]] = None, + output_weights_initializer_config: Optional[Dict] = None, + output_bias_initializer: Optional[Union[str, Callable]] = None, + output_bias_initializer_config: Optional[Dict] = None, + ): + """Initialize a TfMLP object. + + Args: + input_dim: The input dimension of the network. Must not be None. + hidden_layer_dims: The sizes of the hidden layers. If an empty list, only a + single layer will be built of size `output_dim`. + hidden_layer_use_layernorm: Whether to insert a LayerNormalization + functionality in between each hidden layer's output and its activation. + hidden_layer_use_bias: Whether to use bias on all dense layers (excluding + the possible separate output layer). + hidden_layer_activation: The activation function to use after each layer + (except for the output). 
Either a tf.nn.[activation fn] callable or a + string that's supported by tf.keras.layers.Activation(activation=...), + e.g. "relu", "ReLU", "silu", or "linear". + hidden_layer_weights_initializer: The initializer function or class to use + for weights initialization in the hidden layers. If `None` the default + initializer of the respective dense layer is used. Note, all + initializers defined in `tf.keras.initializers` are allowed. + hidden_layer_weights_initializer_config: Configuration to pass into the + initializer defined in `hidden_layer_weights_initializer`. + hidden_layer_bias_initializer: The initializer function or class to use for + bias initialization in the hidden layers. If `None` the default + initializer of the respective dense layer is used. Note, all + initializers defined in `tf.keras.initializers` are allowed. + hidden_layer_bias_initializer_config: Configuration to pass into the + initializer defined in `hidden_layer_bias_initializer`. + output_dim: The output dimension of the network. If None, no specific output + layer will be added and the last layer in the stack will have + size=`hidden_layer_dims[-1]`. + output_use_bias: Whether to use bias on the separate output layer, + if any. + output_activation: The activation function to use for the output layer + (if any). Either a tf.nn.[activation fn] callable or a string that's + supported by tf.keras.layers.Activation(activation=...), e.g. "relu", + "ReLU", "silu", or "linear". + output_layer_weights_initializer: The initializer function or class to use + for weights initialization in the output layers. If `None` the default + initializer of the respective dense layer is used. Note, all + initializers defined in `tf.keras.initializers` are allowed. + output_layer_weights_initializer_config: Configuration to pass into the + initializer defined in `output_layer_weights_initializer`. 
+ output_layer_bias_initializer: The initializer function or class to use for + bias initialization in the output layers. If `None` the default + initializer of the respective dense layer is used. Note, all + initializers defined in `tf.keras.initializers` are allowed. + output_layer_bias_initializer_config: Configuration to pass into the + initializer defined in `output_layer_bias_initializer`. + """ + super().__init__() + assert input_dim > 0 + + layers = [] + # Input layer. + layers.append(tf.keras.Input(shape=(input_dim,))) + + hidden_activation = get_activation_fn(hidden_layer_activation, framework="tf2") + hidden_weights_initializer = get_initializer_fn( + hidden_layer_weights_initializer, framework="tf2" + ) + hidden_bias_initializer = get_initializer_fn( + hidden_layer_bias_initializer, framework="tf2" + ) + + for i in range(len(hidden_layer_dims)): + # Dense layer with activation (or w/o in case we use LayerNorm, in which + # case the activation is applied after the layer normalization step). + layers.append( + tf.keras.layers.Dense( + hidden_layer_dims[i], + activation=( + hidden_activation if not hidden_layer_use_layernorm else None + ), + # Note, if the initializer is `None`, we want TensorFlow + # to use its default one. So we pass in `None`. + kernel_initializer=( + hidden_weights_initializer( + **hidden_layer_weights_initializer_config + ) + if hidden_layer_weights_initializer_config + else hidden_weights_initializer + ), + use_bias=hidden_layer_use_bias, + bias_initializer=( + hidden_bias_initializer(**hidden_layer_bias_initializer_config) + if hidden_layer_bias_initializer_config + else hidden_bias_initializer + ), + ) + ) + # Add LayerNorm and activation. + if hidden_layer_use_layernorm: + # Use epsilon=1e-5 here (instead of default 1e-3) to be unified + # with torch. 
+ layers.append(tf.keras.layers.LayerNormalization(epsilon=1e-5)) + layers.append(tf.keras.layers.Activation(hidden_activation)) + + output_weights_initializer = get_initializer_fn( + output_weights_initializer, framework="tf2" + ) + output_bias_initializer = get_initializer_fn( + output_bias_initializer, framework="tf2" + ) + + if output_dim is not None: + output_activation = get_activation_fn(output_activation, framework="tf2") + layers.append( + tf.keras.layers.Dense( + output_dim, + activation=output_activation, + # Note, if the initializer is `None`, we want TensorFlow + # to use its default one. So we pass in `None`. + kernel_initializer=( + output_weights_initializer(**output_weights_initializer_config) + if output_weights_initializer_config + else output_weights_initializer + ), + use_bias=output_use_bias, + bias_initializer=( + output_bias_initializer(**output_bias_initializer_config) + if output_bias_initializer_config + else output_bias_initializer + ), + ) + ) + + self.network = tf.keras.Sequential(layers) + + def call(self, inputs, **kwargs): + return self.network(inputs) + + +class TfCNN(tf.keras.Model): + """A model containing a CNN with N Conv2D layers. + + All layers share the same activation function, bias setup (use bias or not), and + LayerNormalization setup (use layer normalization or not). + + Note that there is no flattening nor an additional dense layer at the end of the + stack. The output of the network is a 3D tensor of dimensions [width x height x num + output filters]. 
+ """ + + def __init__( + self, + *, + input_dims: Union[List[int], Tuple[int]], + cnn_filter_specifiers: List[List[Union[int, List]]], + cnn_use_bias: bool = True, + cnn_use_layernorm: bool = False, + cnn_activation: Optional[str] = "relu", + cnn_kernel_initializer: Optional[Union[str, Callable]] = None, + cnn_kernel_initializer_config: Optional[Dict] = None, + cnn_bias_initializer: Optional[Union[str, Callable]] = None, + cnn_bias_initializer_config: Optional[Dict] = None, + ): + """Initializes a TfCNN instance. + + Args: + input_dims: The 3D input dimensions of the network (incoming image). + cnn_filter_specifiers: A list in which each element is another (inner) list + of either the following forms: + `[number of channels/filters, kernel, stride]` + OR: + `[number of channels/filters, kernel, stride, padding]`, where `padding` + can either be "same" or "valid". + When using the first format w/o the `padding` specifier, `padding` is + "same" by default. Also, `kernel` and `stride` may be provided either as + single ints (square) or as a tuple/list of two ints (width- and height + dimensions) for non-squared kernel/stride shapes. + A good rule of thumb for constructing CNN stacks is: + When using padding="same", the input "image" will be reduced in size by + the factor `stride`, e.g. input=(84, 84, 3) stride=2 kernel=x + padding="same" filters=16 -> output=(42, 42, 16). + For example, if you would like to reduce an Atari image from its + original (84, 84, 3) dimensions down to (6, 6, F), you can construct the + following stack and reduce the w x h dimension of the image by 2 in each + layer: + [[16, 4, 2], [32, 4, 2], [64, 4, 2], [128, 4, 2]] -> output=(6, 6, 128) + cnn_use_bias: Whether to use bias on all Conv2D layers. + cnn_activation: The activation function to use after each Conv2D layer. + cnn_use_layernorm: Whether to insert a LayerNormalization functionality + in between each Conv2D layer's outputs and its activation. 
+ cnn_kernel_initializer: The initializer function or class to use for kernel + initialization in the CNN layers. If `None` the default initializer of + the respective CNN layer is used. Note, all initializers defined in + `tf.keras.initializers` are allowed. + cnn_kernel_initializer_config: Configuration to pass into the initializer + defined in `cnn_kernel_initializer`. + cnn_bias_initializer: The initializer function or class to use for bias + initialization in the CNN layers. If `None` the default initializer of + the respective CNN layer is used. Note, all initializers defined in + `tf.keras.initializers` are allowed. + cnn_bias_initializer_config: Configuration to pass into the initializer + defined in `cnn_bias_initializer`. + """ + super().__init__() + + assert len(input_dims) == 3 + + cnn_activation = get_activation_fn(cnn_activation, framework="tf2") + cnn_kernel_initializer = get_initializer_fn( + cnn_kernel_initializer, framework="tf2" + ) + cnn_bias_initializer = get_initializer_fn(cnn_bias_initializer, framework="tf2") + + layers = [] + + # Input layer. + layers.append(tf.keras.layers.Input(shape=input_dims)) + + for filter_specs in cnn_filter_specifiers: + # Padding information not provided -> Use "same" as default. + if len(filter_specs) == 3: + num_filters, kernel_size, strides = filter_specs + padding = "same" + # Padding information provided. + else: + num_filters, kernel_size, strides, padding = filter_specs + + layers.append( + tf.keras.layers.Conv2D( + filters=num_filters, + kernel_size=kernel_size, + strides=strides, + padding=padding, + use_bias=cnn_use_bias, + activation=None if cnn_use_layernorm else cnn_activation, + # Note, if the initializer is `None`, we want TensorFlow + # to use its default one. So we pass in `None`. 
+ kernel_initializer=( + cnn_kernel_initializer(**cnn_kernel_initializer_config) + if cnn_kernel_initializer_config + else cnn_kernel_initializer + ), + bias_initializer=( + cnn_bias_initializer(**cnn_bias_initializer_config) + if cnn_bias_initializer_config + else cnn_bias_initializer + ), + ) + ) + if cnn_use_layernorm: + # Use epsilon=1e-5 here (instead of default 1e-3) to be unified with + # torch. Need to normalize over all axes. + layers.append( + tf.keras.layers.LayerNormalization(axis=[-3, -2, -1], epsilon=1e-5) + ) + layers.append(tf.keras.layers.Activation(cnn_activation)) + + # Create the final CNN network. + self.cnn = tf.keras.Sequential(layers) + + self.expected_input_dtype = tf.float32 + + def call(self, inputs, **kwargs): + return self.cnn(tf.cast(inputs, self.expected_input_dtype)) + + +class TfCNNTranspose(tf.keras.Model): + """A model containing a CNNTranspose with N Conv2DTranspose layers. + + All layers share the same activation function, bias setup (use bias or not), and + LayerNormalization setup (use layer normalization or not), except for the last one, + which is never activated and never layer norm'd. + + Note that there is no reshaping/flattening nor an additional dense layer at the + beginning or end of the stack. The input as well as output of the network are 3D + tensors of dimensions [width x height x num output filters]. + """ + + def __init__( + self, + *, + input_dims: Union[List[int], Tuple[int]], + cnn_transpose_filter_specifiers: List[List[Union[int, List]]], + cnn_transpose_use_bias: bool = True, + cnn_transpose_activation: Optional[str] = "relu", + cnn_transpose_use_layernorm: bool = False, + cnn_transpose_kernel_initializer: Optional[Union[str, Callable]] = None, + cnn_transpose_kernel_initializer_config: Optional[Dict] = None, + cnn_transpose_bias_initializer: Optional[Union[str, Callable]] = None, + cnn_transpose_bias_initializer_config: Optional[Dict] = None, + ): + """Initializes a TfCNNTranspose instance. 
+ + Args: + input_dims: The 3D input dimensions of the network (incoming image). + cnn_transpose_filter_specifiers: A list of lists, where each item represents + one Conv2DTranspose layer. Each such Conv2DTranspose layer is further + specified by the elements of the inner lists. The inner lists follow + the format: `[number of filters, kernel, stride]` to + specify a convolutional-transpose layer stacked in order of the + outer list. + `kernel` as well as `stride` might be provided as width x height tuples + OR as single ints representing both dimension (width and height) + in case of square shapes. + cnn_transpose_use_bias: Whether to use bias on all Conv2DTranspose layers. + cnn_transpose_use_layernorm: Whether to insert a LayerNormalization + functionality in between each Conv2DTranspose layer's outputs and its + activation. + The last Conv2DTranspose layer will not be normed, regardless. + cnn_transpose_activation: The activation function to use after each layer + (except for the last Conv2DTranspose layer, which is always + non-activated). + cnn_transpose_kernel_initializer: The initializer function or class to use + for kernel initialization in the CNN layers. If `None` the default + initializer of the respective CNN layer is used. Note, all initializers + defined in `tf.keras.initializers` are allowed. + cnn_transpose_kernel_initializer_config: Configuration to pass into the + initializer defined in `cnn_transpose_kernel_initializer`. + cnn_transpose_bias_initializer: The initializer function or class to use for + bias initialization in the CNN layers. If `None` the default initializer + of the respective CNN layer is used. Note, only the in-place + initializers, i.e. ending with an underscore "_" are allowed. + cnn_transpose_bias_initializer_config: Configuration to pass into the + initializer defined in `cnn_transpose_bias_initializer`. 
+ """ + super().__init__() + + assert len(input_dims) == 3 + + cnn_transpose_activation = get_activation_fn( + cnn_transpose_activation, framework="tf2" + ) + cnn_transpose_kernel_initializer = get_initializer_fn( + cnn_transpose_kernel_initializer, + framework="tf2", + ) + cnn_transpose_bias_initializer = get_initializer_fn( + cnn_transpose_bias_initializer, framework="tf2" + ) + + layers = [] + + # Input layer. + layers.append(tf.keras.layers.Input(shape=input_dims)) + + for i, (num_filters, kernel_size, strides) in enumerate( + cnn_transpose_filter_specifiers + ): + is_final_layer = i == len(cnn_transpose_filter_specifiers) - 1 + layers.append( + tf.keras.layers.Conv2DTranspose( + filters=num_filters, + kernel_size=kernel_size, + strides=strides, + padding="same", + # Last layer is never activated (regardless of config). + activation=( + None + if cnn_transpose_use_layernorm or is_final_layer + else cnn_transpose_activation + ), + # Note, if the initializer is `None`, we want TensorFlow + # to use its default one. So we pass in `None`. + kernel_initializer=( + cnn_transpose_kernel_initializer( + **cnn_transpose_kernel_initializer_config + ) + if cnn_transpose_kernel_initializer_config + else cnn_transpose_kernel_initializer + ), + # Last layer always uses bias (b/c has no LayerNorm, regardless of + # config). + use_bias=cnn_transpose_use_bias or is_final_layer, + bias_initializer=( + cnn_transpose_bias_initializer( + **cnn_transpose_bias_initializer_config + ) + if cnn_transpose_bias_initializer_config + else cnn_transpose_bias_initializer + ), + ) + ) + if cnn_transpose_use_layernorm and not is_final_layer: + # Use epsilon=1e-5 here (instead of default 1e-3) to be unified with + # torch. Need to normalize over all axes. + layers.append( + tf.keras.layers.LayerNormalization(axis=[-3, -2, -1], epsilon=1e-5) + ) + layers.append(tf.keras.layers.Activation(cnn_transpose_activation)) + + # Create the final CNNTranspose network. 
+ self.cnn_transpose = tf.keras.Sequential(layers) + + self.expected_input_dtype = tf.float32 + + def call(self, inputs, **kwargs): + return self.cnn_transpose(tf.cast(inputs, self.expected_input_dtype)) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/core/models/torch/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/core/models/torch/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/core/models/torch/__pycache__/base.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/core/models/torch/__pycache__/base.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7b3dbfebf5a60865a7461b0d733bdd14325d1f13 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/core/models/torch/__pycache__/base.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/core/models/torch/__pycache__/encoder.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/core/models/torch/__pycache__/encoder.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..360e8126a45a5d1f600fa14ea245923d7607c632 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/core/models/torch/__pycache__/encoder.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/core/models/torch/__pycache__/primitives.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/core/models/torch/__pycache__/primitives.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..732b2eadcac1d9b2ec49e9485c083da0260e0849 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/core/models/torch/__pycache__/primitives.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/core/models/torch/base.py 
b/.venv/lib/python3.11/site-packages/ray/rllib/core/models/torch/base.py new file mode 100644 index 0000000000000000000000000000000000000000..a737ed6cfc9126fc6fce7080ab224b50c3dcc56f --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/core/models/torch/base.py @@ -0,0 +1,98 @@ +import abc +import logging +from typing import Tuple, Union + +import numpy as np + +from ray.rllib.core.models.base import Model +from ray.rllib.core.models.configs import ModelConfig +from ray.rllib.utils.annotations import override +from ray.rllib.utils.framework import try_import_torch +from ray.rllib.utils.typing import TensorType + +torch, nn = try_import_torch() + +logger = logging.getLogger(__name__) + + +class TorchModel(nn.Module, Model, abc.ABC): + """Base class for RLlib's PyTorch models. + + This class defines the interface for RLlib's PyTorch models and checks + whether inputs and outputs of forward are checked with `check_input_specs()` and + `check_output_specs()` respectively. + + Example usage for a single Flattening layer: + + .. testcode:: + + from ray.rllib.core.models.configs import ModelConfig + from ray.rllib.core.models.torch.base import TorchModel + import torch + + class FlattenModelConfig(ModelConfig): + def build(self, framework: str): + assert framework == "torch" + return TorchFlattenModel(self) + + class TorchFlattenModel(TorchModel): + def __init__(self, config): + TorchModel.__init__(self, config) + self.flatten_layer = torch.nn.Flatten() + + def _forward(self, inputs, **kwargs): + return self.flatten_layer(inputs) + + model = FlattenModelConfig().build("torch") + inputs = torch.Tensor([[[1, 2]]]) + print(model(inputs)) + + .. testoutput:: + + tensor([[1., 2.]]) + + """ + + def __init__(self, config: ModelConfig): + """Initialized a TorchModel. + + Args: + config: The ModelConfig to use. 
+ """ + nn.Module.__init__(self) + Model.__init__(self, config) + + def forward( + self, inputs: Union[dict, TensorType], **kwargs + ) -> Union[dict, TensorType]: + """Returns the output of this model for the given input. + + This method only makes sure that we have a spec-checked _forward() method. + + Args: + inputs: The input tensors. + **kwargs: Forward compatibility kwargs. + + Returns: + dict: The output tensors. + """ + return self._forward(inputs, **kwargs) + + @override(Model) + def get_num_parameters(self) -> Tuple[int, int]: + num_all_params = sum(int(np.prod(p.size())) for p in self.parameters()) + trainable_params = filter(lambda p: p.requires_grad, self.parameters()) + num_trainable_params = sum(int(np.prod(p.size())) for p in trainable_params) + return ( + num_trainable_params, + num_all_params - num_trainable_params, + ) + + @override(Model) + def _set_to_dummy_weights(self, value_sequence=(-0.02, -0.01, 0.01, 0.02)): + trainable_weights = [p for p in self.parameters() if p.requires_grad] + non_trainable_weights = [p for p in self.parameters() if not p.requires_grad] + for i, w in enumerate(trainable_weights + non_trainable_weights): + fill_val = value_sequence[i % len(value_sequence)] + with torch.no_grad(): + w.fill_(fill_val) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/core/models/torch/encoder.py b/.venv/lib/python3.11/site-packages/ray/rllib/core/models/torch/encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..82812e43fc61ec4dc80f8395735696cc25935493 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/core/models/torch/encoder.py @@ -0,0 +1,284 @@ +import tree + +from ray.rllib.core.columns import Columns +from ray.rllib.core.models.base import ( + Encoder, + ActorCriticEncoder, + StatefulActorCriticEncoder, + ENCODER_OUT, +) +from ray.rllib.core.models.base import Model, tokenize +from ray.rllib.core.models.configs import ( + ActorCriticEncoderConfig, + CNNEncoderConfig, + 
MLPEncoderConfig, + RecurrentEncoderConfig, +) +from ray.rllib.core.models.torch.base import TorchModel +from ray.rllib.core.models.torch.primitives import TorchMLP, TorchCNN +from ray.rllib.models.utils import get_initializer_fn +from ray.rllib.utils.annotations import override +from ray.rllib.utils.framework import try_import_torch + +torch, nn = try_import_torch() + + +class TorchActorCriticEncoder(TorchModel, ActorCriticEncoder): + """An actor-critic encoder for torch.""" + + framework = "torch" + + def __init__(self, config: ActorCriticEncoderConfig) -> None: + TorchModel.__init__(self, config) + ActorCriticEncoder.__init__(self, config) + + +class TorchStatefulActorCriticEncoder(TorchModel, StatefulActorCriticEncoder): + """A stateful actor-critic encoder for torch.""" + + framework = "torch" + + def __init__(self, config: ActorCriticEncoderConfig) -> None: + TorchModel.__init__(self, config) + StatefulActorCriticEncoder.__init__(self, config) + + +class TorchMLPEncoder(TorchModel, Encoder): + def __init__(self, config: MLPEncoderConfig) -> None: + TorchModel.__init__(self, config) + Encoder.__init__(self, config) + + # Create the neural network. 
+ self.net = TorchMLP( + input_dim=config.input_dims[0], + hidden_layer_dims=config.hidden_layer_dims, + hidden_layer_activation=config.hidden_layer_activation, + hidden_layer_use_layernorm=config.hidden_layer_use_layernorm, + hidden_layer_use_bias=config.hidden_layer_use_bias, + hidden_layer_weights_initializer=config.hidden_layer_weights_initializer, + hidden_layer_weights_initializer_config=( + config.hidden_layer_weights_initializer_config + ), + hidden_layer_bias_initializer=config.hidden_layer_bias_initializer, + hidden_layer_bias_initializer_config=( + config.hidden_layer_bias_initializer_config + ), + output_dim=config.output_layer_dim, + output_activation=config.output_layer_activation, + output_use_bias=config.output_layer_use_bias, + output_weights_initializer=config.output_layer_weights_initializer, + output_weights_initializer_config=( + config.output_layer_weights_initializer_config + ), + output_bias_initializer=config.output_layer_bias_initializer, + output_bias_initializer_config=config.output_layer_bias_initializer_config, + ) + + @override(Model) + def _forward(self, inputs: dict, **kwargs) -> dict: + return {ENCODER_OUT: self.net(inputs[Columns.OBS])} + + +class TorchCNNEncoder(TorchModel, Encoder): + def __init__(self, config: CNNEncoderConfig) -> None: + TorchModel.__init__(self, config) + Encoder.__init__(self, config) + + layers = [] + # The bare-bones CNN (no flatten, no succeeding dense). + cnn = TorchCNN( + input_dims=config.input_dims, + cnn_filter_specifiers=config.cnn_filter_specifiers, + cnn_activation=config.cnn_activation, + cnn_use_layernorm=config.cnn_use_layernorm, + cnn_use_bias=config.cnn_use_bias, + cnn_kernel_initializer=config.cnn_kernel_initializer, + cnn_kernel_initializer_config=config.cnn_kernel_initializer_config, + cnn_bias_initializer=config.cnn_bias_initializer, + cnn_bias_initializer_config=config.cnn_bias_initializer_config, + ) + layers.append(cnn) + + # Add a flatten operation to move from 2/3D into 1D space. 
+        if config.flatten_at_end:
+            layers.append(nn.Flatten())
+
+        # Create the network from gathered layers.
+        self.net = nn.Sequential(*layers)
+
+    @override(Model)
+    def _forward(self, inputs: dict, **kwargs) -> dict:
+        return {ENCODER_OUT: self.net(inputs[Columns.OBS])}
+
+
+class TorchGRUEncoder(TorchModel, Encoder):
+    """A recurrent GRU encoder.
+
+    This encoder has...
+    - Zero or one tokenizers.
+    - One or more GRU layers.
+    """
+
+    def __init__(self, config: RecurrentEncoderConfig) -> None:
+        TorchModel.__init__(self, config)
+
+        # Maybe create a tokenizer
+        if config.tokenizer_config is not None:
+            self.tokenizer = config.tokenizer_config.build(framework="torch")
+            gru_input_dims = config.tokenizer_config.output_dims
+        else:
+            self.tokenizer = None
+            gru_input_dims = config.input_dims
+
+        # We only support 1D spaces right now.
+        assert len(gru_input_dims) == 1
+        gru_input_dim = gru_input_dims[0]
+
+        gru_weights_initializer = get_initializer_fn(
+            config.hidden_weights_initializer, framework="torch"
+        )
+        gru_bias_initializer = get_initializer_fn(
+            config.hidden_bias_initializer, framework="torch"
+        )
+
+        # Create the torch GRU layer.
+        self.gru = nn.GRU(
+            gru_input_dim,
+            config.hidden_dim,
+            config.num_layers,
+            batch_first=config.batch_major,
+            bias=config.use_bias,
+        )
+
+        # Initialize GRU layer weights and biases, if necessary.
+        # NOTE(review): `nn.GRU` exposes no `.weight`/`.bias` attribute (its
+        # per-layer parameters are `weight_ih_l[k]`, `bias_ih_l[k]`, etc.), and
+        # the bias initializer was previously applied to a weight tensor.
+        # Iterate `all_weights` exactly as the LSTM encoder in this file does.
+        for layer in self.gru.all_weights:
+            if gru_weights_initializer:
+                gru_weights_initializer(
+                    layer[0], **config.hidden_weights_initializer_config or {}
+                )
+                gru_weights_initializer(
+                    layer[1], **config.hidden_weights_initializer_config or {}
+                )
+            if gru_bias_initializer:
+                gru_bias_initializer(
+                    layer[2], **config.hidden_bias_initializer_config or {}
+                )
+                gru_bias_initializer(
+                    layer[3], **config.hidden_bias_initializer_config or {}
+                )
+
+    @override(Model)
+    def get_initial_state(self):
+        return {
+            "h": torch.zeros(self.config.num_layers, self.config.hidden_dim),
+        }
+
+    @override(Model)
+    def _forward(self, inputs: dict, **kwargs) -> dict:
+        outputs = {}
+
+        if self.tokenizer is not None:
+            # Push observations through the tokenizer encoder if we built one.
+ out = tokenize(self.tokenizer, inputs, framework="torch") + else: + # Otherwise, just use the raw observations. + out = inputs[Columns.OBS].float() + + # States are batch-first when coming in. Make them layers-first. + states_in = tree.map_structure( + lambda s: s.transpose(0, 1), inputs[Columns.STATE_IN] + ) + + out, states_out = self.gru(out, states_in["h"]) + states_out = {"h": states_out} + + # Insert them into the output dict. + outputs[ENCODER_OUT] = out + outputs[Columns.STATE_OUT] = tree.map_structure( + lambda s: s.transpose(0, 1), states_out + ) + return outputs + + +class TorchLSTMEncoder(TorchModel, Encoder): + """A recurrent LSTM encoder. + + This encoder has... + - Zero or one tokenizers. + - One or more LSTM layers. + """ + + def __init__(self, config: RecurrentEncoderConfig) -> None: + TorchModel.__init__(self, config) + + # Maybe create a tokenizer + if config.tokenizer_config is not None: + self.tokenizer = config.tokenizer_config.build(framework="torch") + lstm_input_dims = config.tokenizer_config.output_dims + else: + self.tokenizer = None + lstm_input_dims = config.input_dims + + # We only support 1D spaces right now. + assert len(lstm_input_dims) == 1 + lstm_input_dim = lstm_input_dims[0] + + lstm_weights_initializer = get_initializer_fn( + config.hidden_weights_initializer, framework="torch" + ) + lstm_bias_initializer = get_initializer_fn( + config.hidden_bias_initializer, framework="torch" + ) + + # Create the torch LSTM layer. + self.lstm = nn.LSTM( + lstm_input_dim, + config.hidden_dim, + config.num_layers, + batch_first=config.batch_major, + bias=config.use_bias, + ) + + # Initialize LSTM layer weigths and biases, if necessary. 
+ for layer in self.lstm.all_weights: + if lstm_weights_initializer: + lstm_weights_initializer( + layer[0], **config.hidden_weights_initializer_config or {} + ) + lstm_weights_initializer( + layer[1], **config.hidden_weights_initializer_config or {} + ) + if lstm_bias_initializer: + lstm_bias_initializer( + layer[2], **config.hidden_bias_initializer_config or {} + ) + lstm_bias_initializer( + layer[3], **config.hidden_bias_initializer_config or {} + ) + + @override(Model) + def get_initial_state(self): + return { + "h": torch.zeros(self.config.num_layers, self.config.hidden_dim), + "c": torch.zeros(self.config.num_layers, self.config.hidden_dim), + } + + @override(Model) + def _forward(self, inputs: dict, **kwargs) -> dict: + outputs = {} + + if self.tokenizer is not None: + # Push observations through the tokenizer encoder if we built one. + out = tokenize(self.tokenizer, inputs, framework="torch") + else: + # Otherwise, just use the raw observations. + out = inputs[Columns.OBS].float() + + # States are batch-first when coming in. Make them layers-first. + states_in = tree.map_structure( + lambda s: s.transpose(0, 1), inputs[Columns.STATE_IN] + ) + + out, states_out = self.lstm(out, (states_in["h"], states_in["c"])) + states_out = {"h": states_out[0], "c": states_out[1]} + + # Insert them into the output dict. 
+ outputs[ENCODER_OUT] = out + outputs[Columns.STATE_OUT] = tree.map_structure( + lambda s: s.transpose(0, 1), states_out + ) + return outputs diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/core/models/torch/heads.py b/.venv/lib/python3.11/site-packages/ray/rllib/core/models/torch/heads.py new file mode 100644 index 0000000000000000000000000000000000000000..844c40c4a44cbe0617c5008b0464acc20ed17653 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/core/models/torch/heads.py @@ -0,0 +1,197 @@ +import numpy as np + +from ray.rllib.core.models.base import Model +from ray.rllib.core.models.configs import ( + CNNTransposeHeadConfig, + FreeLogStdMLPHeadConfig, + MLPHeadConfig, +) +from ray.rllib.core.models.torch.base import TorchModel +from ray.rllib.core.models.torch.primitives import TorchCNNTranspose, TorchMLP +from ray.rllib.models.utils import get_initializer_fn +from ray.rllib.utils.annotations import override +from ray.rllib.utils.framework import try_import_torch + +torch, nn = try_import_torch() + + +class TorchMLPHead(TorchModel): + def __init__(self, config: MLPHeadConfig) -> None: + super().__init__(config) + + self.net = TorchMLP( + input_dim=config.input_dims[0], + hidden_layer_dims=config.hidden_layer_dims, + hidden_layer_activation=config.hidden_layer_activation, + hidden_layer_use_layernorm=config.hidden_layer_use_layernorm, + hidden_layer_use_bias=config.hidden_layer_use_bias, + hidden_layer_weights_initializer=config.hidden_layer_weights_initializer, + hidden_layer_weights_initializer_config=( + config.hidden_layer_weights_initializer_config + ), + hidden_layer_bias_initializer=config.hidden_layer_bias_initializer, + hidden_layer_bias_initializer_config=( + config.hidden_layer_bias_initializer_config + ), + output_dim=config.output_layer_dim, + output_activation=config.output_layer_activation, + output_use_bias=config.output_layer_use_bias, + output_weights_initializer=config.output_layer_weights_initializer, + 
output_weights_initializer_config=( + config.output_layer_weights_initializer_config + ), + output_bias_initializer=config.output_layer_bias_initializer, + output_bias_initializer_config=config.output_layer_bias_initializer_config, + ) + # If log standard deviations should be clipped. This should be only true for + # policy heads. Value heads should never be clipped. + self.clip_log_std = config.clip_log_std + # The clipping parameter for the log standard deviation. + self.log_std_clip_param = torch.Tensor([config.log_std_clip_param]) + # Register a buffer to handle device mapping. + self.register_buffer("log_std_clip_param_const", self.log_std_clip_param) + + @override(Model) + def _forward(self, inputs: torch.Tensor, **kwargs) -> torch.Tensor: + # Only clip the log standard deviations, if the user wants to clip. This + # avoids also clipping value heads. + if self.clip_log_std: + # Forward pass. + means, log_stds = torch.chunk(self.net(inputs), chunks=2, dim=-1) + # Clip the log standard deviations. + log_stds = torch.clamp( + log_stds, -self.log_std_clip_param_const, self.log_std_clip_param_const + ) + return torch.cat((means, log_stds), dim=-1) + # Otherwise just return the logits. + else: + return self.net(inputs) + + +class TorchFreeLogStdMLPHead(TorchModel): + """An MLPHead that implements floating log stds for Gaussian distributions.""" + + def __init__(self, config: FreeLogStdMLPHeadConfig) -> None: + super().__init__(config) + + assert config.output_dims[0] % 2 == 0, "output_dims must be even for free std!" 
+ self._half_output_dim = config.output_dims[0] // 2 + + self.net = TorchMLP( + input_dim=config.input_dims[0], + hidden_layer_dims=config.hidden_layer_dims, + hidden_layer_activation=config.hidden_layer_activation, + hidden_layer_use_layernorm=config.hidden_layer_use_layernorm, + hidden_layer_use_bias=config.hidden_layer_use_bias, + hidden_layer_weights_initializer=config.hidden_layer_weights_initializer, + hidden_layer_weights_initializer_config=( + config.hidden_layer_weights_initializer_config + ), + hidden_layer_bias_initializer=config.hidden_layer_bias_initializer, + hidden_layer_bias_initializer_config=( + config.hidden_layer_bias_initializer_config + ), + output_dim=self._half_output_dim, + output_activation=config.output_layer_activation, + output_use_bias=config.output_layer_use_bias, + output_weights_initializer=config.output_layer_weights_initializer, + output_weights_initializer_config=( + config.output_layer_weights_initializer_config + ), + output_bias_initializer=config.output_layer_bias_initializer, + output_bias_initializer_config=config.output_layer_bias_initializer_config, + ) + + self.log_std = torch.nn.Parameter( + torch.as_tensor([0.0] * self._half_output_dim) + ) + # If log standard deviations should be clipped. This should be only true for + # policy heads. Value heads should never be clipped. + self.clip_log_std = config.clip_log_std + # The clipping parameter for the log standard deviation. + self.log_std_clip_param = torch.Tensor( + [config.log_std_clip_param], device=self.log_std.device + ) + # Register a buffer to handle device mapping. + self.register_buffer("log_std_clip_param_const", self.log_std_clip_param) + + @override(Model) + def _forward(self, inputs: torch.Tensor, **kwargs) -> torch.Tensor: + # Compute the mean first, then append the log_std. + mean = self.net(inputs) + + # If log standard deviation should be clipped. 
+        if self.clip_log_std:
+            # Clip the log standard deviation to avoid running into too small
+            # deviations that factually collapse the policy.
+            log_std = torch.clamp(
+                self.log_std,
+                -self.log_std_clip_param_const,
+                self.log_std_clip_param_const,
+            )
+        else:
+            log_std = self.log_std
+
+        return torch.cat([mean, log_std.unsqueeze(0).repeat([len(mean), 1])], axis=1)
+
+
+class TorchCNNTransposeHead(TorchModel):
+    def __init__(self, config: CNNTransposeHeadConfig) -> None:
+        super().__init__(config)
+
+        # Initial, inactivated Dense layer (always w/ bias).
+        # This layer is responsible for getting the incoming tensor into a proper
+        # initial image shape (w x h x filters) for the succeeding Conv2DTranspose
+        # stack.
+        self.initial_dense = nn.Linear(
+            in_features=config.input_dims[0],
+            out_features=int(np.prod(config.initial_image_dims)),
+            bias=True,
+        )
+
+        # Initial Dense layer initializers.
+        initial_dense_weights_initializer = get_initializer_fn(
+            config.initial_dense_weights_initializer, framework="torch"
+        )
+        initial_dense_bias_initializer = get_initializer_fn(
+            config.initial_dense_bias_initializer, framework="torch"
+        )
+
+        # Initialize dense layer weights, if necessary.
+        if initial_dense_weights_initializer:
+            initial_dense_weights_initializer(
+                self.initial_dense.weight,
+                **config.initial_dense_weights_initializer_config or {},
+            )
+        # Initialize dense layer bias, if necessary.
+        if initial_dense_bias_initializer:
+            initial_dense_bias_initializer(
+                self.initial_dense.bias,
+                **config.initial_dense_bias_initializer_config or {},
+            )
+
+        # The main CNNTranspose stack.
+ self.cnn_transpose_net = TorchCNNTranspose( + input_dims=config.initial_image_dims, + cnn_transpose_filter_specifiers=config.cnn_transpose_filter_specifiers, + cnn_transpose_activation=config.cnn_transpose_activation, + cnn_transpose_use_layernorm=config.cnn_transpose_use_layernorm, + cnn_transpose_use_bias=config.cnn_transpose_use_bias, + cnn_transpose_kernel_initializer=config.cnn_transpose_kernel_initializer, + cnn_transpose_kernel_initializer_config=( + config.cnn_transpose_kernel_initializer_config + ), + cnn_transpose_bias_initializer=config.cnn_transpose_bias_initializer, + cnn_transpose_bias_initializer_config=( + config.cnn_transpose_bias_initializer_config + ), + ) + + @override(Model) + def _forward(self, inputs: torch.Tensor, **kwargs) -> torch.Tensor: + out = self.initial_dense(inputs) + # Reshape to initial 3D (image-like) format to enter CNN transpose stack. + out = out.reshape((-1,) + tuple(self.config.initial_image_dims)) + out = self.cnn_transpose_net(out) + # Add 0.5 to center (always non-activated, non-normalized) outputs more + # around 0.0. + return out + 0.5 diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/core/models/torch/primitives.py b/.venv/lib/python3.11/site-packages/ray/rllib/core/models/torch/primitives.py new file mode 100644 index 0000000000000000000000000000000000000000..9c4e5574351093e658cf08aa387c172ad4c042a4 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/core/models/torch/primitives.py @@ -0,0 +1,479 @@ +from typing import Callable, Dict, List, Optional, Union, Tuple + +from ray.rllib.core.models.torch.utils import Stride2D +from ray.rllib.models.torch.misc import ( + same_padding, + same_padding_transpose_after_stride, + valid_padding, +) +from ray.rllib.models.utils import get_activation_fn, get_initializer_fn +from ray.rllib.utils.framework import try_import_torch + +torch, nn = try_import_torch() + + +class TorchMLP(nn.Module): + """A multi-layer perceptron with N dense layers. 
+ + All layers (except for an optional additional extra output layer) share the same + activation function, bias setup (use bias or not), and LayerNorm setup + (use layer normalization or not). + + If `output_dim` (int) is not None, an additional, extra output dense layer is added, + which might have its own activation function (e.g. "linear"). However, the output + layer does NOT use layer normalization. + """ + + def __init__( + self, + *, + input_dim: int, + hidden_layer_dims: List[int], + hidden_layer_activation: Union[str, Callable] = "relu", + hidden_layer_use_bias: bool = True, + hidden_layer_use_layernorm: bool = False, + hidden_layer_weights_initializer: Optional[Union[str, Callable]] = None, + hidden_layer_weights_initializer_config: Optional[Union[str, Callable]] = None, + hidden_layer_bias_initializer: Optional[Union[str, Callable]] = None, + hidden_layer_bias_initializer_config: Optional[Dict] = None, + output_dim: Optional[int] = None, + output_use_bias: bool = True, + output_activation: Union[str, Callable] = "linear", + output_weights_initializer: Optional[Union[str, Callable]] = None, + output_weights_initializer_config: Optional[Dict] = None, + output_bias_initializer: Optional[Union[str, Callable]] = None, + output_bias_initializer_config: Optional[Dict] = None, + ): + """Initialize a TorchMLP object. + + Args: + input_dim: The input dimension of the network. Must not be None. + hidden_layer_dims: The sizes of the hidden layers. If an empty list, only a + single layer will be built of size `output_dim`. + hidden_layer_use_layernorm: Whether to insert a LayerNormalization + functionality in between each hidden layer's output and its activation. + hidden_layer_use_bias: Whether to use bias on all dense layers (excluding + the possible separate output layer). + hidden_layer_activation: The activation function to use after each layer + (except for the output). 
Either a torch.nn.[activation fn] callable or
+                the name thereof, or an RLlib recognized activation name,
+                e.g. "ReLU", "relu", "tanh", "SiLU", or "linear".
+            hidden_layer_weights_initializer: The initializer function or class to use
+                for weights initialization in the hidden layers. If `None` the default
+                initializer of the respective dense layer is used. Note, only the
+                in-place initializers, i.e. ending with an underscore "_" are allowed.
+            hidden_layer_weights_initializer_config: Configuration to pass into the
+                initializer defined in `hidden_layer_weights_initializer`.
+            hidden_layer_bias_initializer: The initializer function or class to use for
+                bias initialization in the hidden layers. If `None` the default
+                initializer of the respective dense layer is used. Note, only the
+                in-place initializers, i.e. ending with an underscore "_" are allowed.
+            hidden_layer_bias_initializer_config: Configuration to pass into the
+                initializer defined in `hidden_layer_bias_initializer`.
+            output_dim: The output dimension of the network. If None, no specific output
+                layer will be added and the last layer in the stack will have
+                size=`hidden_layer_dims[-1]`.
+            output_use_bias: Whether to use bias on the separate output layer,
+                if any.
+            output_activation: The activation function to use for the output layer
+                (if any). Either a torch.nn.[activation fn] callable or
+                the name thereof, or an RLlib recognized activation name,
+                e.g. "ReLU", "relu", "tanh", "SiLU", or "linear".
+            output_weights_initializer: The initializer function or class to use
+                for weights initialization in the output layers. If `None` the default
+                initializer of the respective dense layer is used. Note, only the
+                in-place initializers, i.e. ending with an underscore "_" are allowed.
+            output_weights_initializer_config: Configuration to pass into the
+                initializer defined in `output_weights_initializer`.
+ output_layer_bias_initializer: The initializer function or class to use for + bias initialization in the output layers. If `None` the default + initializer of the respective dense layer is used. Note, only the + in-place initializers, i.e. ending with an underscore "_" are allowed. + output_layer_bias_initializer_config: Configuration to pass into the + initializer defined in `output_layer_bias_initializer`. + """ + super().__init__() + assert input_dim > 0 + + self.input_dim = input_dim + + hidden_activation = get_activation_fn( + hidden_layer_activation, framework="torch" + ) + hidden_weights_initializer = get_initializer_fn( + hidden_layer_weights_initializer, framework="torch" + ) + hidden_bias_initializer = get_initializer_fn( + hidden_layer_bias_initializer, framework="torch" + ) + output_weights_initializer = get_initializer_fn( + output_weights_initializer, framework="torch" + ) + output_bias_initializer = get_initializer_fn( + output_bias_initializer, framework="torch" + ) + + layers = [] + dims = ( + [self.input_dim] + + list(hidden_layer_dims) + + ([output_dim] if output_dim else []) + ) + for i in range(0, len(dims) - 1): + # Whether we are already processing the last (special) output layer. + is_output_layer = output_dim is not None and i == len(dims) - 2 + + layer = nn.Linear( + dims[i], + dims[i + 1], + bias=output_use_bias if is_output_layer else hidden_layer_use_bias, + ) + # Initialize layers, if necessary. + if is_output_layer: + # Initialize output layer weigths if necessary. + if output_weights_initializer: + output_weights_initializer( + layer.weight, **output_weights_initializer_config or {} + ) + # Initialize output layer bias if necessary. + if output_bias_initializer: + output_bias_initializer( + layer.bias, **output_bias_initializer_config or {} + ) + # Must be hidden. + else: + # Initialize hidden layer weights if necessary. 
+ if hidden_layer_weights_initializer: + hidden_weights_initializer( + layer.weight, **hidden_layer_weights_initializer_config or {} + ) + # Initialize hidden layer bias if necessary. + if hidden_layer_bias_initializer: + hidden_bias_initializer( + layer.bias, **hidden_layer_bias_initializer_config or {} + ) + + layers.append(layer) + + # We are still in the hidden layer section: Possibly add layernorm and + # hidden activation. + if not is_output_layer: + # Insert a layer normalization in between layer's output and + # the activation. + if hidden_layer_use_layernorm: + # We use an epsilon of 0.001 here to mimick the Tf default behavior. + layers.append(nn.LayerNorm(dims[i + 1], eps=0.001)) + # Add the activation function. + if hidden_activation is not None: + layers.append(hidden_activation()) + + # Add output layer's (if any) activation. + output_activation = get_activation_fn(output_activation, framework="torch") + if output_dim is not None and output_activation is not None: + layers.append(output_activation()) + + self.mlp = nn.Sequential(*layers) + + def forward(self, x): + return self.mlp(x) + + +class TorchCNN(nn.Module): + """A model containing a CNN with N Conv2D layers. + + All layers share the same activation function, bias setup (use bias or not), + and LayerNorm setup (use layer normalization or not). + + Note that there is no flattening nor an additional dense layer at the end of the + stack. The output of the network is a 3D tensor of dimensions + [width x height x num output filters]. 
+ """ + + def __init__( + self, + *, + input_dims: Union[List[int], Tuple[int]], + cnn_filter_specifiers: List[List[Union[int, List]]], + cnn_use_bias: bool = True, + cnn_use_layernorm: bool = False, + cnn_activation: str = "relu", + cnn_kernel_initializer: Optional[Union[str, Callable]] = None, + cnn_kernel_initializer_config: Optional[Dict] = None, + cnn_bias_initializer: Optional[Union[str, Callable]] = None, + cnn_bias_initializer_config: Optional[Dict] = None, + ): + """Initializes a TorchCNN instance. + + Args: + input_dims: The 3D input dimensions of the network (incoming image). + cnn_filter_specifiers: A list in which each element is another (inner) list + of either the following forms: + `[number of channels/filters, kernel, stride]` + OR: + `[number of channels/filters, kernel, stride, padding]`, where `padding` + can either be "same" or "valid". + When using the first format w/o the `padding` specifier, `padding` is + "same" by default. Also, `kernel` and `stride` may be provided either as + single ints (square) or as a tuple/list of two ints (width- and height + dimensions) for non-squared kernel/stride shapes. + A good rule of thumb for constructing CNN stacks is: + When using padding="same", the input "image" will be reduced in size by + the factor `stride`, e.g. input=(84, 84, 3) stride=2 kernel=x + padding="same" filters=16 -> output=(42, 42, 16). + For example, if you would like to reduce an Atari image from its + original (84, 84, 3) dimensions down to (6, 6, F), you can construct the + following stack and reduce the w x h dimension of the image by 2 in each + layer: + [[16, 4, 2], [32, 4, 2], [64, 4, 2], [128, 4, 2]] -> output=(6, 6, 128) + cnn_use_bias: Whether to use bias on all Conv2D layers. + cnn_activation: The activation function to use after each Conv2D layer. + cnn_use_layernorm: Whether to insert a LayerNormalization functionality + in between each Conv2D layer's outputs and its activation. 
+ cnn_kernel_initializer: The initializer function or class to use for kernel + initialization in the CNN layers. If `None` the default initializer of + the respective CNN layer is used. Note, only the in-place + initializers, i.e. ending with an underscore "_" are allowed. + cnn_kernel_initializer_config: Configuration to pass into the initializer + defined in `cnn_kernel_initializer`. + cnn_bias_initializer: The initializer function or class to use for bias + initializationcin the CNN layers. If `None` the default initializer of + the respective CNN layer is used. Note, only the in-place initializers, + i.e. ending with an underscore "_" are allowed. + cnn_bias_initializer_config: Configuration to pass into the initializer + defined in `cnn_bias_initializer`. + """ + super().__init__() + + assert len(input_dims) == 3 + + cnn_activation = get_activation_fn(cnn_activation, framework="torch") + cnn_kernel_initializer = get_initializer_fn( + cnn_kernel_initializer, framework="torch" + ) + cnn_bias_initializer = get_initializer_fn( + cnn_bias_initializer, framework="torch" + ) + layers = [] + + # Add user-specified hidden convolutional layers first + width, height, in_depth = input_dims + in_size = [width, height] + for filter_specs in cnn_filter_specifiers: + # Padding information not provided -> Use "same" as default. + if len(filter_specs) == 3: + out_depth, kernel_size, strides = filter_specs + padding = "same" + # Padding information provided. + else: + out_depth, kernel_size, strides, padding = filter_specs + + # Pad like in tensorflow's SAME/VALID mode. + if padding == "same": + padding_size, out_size = same_padding(in_size, kernel_size, strides) + layers.append(nn.ZeroPad2d(padding_size)) + # No actual padding is performed for "valid" mode, but we will still + # compute the output size (input for the next layer). 
+ else: + out_size = valid_padding(in_size, kernel_size, strides) + + layer = nn.Conv2d( + in_depth, out_depth, kernel_size, strides, bias=cnn_use_bias + ) + + # Initialize CNN layer kernel if necessary. + if cnn_kernel_initializer: + cnn_kernel_initializer( + layer.weight, **cnn_kernel_initializer_config or {} + ) + # Initialize CNN layer bias if necessary. + if cnn_bias_initializer: + cnn_bias_initializer(layer.bias, **cnn_bias_initializer_config or {}) + + layers.append(layer) + + # Layernorm. + if cnn_use_layernorm: + # We use an epsilon of 0.001 here to mimick the Tf default behavior. + layers.append(LayerNorm1D(out_depth, eps=0.001)) + # Activation. + if cnn_activation is not None: + layers.append(cnn_activation()) + + in_size = out_size + in_depth = out_depth + + # Create the CNN. + self.cnn = nn.Sequential(*layers) + + def forward(self, inputs): + # Permute b/c data comes in as channels_last ([B, dim, dim, channels]) -> + # Convert to `channels_first` for torch: + inputs = inputs.permute(0, 3, 1, 2) + out = self.cnn(inputs) + # Permute back to `channels_last`. + return out.permute(0, 2, 3, 1) + + +class TorchCNNTranspose(nn.Module): + """A model containing a CNNTranspose with N Conv2DTranspose layers. + + All layers share the same activation function, bias setup (use bias or not), + and LayerNormalization setup (use layer normalization or not), except for the last + one, which is never activated and never layer norm'd. + + Note that there is no reshaping/flattening nor an additional dense layer at the + beginning or end of the stack. The input as well as output of the network are 3D + tensors of dimensions [width x height x num output filters]. 
+ """ + + def __init__( + self, + *, + input_dims: Union[List[int], Tuple[int]], + cnn_transpose_filter_specifiers: List[List[Union[int, List]]], + cnn_transpose_use_bias: bool = True, + cnn_transpose_activation: str = "relu", + cnn_transpose_use_layernorm: bool = False, + cnn_transpose_kernel_initializer: Optional[Union[str, Callable]] = None, + cnn_transpose_kernel_initializer_config: Optional[Dict] = None, + cnn_transpose_bias_initializer: Optional[Union[str, Callable]] = None, + cnn_transpose_bias_initializer_config: Optional[Dict] = None, + ): + """Initializes a TorchCNNTranspose instance. + + Args: + input_dims: The 3D input dimensions of the network (incoming image). + cnn_transpose_filter_specifiers: A list of lists, where each item represents + one Conv2DTranspose layer. Each such Conv2DTranspose layer is further + specified by the elements of the inner lists. The inner lists follow + the format: `[number of filters, kernel, stride]` to + specify a convolutional-transpose layer stacked in order of the + outer list. + `kernel` as well as `stride` might be provided as width x height tuples + OR as single ints representing both dimension (width and height) + in case of square shapes. + cnn_transpose_use_bias: Whether to use bias on all Conv2DTranspose layers. + cnn_transpose_use_layernorm: Whether to insert a LayerNormalization + functionality in between each Conv2DTranspose layer's outputs and its + activation. + The last Conv2DTranspose layer will not be normed, regardless. + cnn_transpose_activation: The activation function to use after each layer + (except for the last Conv2DTranspose layer, which is always + non-activated). + cnn_transpose_kernel_initializer: The initializer function or class to use + for kernel initialization in the CNN layers. If `None` the default + initializer of the respective CNN layer is used. Note, only the + in-place initializers, i.e. ending with an underscore "_" are allowed. 
+ cnn_transpose_kernel_initializer_config: Configuration to pass into the + initializer defined in `cnn_transpose_kernel_initializer`. + cnn_transpose_bias_initializer: The initializer function or class to use for + bias initialization in the CNN layers. If `None` the default initializer + of the respective CNN layer is used. Note, only the in-place + initializers, i.e. ending with an underscore "_" are allowed. + cnn_transpose_bias_initializer_config: Configuration to pass into the + initializer defined in `cnn_transpose_bias_initializer`. + """ + super().__init__() + + assert len(input_dims) == 3 + + cnn_transpose_activation = get_activation_fn( + cnn_transpose_activation, framework="torch" + ) + cnn_transpose_kernel_initializer = get_initializer_fn( + cnn_transpose_kernel_initializer, framework="torch" + ) + cnn_transpose_bias_initializer = get_initializer_fn( + cnn_transpose_bias_initializer, framework="torch" + ) + + layers = [] + + # Add user-specified hidden convolutional layers first + width, height, in_depth = input_dims + in_size = [width, height] + for i, (out_depth, kernel, stride) in enumerate( + cnn_transpose_filter_specifiers + ): + is_final_layer = i == len(cnn_transpose_filter_specifiers) - 1 + + # Resolve stride and kernel width/height values if only int given (squared). + s_w, s_h = (stride, stride) if isinstance(stride, int) else stride + k_w, k_h = (kernel, kernel) if isinstance(kernel, int) else kernel + + # Stride the incoming image first. + stride_layer = Stride2D(in_size[0], in_size[1], s_w, s_h) + layers.append(stride_layer) + # Then 0-pad (like in tensorflow's SAME mode). + # This will return the necessary padding such that for stride=1, the output + # image has the same size as the input image, for stride=2, the output image + # is 2x the input image, etc.. 
+ padding, out_size = same_padding_transpose_after_stride( + (stride_layer.out_width, stride_layer.out_height), kernel, stride + ) + layers.append(nn.ZeroPad2d(padding)) # left, right, top, bottom + # Then do the Conv2DTranspose operation + # (now that we have padded and strided manually, w/o any more padding using + # stride=1). + + layer = nn.ConvTranspose2d( + in_depth, + out_depth, + kernel, + # Force-set stride to 1 as we already took care of it. + 1, + # Disable torch auto-padding (torch interprets the padding setting + # as: dilation (==1.0) * [`kernel` - 1] - [`padding`]). + padding=(k_w - 1, k_h - 1), + # Last layer always uses bias (b/c has no LayerNorm, regardless of + # config). + bias=cnn_transpose_use_bias or is_final_layer, + ) + + # Initialize CNN Transpose layer kernel if necessary. + if cnn_transpose_kernel_initializer: + cnn_transpose_kernel_initializer( + layer.weight, **cnn_transpose_kernel_initializer_config or {} + ) + # Initialize CNN Transpose layer bias if necessary. + if cnn_transpose_bias_initializer: + cnn_transpose_bias_initializer( + layer.bias, **cnn_transpose_bias_initializer_config or {} + ) + + layers.append(layer) + # Layernorm (never for final layer). + if cnn_transpose_use_layernorm and not is_final_layer: + layers.append(LayerNorm1D(out_depth, eps=0.001)) + # Last layer is never activated (regardless of config). + if cnn_transpose_activation is not None and not is_final_layer: + layers.append(cnn_transpose_activation()) + + in_size = (out_size[0], out_size[1]) + in_depth = out_depth + + # Create the final CNNTranspose network. 
+ self.cnn_transpose = nn.Sequential(*layers) + + def forward(self, inputs): + # Permute b/c data comes in as [B, dim, dim, channels]: + out = inputs.permute(0, 3, 1, 2) + out = self.cnn_transpose(out) + return out.permute(0, 2, 3, 1) + + +class LayerNorm1D(nn.Module): + def __init__(self, num_features, **kwargs): + super().__init__() + self.layer_norm = nn.LayerNorm(num_features, **kwargs) + + def forward(self, x): + # x shape: (B, dim, dim, channels). + batch_size, channels, h, w = x.size() + # Reshape to (batch_size * height * width, channels) for LayerNorm + x = x.permute(0, 2, 3, 1).reshape(-1, channels) + # Apply LayerNorm + x = self.layer_norm(x) + # Reshape back to (batch_size, dim, dim, channels) + x = x.reshape(batch_size, h, w, channels).permute(0, 3, 1, 2) + return x diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/core/models/torch/utils.py b/.venv/lib/python3.11/site-packages/ray/rllib/core/models/torch/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..1bdbdef016f4c8a7261ff83f14d9c70810d6a90e --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/core/models/torch/utils.py @@ -0,0 +1,85 @@ +from ray.rllib.utils.framework import try_import_torch + +torch, nn = try_import_torch() + + +class Stride2D(nn.Module): + """A striding layer for doing torch Conv2DTranspose operations. + + Using this layer before the 0-padding (on a 3D input "image") and before + the actual ConvTranspose2d allows for a padding="same" behavior that matches + 100% that of a `tf.keras.layers.Conv2DTranspose` layer. + + Examples: + Input image (4x4): + A B C D + E F G H + I J K L + M N O P + + Stride with stride=2 -> output image=(7x7) + A 0 B 0 C 0 D + 0 0 0 0 0 0 0 + E 0 F 0 G 0 H + 0 0 0 0 0 0 0 + I 0 J 0 K 0 L + 0 0 0 0 0 0 0 + M 0 N 0 O 0 P + """ + + def __init__(self, width, height, stride_w, stride_h): + """Initializes a Stride2D instance. + + Args: + width: The width of the 3D input "image". 
class Stride2D(nn.Module):
    """A striding layer for doing torch Conv2DTranspose operations.

    Using this layer before the 0-padding (on a 3D input "image") and before
    the actual ConvTranspose2d allows for a padding="same" behavior that matches
    100% that of a `tf.keras.layers.Conv2DTranspose` layer.

    Examples:
        Input image (4x4):
        A B C D
        E F G H
        I J K L
        M N O P

        Stride with stride=2 -> output image=(7x7)
        A 0 B 0 C 0 D
        0 0 0 0 0 0 0
        E 0 F 0 G 0 H
        0 0 0 0 0 0 0
        I 0 J 0 K 0 L
        0 0 0 0 0 0 0
        M 0 N 0 O 0 P
    """

    def __init__(self, width, height, stride_w, stride_h):
        """Initializes a Stride2D instance.

        Args:
            width: The width of the 3D input "image".
            height: The height of the 3D input "image".
            stride_w: The stride in width direction, with which to stride the
                incoming image.
            stride_h: The stride in height direction, with which to stride the
                incoming image.
        """
        super().__init__()

        self.width = width
        self.height = height
        self.stride_w = stride_w
        self.stride_h = stride_h

        # Output ("strided") image size: each of the n input pixels along a
        # dimension gets (stride - 1) zeros inserted after it, except the last one.
        # Using explicit positive sizes (instead of negative slicing) also makes
        # stride=1 work: the original `[:-(stride - 1)]` slicing degenerated to
        # `[:-0]` (an empty tensor) for stride=1.
        self.out_width = self.width * self.stride_w - (self.stride_w - 1)
        self.out_height = self.height * self.stride_h - (self.stride_h - 1)

        # Zero filler values, broadcast against the (B, C, W, H) input.
        self.register_buffer(
            "zeros",
            torch.zeros(
                size=(self.out_width, self.out_height), dtype=torch.float32
            )
            .unsqueeze(0)
            .unsqueeze(0),
        )

        # Boolean mask: True at every "real" pixel position (upper/left corner of
        # each stride_w x stride_h tile), False at the inserted-zero positions.
        where_template = torch.zeros(
            (self.stride_w, self.stride_h), dtype=torch.float32
        )
        # Set upper/left corner to 1.0.
        where_template[0][0] = 1.0
        # Tile across the entire (strided) image. Note the (width, height) repeat
        # order: dim 0 is the width axis (matching the zeros buffer and the
        # dim=-2 repeat in `forward`), so non-square images tile correctly.
        where_template = where_template.repeat((self.width, self.height))[
            : self.out_width, : self.out_height
        ]
        # Squeeze in batch and channel dims and convert to bool.
        self.register_buffer(
            "where_template", where_template.unsqueeze(0).unsqueeze(0).bool()
        )

    def forward(self, x):
        """Returns `x` with (stride - 1) zeros interleaved between its pixels.

        Args:
            x: Input of shape (B, C, width, height) — TODO confirm w/h axis order
                with callers; the math is symmetric either way.
        """
        # Repeat incoming image stride(w/h) times to match the strided output
        # template, then crop to the exact output size (positive slice ends so
        # that stride=1 is the identity).
        repeated_x = x.repeat_interleave(self.stride_w, dim=-2).repeat_interleave(
            self.stride_h, dim=-1
        )[:, :, : self.out_width, : self.out_height]
        # Where `self.where_template` is True -> Use image pixel, otherwise use
        # zero filler value.
        return torch.where(self.where_template, repeated_x, self.zeros)
"""Contains example implementation of a custom algorithm.

Note: It doesn't include any real use-case functionality; it only serves as an example
to test the algorithm construction and customization.
"""

from ray.rllib.algorithms import Algorithm, AlgorithmConfig
from ray.rllib.policy.torch_policy_v2 import TorchPolicyV2
from ray.rllib.policy.eager_tf_policy_v2 import EagerTFPolicyV2
from ray.rllib.core.testing.torch.bc_module import DiscreteBCTorchModule
from ray.rllib.core.testing.torch.bc_learner import BCTorchLearner
from ray.rllib.core.testing.tf.bc_module import DiscreteBCTFModule
from ray.rllib.core.testing.tf.bc_learner import BCTfLearner
from ray.rllib.core.rl_module.rl_module import RLModuleSpec
from ray.rllib.utils.annotations import override
from ray.rllib.utils.typing import ResultDict


class BCConfigTest(AlgorithmConfig):
    """Minimal AlgorithmConfig used to test custom-algorithm construction."""

    def __init__(self, algo_class=None):
        super().__init__(algo_class=algo_class or BCAlgorithmTest)

    def get_default_rl_module_spec(self):
        """Returns the framework-matching BC RLModule spec.

        Note: Falls through (returns None) for frameworks other than
        "torch"/"tf2".
        """
        framework = self.framework_str
        if framework == "torch":
            return RLModuleSpec(module_class=DiscreteBCTorchModule)
        if framework == "tf2":
            return RLModuleSpec(module_class=DiscreteBCTFModule)

    def get_default_learner_class(self):
        """Returns the framework-matching BC test Learner class (or None)."""
        framework = self.framework_str
        if framework == "torch":
            return BCTorchLearner
        if framework == "tf2":
            return BCTfLearner


class BCAlgorithmTest(Algorithm):
    """Do-nothing Algorithm used to test construction and customization."""

    @classmethod
    def get_default_policy_class(cls, config: AlgorithmConfig):
        """Returns the framework-matching (old-stack) policy class."""
        framework = config.framework_str
        if framework == "torch":
            return TorchPolicyV2
        if framework == "tf2":
            return EagerTFPolicyV2
        raise ValueError("Unknown framework: {}".format(framework))

    @override(Algorithm)
    def training_step(self) -> ResultDict:
        # do nothing.
        return {}
class BaseTestingAlgorithmConfig(AlgorithmConfig):
    """Base AlgorithmConfig for the BC testing algorithm."""

    # A test setting to activate metrics on mean weights.
    report_mean_weights: bool = True

    @override(AlgorithmConfig)
    def get_default_learner_class(self) -> Type["Learner"]:
        """Returns the framework-specific BC test Learner class.

        Raises:
            ValueError: If `self.framework_str` is neither "tf2" nor "torch".
        """
        if self.framework_str == "tf2":
            from ray.rllib.core.testing.tf.bc_learner import BCTfLearner

            return BCTfLearner
        if self.framework_str == "torch":
            from ray.rllib.core.testing.torch.bc_learner import BCTorchLearner

            return BCTorchLearner
        raise ValueError(f"Unsupported framework: {self.framework_str}")

    @override(AlgorithmConfig)
    def get_default_rl_module_spec(self) -> "RLModuleSpecType":
        """Returns the framework-specific BC test RLModule spec.

        In the multi-agent case, the single-module spec is wrapped into a
        `MultiRLModuleSpec` under the default module ID.

        Raises:
            ValueError: If `self.framework_str` is neither "tf2" nor "torch".
        """
        if self.framework_str == "tf2":
            from ray.rllib.core.testing.tf.bc_module import DiscreteBCTFModule

            module_class = DiscreteBCTFModule
        elif self.framework_str == "torch":
            from ray.rllib.core.testing.torch.bc_module import DiscreteBCTorchModule

            module_class = DiscreteBCTorchModule
        else:
            raise ValueError(f"Unsupported framework: {self.framework_str}")

        single_spec = RLModuleSpec(
            module_class=module_class,
            model_config={"fcnet_hiddens": [32]},
        )

        if not self.is_multi_agent:
            return single_spec

        # TODO (Kourosh): Make this more multi-agent for example with policy ids
        # "1" and "2".
        return MultiRLModuleSpec(
            multi_rl_module_class=MultiRLModule,
            rl_module_specs={DEFAULT_MODULE_ID: single_spec},
        )


class BaseTestingLearner(Learner):
    """Learner mixin that logs per-module mean weights after each update."""

    @override(Learner)
    def after_gradient_based_update(self, *, timesteps):
        # This is to check if in the multi-gpu case, the weights across workers are
        # the same. It is really only needed during testing.
        if not self.config.report_mean_weights:
            return
        for module_id in self.module.keys():
            module_params = convert_to_numpy(
                self.get_parameters(self.module[module_id])
            )
            mean_ws = np.mean([w.mean() for w in module_params])
            self.metrics.log_value((module_id, "mean_weight"), mean_ws, window=1)
a/.venv/lib/python3.11/site-packages/ray/rllib/core/testing/tf/bc_learner.py b/.venv/lib/python3.11/site-packages/ray/rllib/core/testing/tf/bc_learner.py new file mode 100644 index 0000000000000000000000000000000000000000..3c23d3d9732e8e56daa5d5515ef18e868c6813ba --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/core/testing/tf/bc_learner.py @@ -0,0 +1,34 @@ +import tensorflow as tf +from typing import Dict, TYPE_CHECKING + +from ray.rllib.core.columns import Columns +from ray.rllib.core.learner.tf.tf_learner import TfLearner +from ray.rllib.core.testing.testing_learner import BaseTestingLearner +from ray.rllib.utils.typing import ModuleID, TensorType + +if TYPE_CHECKING: + from ray.rllib.algorithms.algorithm_config import AlgorithmConfig + + +class BCTfLearner(TfLearner, BaseTestingLearner): + def compute_loss_for_module( + self, + *, + module_id: ModuleID, + config: "AlgorithmConfig", + batch: Dict, + fwd_out: Dict[str, TensorType], + ) -> TensorType: + BaseTestingLearner.compute_loss_for_module( + self, + module_id=module_id, + config=config, + batch=batch, + fwd_out=fwd_out, + ) + action_dist_inputs = fwd_out[Columns.ACTION_DIST_INPUTS] + action_dist_class = self._module[module_id].get_train_action_dist_cls() + action_dist = action_dist_class.from_logits(action_dist_inputs) + loss = -tf.math.reduce_mean(action_dist.logp(batch[Columns.ACTIONS])) + + return loss diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/core/testing/tf/bc_module.py b/.venv/lib/python3.11/site-packages/ray/rllib/core/testing/tf/bc_module.py new file mode 100644 index 0000000000000000000000000000000000000000..ebe0cfe361be021eff3c027bd54e4cd7e3be6620 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/core/testing/tf/bc_module.py @@ -0,0 +1,101 @@ +import tensorflow as tf +from typing import Any, Dict + +from ray.rllib.core.columns import Columns +from ray.rllib.core.rl_module.rl_module import RLModule +from ray.rllib.core.rl_module.multi_rl_module import 
class DiscreteBCTFModule(TfRLModule):
    # Simple discrete-action behavior-cloning RLModule (tf/keras version).

    def setup(self):
        """Builds the policy MLP from the spaces and the model config.

        Assumes a 1D Box observation space and a Discrete action space —
        TODO confirm with callers.
        """
        input_dim = self.observation_space.shape[0]
        hidden_dim = self.model_config["fcnet_hiddens"][0]
        output_dim = self.action_space.n
        layers = []

        # NOTE(review): The first ReLU is applied directly to the raw
        # observations (before any Dense layer); the torch counterpart
        # (DiscreteBCTorchModule) applies ReLU only after the first Linear —
        # confirm this asymmetry is intended.
        layers.append(tf.keras.Input(shape=(input_dim,)))
        layers.append(tf.keras.layers.ReLU())
        layers.append(tf.keras.layers.Dense(hidden_dim))
        layers.append(tf.keras.layers.ReLU())
        layers.append(tf.keras.layers.Dense(output_dim))

        self.policy = tf.keras.Sequential(layers)
        self._input_dim = input_dim

    def _forward(self, batch: Dict[str, Any], **kwargs) -> Dict[str, Any]:
        """Computes action logits for the observations in `batch`."""
        action_logits = self.policy(batch["obs"])
        return {Columns.ACTION_DIST_INPUTS: action_logits}

    @override(RLModule)
    def get_state(self, *args, **kwargs) -> StateDict:
        """Returns the keras policy's weights as the module's state."""
        return {"policy": self.policy.get_weights()}

    @override(RLModule)
    def set_state(self, state: StateDict) -> None:
        """Restores the keras policy's weights from `state` (see `get_state`)."""
        self.policy.set_weights(state["policy"])


class BCTfRLModuleWithSharedGlobalEncoder(TfRLModule):
    # BC module whose "global" observation part is encoded by an externally
    # provided (usually shared) encoder; the "local" part is passed through.

    def __init__(self, encoder, local_dim, hidden_dim, action_dim):
        """Initializes the module.

        Args:
            encoder: A keras model mapping the "global" obs to a `hidden_dim`
                embedding. Typically shared across several modules.
            local_dim: Size of the "local" part of the observation.
            hidden_dim: Hidden layer size of the policy head (must match the
                encoder's output size).
            action_dim: Number of discrete actions (output logits).
        """
        super().__init__()

        self.encoder = encoder
        self.policy_head = tf.keras.Sequential(
            [
                tf.keras.layers.Dense(
                    hidden_dim + local_dim,
                    input_shape=(hidden_dim + local_dim,),
                    activation="relu",
                ),
                tf.keras.layers.Dense(hidden_dim, activation="relu"),
                tf.keras.layers.Dense(action_dim),
            ]
        )

    def _forward(self, batch, **kwargs):
        """Encodes the global obs, concats the local obs, and returns logits."""
        obs = batch["obs"]
        global_enc = self.encoder(obs["global"])
        policy_in = tf.concat([global_enc, obs["local"]], axis=-1)
        action_logits = self.policy_head(policy_in)

        return {Columns.ACTION_DIST_INPUTS: action_logits}

    @override(RLModule)
    def _default_input_specs(self):
        # Requires a dict obs with "global" and "local" sub-keys.
        return [("obs", "global"), ("obs", "local")]


class BCTfMultiAgentModuleWithSharedEncoder(MultiRLModule):
    # Multi-agent container that builds ONE shared global-obs encoder and passes
    # it into every sub-module (see BCTfRLModuleWithSharedGlobalEncoder).

    def setup(self):
        # constructing the global encoder based on the observation_space of the
        # first module.
        # NOTE(review): Uses `self.config.modules` / `model_config_dict` —
        # presumably the older MultiRLModuleConfig API; verify against the
        # installed RLlib version.
        module_specs = self.config.modules
        module_spec = next(iter(module_specs.values()))
        global_dim = module_spec.observation_space["global"].shape[0]
        hidden_dim = module_spec.model_config_dict["fcnet_hiddens"][0]
        shared_encoder = tf.keras.Sequential(
            [
                tf.keras.Input(shape=(global_dim,)),
                tf.keras.layers.ReLU(),
                tf.keras.layers.Dense(hidden_dim),
            ]
        )

        # Instantiate each sub-module with the one shared encoder instance.
        for module_id, module_spec in module_specs.items():
            self._rl_modules[module_id] = module_spec.module_class(
                encoder=shared_encoder,
                local_dim=module_spec.observation_space["local"].shape[0],
                hidden_dim=hidden_dim,
                action_dim=module_spec.action_space.n,
            )

    def serialize(self):
        # TODO (Kourosh): Implement when needed.
        raise NotImplementedError

    def deserialize(self, data):
        # TODO (Kourosh): Implement when needed.
        raise NotImplementedError
0000000000000000000000000000000000000000..f4fded47b834ee3ba128dafbd877231b1d0ca525 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/core/testing/torch/__pycache__/bc_learner.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/core/testing/torch/__pycache__/bc_module.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/core/testing/torch/__pycache__/bc_module.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..36b9fe7b47afd889f624fbeeec345417f02c87dd Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/core/testing/torch/__pycache__/bc_module.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/core/testing/torch/bc_learner.py b/.venv/lib/python3.11/site-packages/ray/rllib/core/testing/torch/bc_learner.py new file mode 100644 index 0000000000000000000000000000000000000000..1c12aee7a1ee84a0d6edf995cae8963b93527aa1 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/core/testing/torch/bc_learner.py @@ -0,0 +1,34 @@ +import torch +from typing import Any, Dict, TYPE_CHECKING + +from ray.rllib.core.columns import Columns +from ray.rllib.core.learner.torch.torch_learner import TorchLearner +from ray.rllib.core.testing.testing_learner import BaseTestingLearner +from ray.rllib.utils.typing import ModuleID, TensorType + +if TYPE_CHECKING: + from ray.rllib.algorithms.algorithm_config import AlgorithmConfig + + +class BCTorchLearner(TorchLearner, BaseTestingLearner): + def compute_loss_for_module( + self, + *, + module_id: ModuleID, + config: "AlgorithmConfig", + batch: Dict[str, Any], + fwd_out: Dict[str, TensorType], + ) -> TensorType: + BaseTestingLearner.compute_loss_for_module( + self, + module_id=module_id, + config=config, + batch=batch, + fwd_out=fwd_out, + ) + action_dist_inputs = fwd_out[Columns.ACTION_DIST_INPUTS] + action_dist_class = self._module[module_id].get_train_action_dist_cls() + action_dist = 
action_dist_class.from_logits(action_dist_inputs) + loss = -torch.mean(action_dist.logp(batch[Columns.ACTIONS])) + + return loss diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/core/testing/torch/bc_module.py b/.venv/lib/python3.11/site-packages/ray/rllib/core/testing/torch/bc_module.py new file mode 100644 index 0000000000000000000000000000000000000000..d2a5d71c5c160ff696d9552a26ec9d95d35aedd4 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/core/testing/torch/bc_module.py @@ -0,0 +1,162 @@ +from typing import Any, Dict + +from ray.rllib.core.columns import Columns +from ray.rllib.core.rl_module.rl_module import RLModule +from ray.rllib.models.torch.torch_distributions import TorchCategorical +from ray.rllib.core.rl_module.multi_rl_module import MultiRLModule +from ray.rllib.core.rl_module.torch.torch_rl_module import TorchRLModule +from ray.rllib.core.models.specs.typing import SpecType +from ray.rllib.utils.annotations import override +from ray.rllib.utils.framework import try_import_torch + +torch, nn = try_import_torch() + + +class DiscreteBCTorchModule(TorchRLModule): + def setup(self): + input_dim = self.observation_space.shape[0] + hidden_dim = self.model_config["fcnet_hiddens"][0] + output_dim = self.action_space.n + + self.policy = nn.Sequential( + nn.Linear(input_dim, hidden_dim), + nn.ReLU(), + nn.Linear(hidden_dim, output_dim), + ) + + self.input_dim = input_dim + + def get_train_action_dist_cls(self): + return TorchCategorical + + def get_exploration_action_dist_cls(self): + return TorchCategorical + + def get_inference_action_dist_cls(self): + return TorchCategorical + + @override(RLModule) + def output_specs_exploration(self) -> SpecType: + return [Columns.ACTION_DIST_INPUTS] + + @override(RLModule) + def output_specs_inference(self) -> SpecType: + return [Columns.ACTION_DIST_INPUTS] + + @override(RLModule) + def output_specs_train(self) -> SpecType: + return [Columns.ACTION_DIST_INPUTS] + + @override(RLModule) + def 
_forward_inference(self, batch: Dict[str, Any]) -> Dict[str, Any]: + with torch.no_grad(): + return self._forward_train(batch) + + @override(RLModule) + def _forward_exploration(self, batch: Dict[str, Any]) -> Dict[str, Any]: + with torch.no_grad(): + return self._forward_train(batch) + + @override(RLModule) + def _forward_train(self, batch: Dict[str, Any]) -> Dict[str, Any]: + action_logits = self.policy(batch["obs"]) + return {Columns.ACTION_DIST_INPUTS: action_logits} + + +class BCTorchRLModuleWithSharedGlobalEncoder(TorchRLModule): + """An example of an RLModule that uses an encoder shared with other things. + + For example, we could consider a multi-agent case where for inference each agent + needs to know the global state of the environment, as well as the local state of + itself. For better representation learning we would like to share the encoder + across all the modules. So this module simply accepts the encoder object as its + input argument and uses it to encode the global state. The local state is passed + through as is. The policy head is then a simple MLP that takes the concatenation of + the global and local state as input and outputs the action logits. 
+ + """ + + def __init__( + self, + encoder: nn.Module, + local_dim: int, + hidden_dim: int, + action_dim: int, + config=None, + ) -> None: + super().__init__(config=config) + + self.encoder = encoder + self.policy_head = nn.Sequential( + nn.Linear(hidden_dim + local_dim, hidden_dim), + nn.ReLU(), + nn.Linear(hidden_dim, action_dim), + ) + + def get_train_action_dist_cls(self): + return TorchCategorical + + def get_exploration_action_dist_cls(self): + return TorchCategorical + + def get_inference_action_dist_cls(self): + return TorchCategorical + + @override(RLModule) + def _default_input_specs(self): + return [("obs", "global"), ("obs", "local")] + + @override(RLModule) + def _forward_inference(self, batch): + with torch.no_grad(): + return self._common_forward(batch) + + @override(RLModule) + def _forward_exploration(self, batch): + with torch.no_grad(): + return self._common_forward(batch) + + @override(RLModule) + def _forward_train(self, batch): + return self._common_forward(batch) + + def _common_forward(self, batch): + obs = batch["obs"] + global_enc = self.encoder(obs["global"]) + policy_in = torch.cat([global_enc, obs["local"]], dim=-1) + action_logits = self.policy_head(policy_in) + + return {Columns.ACTION_DIST_INPUTS: action_logits} + + +class BCTorchMultiAgentModuleWithSharedEncoder(MultiRLModule): + def setup(self): + module_specs = self.config.modules + module_spec = next(iter(module_specs.values())) + global_dim = module_spec.observation_space["global"].shape[0] + hidden_dim = module_spec.model_config_dict["fcnet_hiddens"][0] + shared_encoder = nn.Sequential( + nn.Linear(global_dim, hidden_dim), + nn.ReLU(), + nn.Linear(hidden_dim, hidden_dim), + ) + + rl_modules = {} + for module_id, module_spec in module_specs.items(): + rl_modules[module_id] = module_spec.module_class( + config=self.config.modules[module_id].get_rl_module_config(), + encoder=shared_encoder, + local_dim=module_spec.observation_space["local"].shape[0], + hidden_dim=hidden_dim, + 
action_dim=module_spec.action_space.n, + ) + + self._rl_modules = rl_modules + + def serialize(self): + # TODO (Kourosh): Implement when needed. + raise NotImplementedError + + def deserialize(self, data): + # TODO (Kourosh): Implement when needed. + raise NotImplementedError diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/env/utils/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/env/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..09dfbe227e5a6c29bdfa7096758d529ddcb72d55 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/env/utils/__init__.py @@ -0,0 +1,124 @@ +import logging +from typing import Type, Union + +import gymnasium as gym + +from ray.rllib.env.env_context import EnvContext +from ray.rllib.utils.error import ( + ERR_MSG_INVALID_ENV_DESCRIPTOR, + EnvError, +) +from ray.util.annotations import PublicAPI + + +logger = logging.getLogger(__name__) + + +@PublicAPI +def try_import_pyspiel(error: bool = False): + """Tries importing pyspiel and returns the module (or None). + + Args: + error: Whether to raise an error if pyspiel cannot be imported. + + Returns: + The pyspiel module. + + Raises: + ImportError: If error=True and pyspiel is not installed. + """ + try: + import pyspiel + + return pyspiel + except ImportError: + if error: + raise ImportError( + "Could not import pyspiel! Pygame is not a dependency of RLlib " + "and RLlib requires you to install pygame separately: " + "`pip install pygame`." + ) + return None + + +@PublicAPI +def try_import_open_spiel(error: bool = False): + """Tries importing open_spiel and returns the module (or None). + + Args: + error: Whether to raise an error if open_spiel cannot be imported. + + Returns: + The open_spiel module. + + Raises: + ImportError: If error=True and open_spiel is not installed. + """ + try: + import open_spiel + + return open_spiel + except ImportError: + if error: + raise ImportError( + "Could not import open_spiel! 
open_spiel is not a dependency of RLlib " + "and RLlib requires you to install open_spiel separately: " + "`pip install open_spiel`." + ) + return None + + +def _gym_env_creator( + env_context: EnvContext, + env_descriptor: Union[str, Type[gym.Env]], +) -> gym.Env: + """Tries to create a gym env given an EnvContext object and descriptor. + + Note: This function tries to construct the env from a string descriptor + only using possibly installed RL env packages (such as gym, pybullet_envs, + etc). These packages are no installation requirements for RLlib. In case + you would like to support more such env packages, add the necessary imports + and construction logic below. + + Args: + env_context: The env context object to configure the env. + Note that this is a config dict, plus the properties: + `worker_index`, `vector_index`, and `remote`. + env_descriptor: The env descriptor as a gym-registered string, e.g. CartPole-v1, + ALE/MsPacman-v5, or CartPoleContinuousBulletEnv-v0. + Alternatively, the gym.Env subclass to use. + + Returns: + The actual gym environment object. + + Raises: + gym.error.Error: If the env cannot be constructed. + """ + # Allow for PyBullet or envs to be used as well (via string). This allows + # for doing things like `env=CartPoleContinuousBulletEnv-v0`. + try: + import pybullet_envs + + pybullet_envs.getList() + except (AttributeError, ModuleNotFoundError, ImportError): + pass + + # If env descriptor is a str, starting with "ale_py:ALE/", for now, register all ALE + # envs from ale_py. + if isinstance(env_descriptor, str) and env_descriptor.startswith("ale_py:ALE/"): + import ale_py + + gym.register_envs(ale_py) + + # Try creating a gym env. If this fails we can output a + # decent error message. + try: + # If class provided, call constructor directly. 
+ if isinstance(env_descriptor, type): + env = env_descriptor(env_context) + else: + env = gym.make(env_descriptor, **env_context) + except gym.error.Error: + raise EnvError(ERR_MSG_INVALID_ENV_DESCRIPTOR.format(env_descriptor)) + + return env diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/env/utils/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/env/utils/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..eb91dc3f844bba4105a15ba277e1d3b148f32d2d Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/env/utils/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/env/utils/__pycache__/infinite_lookback_buffer.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/env/utils/__pycache__/infinite_lookback_buffer.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f821a2d8561a57e8ccc8f9805f81164a17ebbfeb Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/env/utils/__pycache__/infinite_lookback_buffer.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/env/utils/external_env_protocol.py b/.venv/lib/python3.11/site-packages/ray/rllib/env/utils/external_env_protocol.py new file mode 100644 index 0000000000000000000000000000000000000000..0234d273470fcce32fbc43c391331b4eee6185e6 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/env/utils/external_env_protocol.py @@ -0,0 +1,45 @@ +from enum import Enum + +from ray.util.annotations import PublicAPI + + +@PublicAPI(stability="alpha") +class RLlink(Enum): + # Requests: Client (external env) -> Server (RLlib). + # ---- + # Ping command (initial handshake). + PING = "PING" + # List of episodes (similar to what an EnvRunner.sample() call would return). + EPISODES = "EPISODES" + # Request state (e.g. model weights). 
+ GET_STATE = "GET_STATE" + # Request (relevant) config. + GET_CONFIG = "GET_CONFIG" + # Send episodes and request the next state update right after that. + # Clients sending this message should wait for a SET_STATE message as an immediate + # response. Useful for external samplers that must collect on-policy data. + EPISODES_AND_GET_STATE = "EPISODES_AND_GET_STATE" + + # Responses: Server (RLlib) -> Client (external env). + # ---- + # Pong response (initial handshake). + PONG = "PONG" + # Set state (e.g. model weights). + SET_STATE = "SET_STATE" + # Set (relevant) config. + SET_CONFIG = "SET_CONFIG" + + # @OldAPIStack (to be deprecated soon). + ACTION_SPACE = "ACTION_SPACE" + OBSERVATION_SPACE = "OBSERVATION_SPACE" + GET_WORKER_ARGS = "GET_WORKER_ARGS" + GET_WEIGHTS = "GET_WEIGHTS" + REPORT_SAMPLES = "REPORT_SAMPLES" + START_EPISODE = "START_EPISODE" + GET_ACTION = "GET_ACTION" + LOG_ACTION = "LOG_ACTION" + LOG_RETURNS = "LOG_RETURNS" + END_EPISODE = "END_EPISODE" + + def __str__(self): + return self.name diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/env/utils/infinite_lookback_buffer.py b/.venv/lib/python3.11/site-packages/ray/rllib/env/utils/infinite_lookback_buffer.py new file mode 100644 index 0000000000000000000000000000000000000000..26f76fbc31aef80b6a9d09ec17b2046b0b13fc2e --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/env/utils/infinite_lookback_buffer.py @@ -0,0 +1,719 @@ +from typing import Any, Dict, List, Optional, Union + +import gymnasium as gym +import numpy as np +import tree # pip install dm_tree + +from ray.rllib.utils.numpy import LARGE_INTEGER, one_hot, one_hot_multidiscrete +from ray.rllib.utils.serialization import gym_space_from_dict, gym_space_to_dict +from ray.rllib.utils.spaces.space_utils import ( + batch, + from_jsonable_if_needed, + get_dummy_batch_for_space, + get_base_struct_from_space, + to_jsonable_if_needed, +) + + +class InfiniteLookbackBuffer: + @property + def space(self): + return self._space + + 
@space.setter + def space(self, value): + self._space = value + self.space_struct = get_base_struct_from_space(value) + + def __init__( + self, + data: Optional[Union[List, np.ndarray]] = None, + lookback: int = 0, + space: Optional[gym.Space] = None, + ): + self.data = data if data is not None else [] + self.lookback = min(lookback, len(self.data)) + self.finalized = not isinstance(self.data, list) + self.space_struct = None + self.space = space + + def __eq__( + self, + other: "InfiniteLookbackBuffer", + ) -> bool: + """Compares two `InfiniteLookbackBuffers. + + Args: + other: Another object. If another `LookbackBuffer` instance all + their attributes are compared. + + Returns: + `True`, if `other` is an `InfiniteLookbackBuffer` instance and all + attributes are identical. Otherwise, returns `False`. + """ + if isinstance(other, InfiniteLookbackBuffer): + if ( + self.data == other.data + and self.lookback == other.lookback + and self.finalized == other.finalized + and self.space_struct == other.space_struct + and self.space == other.space + ): + return True + return False + + def get_state(self) -> Dict[str, Any]: + """Returns the pickable state of a buffer. + + The data in the buffer is stored into a dictionary. Note that + buffers can also be generated from pickable states (see + `InfiniteLookbackBuffer.from_state`) + + Returns: + A dict containing all the data and metadata from the buffer. + """ + return { + "data": to_jsonable_if_needed(self.data, self.space) + if self.space + else self.data, + "lookback": self.lookback, + "finalized": self.finalized, + "space": gym_space_to_dict(self.space) if self.space else self.space, + } + + @staticmethod + def from_state(state: Dict[str, Any]) -> None: + """Creates a new `InfiniteLookbackBuffer` from a state dict. + + Args: + state: The state dict, as returned by `self.get_state`. + + Returns: + A new `InfiniteLookbackBuffer` instance with the data and metadata + from the state dict. 
+ """ + buffer = InfiniteLookbackBuffer() + buffer.lookback = state["lookback"] + buffer.finalized = state["finalized"] + buffer.space = gym_space_from_dict(state["space"]) if state["space"] else None + buffer.space_struct = ( + get_base_struct_from_space(buffer.space) if buffer.space else None + ) + buffer.data = ( + from_jsonable_if_needed(state["data"], buffer.space) + if buffer.space + else state["data"] + ) + + return buffer + + def append(self, item) -> None: + """Appends the given item to the end of this buffer.""" + if self.finalized: + self.data = tree.map_structure( + lambda d, i: np.concatenate([d, [i]], axis=0), self.data, item + ) + else: + self.data.append(item) + + def extend(self, items) -> None: + """Appends all items in `items` to the end of this buffer.""" + if self.finalized: + # TODO (sven): When extending with a list of structs, we should + # probably rather do: `tree.map_structure(..., self.data, + # tree.map_structure(lambda *s: np.array(*s), *items)`)?? + self.data = tree.map_structure( + lambda d, i: np.concatenate([d, i], axis=0), + self.data, + # Note, we could have dictionaries here. + np.array(items) if isinstance(items, list) else items, + ) + else: + for item in items: + self.append(item) + + def concat(self, other: "InfiniteLookbackBuffer") -> None: + """Concatenates the data of `other` (w/o its lookback) to `self`. + + Args: + other: The other InfiniteLookbackBuffer to be concatenated to self. + """ + self.data.extend(other.get()) + + def pop(self, index: int = -1) -> None: + """Removes the item at `index` from this buffer, but does NOT return it. + + Args: + index: The index to pop out of this buffer (w/o returning it from this + method). + """ + if self.finalized: + self.data = tree.map_structure( + lambda s: np.delete(s, index, axis=0), self.data + ) + else: + self.data.pop(index) + + def finalize(self) -> None: + """Finalizes this buffer by converting internal data lists into numpy arrays. 
+ + Thereby, if the individual items in the list are nested structures, the + resulting buffer content will be a nested struct of np.ndarrays (leafs). + """ + if not self.finalized: + self.data = batch(self.data) + self.finalized = True + + def get( + self, + indices: Optional[Union[int, slice, List[int]]] = None, + *, + neg_index_as_lookback: bool = False, + fill: Optional[Any] = None, + one_hot_discrete: bool = False, + _ignore_last_ts: bool = False, + _add_last_ts_value: Optional[Any] = None, + ) -> Any: + """Returns data, based on the given args, from this buffer. + + Args: + indices: A single int is interpreted as an index, from which to return the + individual data stored at this index. + A list of ints is interpreted as a list of indices from which to gather + individual data in a batch of size len(indices). + A slice object is interpreted as a range of data to be returned. + Thereby, negative indices by default are interpreted as "before the end" + unless the `neg_index_as_lookback=True` option is used, in which case + negative indices are interpreted as "before ts=0", meaning going back + into the lookback buffer. + neg_index_as_lookback: If True, negative values in `indices` are + interpreted as "before ts=0", meaning going back into the lookback + buffer. For example, a buffer with data [4, 5, 6, 7, 8, 9], + where [4, 5, 6] is the lookback buffer range (ts=0 item is 7), will + respond to `get(-1, neg_index_as_lookback=True)` with `6` and to + `get(slice(-2, 1), neg_index_as_lookback=True)` with `[5, 6, 7]`. + fill: An optional float value to use for filling up the returned results at + the boundaries. This filling only happens if the requested index range's + start/stop boundaries exceed the buffer's boundaries (including the + lookback buffer on the left side). This comes in very handy, if users + don't want to worry about reaching such boundaries and want to zero-pad. 
+ For example, a buffer with data [10, 11, 12, 13, 14] and lookback + buffer size of 2 (meaning `10` and `11` are part of the lookback buffer) + will respond to `get(slice(-7, -2), fill=0.0)` + with `[0.0, 0.0, 10, 11, 12]`. + one_hot_discrete: If True, will return one-hot vectors (instead of + int-values) for those sub-components of a (possibly complex) space + that are Discrete or MultiDiscrete. Note that if `fill=0` and the + requested `indices` are out of the range of our data, the returned + one-hot vectors will actually be zero-hot (all slots zero). + _ignore_last_ts: Whether to ignore the last record in our internal + `self.data` when getting the provided indices. + _add_last_ts_value: Whether to add the value of this arg to the end of + the internal `self.data` buffer (just for the duration of this get + operation, not permanently). + """ + if indices is None: + data = self._get_all_data( + one_hot_discrete=one_hot_discrete, + _ignore_last_ts=_ignore_last_ts, + ) + elif isinstance(indices, slice): + data = self._get_slice( + indices, + fill=fill, + neg_index_as_lookback=neg_index_as_lookback, + one_hot_discrete=one_hot_discrete, + _ignore_last_ts=_ignore_last_ts, + _add_last_ts_value=_add_last_ts_value, + ) + elif isinstance(indices, list): + data = [ + self._get_int_index( + idx, + fill=fill, + neg_index_as_lookback=neg_index_as_lookback, + one_hot_discrete=one_hot_discrete, + _ignore_last_ts=_ignore_last_ts, + _add_last_ts_value=_add_last_ts_value, + ) + for idx in indices + ] + if self.finalized: + data = batch(data) + else: + assert isinstance(indices, int) + data = self._get_int_index( + indices, + fill=fill, + neg_index_as_lookback=neg_index_as_lookback, + one_hot_discrete=one_hot_discrete, + _ignore_last_ts=_ignore_last_ts, + _add_last_ts_value=_add_last_ts_value, + ) + + return data + + def __add__( + self, other: Union[List, "InfiniteLookbackBuffer", int, float, complex] + ) -> "InfiniteLookbackBuffer": + """Adds another InfiniteLookbackBuffer 
object or list to the end of this one. + + Args: + other: Another `InfiniteLookbackBuffer` or a `list` or a number. + If a `InfiniteLookbackBuffer` its data (w/o its lookback buffer) gets + concatenated to self's data. If a `list`, we concat it to self's data. + If a number, we add this number to each element of self (if possible). + + Returns: + A new `InfiniteLookbackBuffer` instance `self.data` containing + concatenated data from `self` and `other` (or adding `other` to each element + in self's data). + """ + + if self.finalized: + raise RuntimeError(f"Cannot `add` to a finalized {type(self).__name__}.") + else: + # If `other` is an int, simply add it to all our values (if possible) and + # use the result as the underlying data for the returned buffer. + if isinstance(other, (int, float, complex)): + data = [ + (d + other) if isinstance(d, (int, float, complex)) else d + for d in self.data + ] + # If `other` is a InfiniteLookbackBuffer itself, do NOT include its + # lookback buffer anymore. We assume that `other`'s lookback buffer i + # already at the end of `self`. + elif isinstance(other, InfiniteLookbackBuffer): + data = self.data + other.data[other.lookback :] + # `other` is a list, simply concat the two lists and use the result as + # the underlying data for the returned buffer. + else: + data = self.data + other + + return InfiniteLookbackBuffer( + data=data, + lookback=self.lookback, + space=self.space, + ) + + def __getitem__(self, item): + """Support squared bracket syntax, e.g. buffer[:5].""" + return self.get(item) + + def __setitem__(self, key, value): + self.set(new_data=value, at_indices=key) + + def set( + self, + new_data, + *, + at_indices: Optional[Union[int, slice, List[int]]] = None, + neg_index_as_lookback: bool = False, + ) -> None: + """Overwrites all or some of the data in this buffer with the provided data. + + Args: + new_data: The new data to overwrite existing records with. 
+ at_indices: A single int is interpreted as an index, at which to overwrite + the individual record stored at this index with `new_data`. + A list of ints is interpreted as a list of indices, which to overwrite + with `new_data`, which must be a batch of size `len(at_indices)`. + A slice object is interpreted as a range, which to overwrite with + `new_data`. Thereby, negative indices by default are interpreted as + "before the end" unless the `neg_index_as_lookback=True` option is + used, in which case negative indices are interpreted as + "before ts=0", meaning going back into the lookback buffer. + neg_index_as_lookback: If True, negative values in `at_indices` are + interpreted as "before ts=0", meaning going back into the lookback + buffer. For example, a buffer with data [4, 5, 6, 7, 8, 9], + where [4, 5, 6] is the lookback buffer range (ts=0 item is 7), will + handle a call `set(99, at_indices=-1, neg_index_as_lookback=True)` + with `6` being replaced by 99 and to `set([98, 99, 100], + at_indices=slice(-2, 1), neg_index_as_lookback=True)` with + `[5, 6, 7]` being replaced by `[98, 99, 100]`. + """ + # `at_indices` is None -> Override all our data (excluding the lookback buffer). + if at_indices is None: + self._set_all_data(new_data) + + elif isinstance(at_indices, slice): + self._set_slice( + new_data, + slice_=at_indices, + neg_index_as_lookback=neg_index_as_lookback, + ) + elif isinstance(at_indices, list): + for i, idx in enumerate(at_indices): + self._set_int_index( + new_data[i], + idx=idx, + neg_index_as_lookback=neg_index_as_lookback, + ) + else: + assert isinstance(at_indices, int) + self._set_int_index( + new_data, + idx=at_indices, + neg_index_as_lookback=neg_index_as_lookback, + ) + + def __len__(self): + """Return the length of our data, excluding the lookback buffer.""" + len_ = self.len_incl_lookback() + # Only count the data after the lookback. 
+ return max(len_ - self.lookback, 0) + + def len_incl_lookback(self): + if self.finalized: + return len(tree.flatten(self.data)[0]) + else: + return len(self.data) + + def __repr__(self): + return ( + f"{type(self).__name__}({self.data[:self.lookback]} <- " + f"lookback({self.lookback}) | {self.data[self.lookback:]})" + ) + + def _get_all_data(self, one_hot_discrete=False, _ignore_last_ts=False): + data = self[: (None if not _ignore_last_ts else -1)] + if one_hot_discrete: + data = self._one_hot(data, space_struct=self.space_struct) + return data + + def _set_all_data(self, new_data): + self._set_slice(new_data, slice(0, None)) + + def _get_slice( + self, + slice_, + fill=None, + neg_index_as_lookback=False, + one_hot_discrete=False, + _ignore_last_ts=False, + _add_last_ts_value=None, + ): + data_to_use = self.data + if _ignore_last_ts: + if self.finalized: + data_to_use = tree.map_structure(lambda s: s[:-1], self.data) + else: + data_to_use = self.data[:-1] + if _add_last_ts_value is not None: + if self.finalized: + data_to_use = tree.map_structure( + lambda s, t: np.append(s, t), + data_to_use.copy(), + _add_last_ts_value, + ) + else: + data_to_use = np.append(data_to_use.copy(), _add_last_ts_value) + + slice_, slice_len, fill_left_count, fill_right_count = self._interpret_slice( + slice_, + neg_index_as_lookback, + len_self_plus_lookback=( + self.len_incl_lookback() + + int(_add_last_ts_value is not None) + - int(_ignore_last_ts) + ), + ) + + # Perform the actual slice. + data_slice = None + if slice_len > 0: + if self.finalized: + data_slice = tree.map_structure(lambda s: s[slice_], data_to_use) + else: + data_slice = data_to_use[slice_] + + if one_hot_discrete: + data_slice = self._one_hot(data_slice, space_struct=self.space_struct) + + # Data is shorter than the range requested -> Fill the rest with `fill` data. 
+ if fill is not None and (fill_right_count > 0 or fill_left_count > 0): + if self.finalized: + if fill_left_count: + if self.space is None: + fill_batch = np.array([fill] * fill_left_count) + else: + fill_batch = get_dummy_batch_for_space( + self.space, + fill_value=fill, + batch_size=fill_left_count, + one_hot_discrete=one_hot_discrete, + ) + if data_slice is not None: + data_slice = tree.map_structure( + lambda s0, s: np.concatenate([s0, s]), + fill_batch, + data_slice, + ) + else: + data_slice = fill_batch + if fill_right_count: + if self.space is None: + fill_batch = np.array([fill] * fill_right_count) + else: + fill_batch = get_dummy_batch_for_space( + self.space, + fill_value=fill, + batch_size=fill_right_count, + one_hot_discrete=one_hot_discrete, + ) + if data_slice is not None: + data_slice = tree.map_structure( + lambda s0, s: np.concatenate([s, s0]), + fill_batch, + data_slice, + ) + else: + data_slice = fill_batch + + else: + if self.space is None: + fill_batch = [fill] + else: + fill_batch = [ + get_dummy_batch_for_space( + self.space, + fill_value=fill, + batch_size=0, + one_hot_discrete=one_hot_discrete, + ) + ] + data_slice = ( + fill_batch * fill_left_count + + (data_slice if data_slice is not None else []) + + fill_batch * fill_right_count + ) + + if data_slice is None: + if self.finalized: + return tree.map_structure(lambda s: s[slice_], data_to_use) + else: + return data_to_use[slice_] + return data_slice + + def _set_slice( + self, + new_data, + slice_, + neg_index_as_lookback=False, + ): + slice_, _, _, _ = self._interpret_slice(slice_, neg_index_as_lookback) + + # Check, whether the setting to new_data changes the length of self + # (it shouldn't). If it does, raise an error. 
+ try: + if self.finalized: + + def __set(s, n): + if self.space: + assert self.space.contains(n[0]) + assert len(s[slice_]) == len(n) + s[slice_] = n + + tree.map_structure(__set, self.data, new_data) + else: + assert len(self.data[slice_]) == len(new_data) + self.data[slice_] = new_data + except AssertionError: + raise IndexError( + f"Cannot `set()` value via at_indices={slice_} (option " + f"neg_index_as_lookback={neg_index_as_lookback})! Slice of data " + "does NOT have the same size as `new_data`." + ) + + def _get_int_index( + self, + idx: int, + fill=None, + neg_index_as_lookback=False, + one_hot_discrete=False, + _ignore_last_ts=False, + _add_last_ts_value=None, + ): + data_to_use = self.data + if _ignore_last_ts: + if self.finalized: + data_to_use = tree.map_structure(lambda s: s[:-1], self.data) + else: + data_to_use = self.data[:-1] + if _add_last_ts_value is not None: + if self.finalized: + data_to_use = tree.map_structure( + lambda s, last: np.append(s, last), data_to_use, _add_last_ts_value + ) + else: + data_to_use = data_to_use.copy() + data_to_use.append(_add_last_ts_value) + + # If index >= 0 -> Ignore lookback buffer. + # Otherwise, include lookback buffer. + if idx >= 0 or neg_index_as_lookback: + idx = self.lookback + idx + # Negative indices mean: Go to left into lookback buffer starting from idx=0. + # But if we pass the lookback buffer, the index should be invalid and we will + # have to fill, if required. Invalidate the index by setting it to one larger + # than max. + if neg_index_as_lookback and idx < 0: + idx = len(self) + self.lookback - (_ignore_last_ts is True) + + try: + if self.finalized: + data = tree.map_structure(lambda s: s[idx], data_to_use) + else: + data = data_to_use[idx] + # Out of range index -> If `fill`, use a fill dummy (B=0), if not, error out. 
+ except IndexError as e: + if fill is not None: + if self.space is None: + return fill + return get_dummy_batch_for_space( + self.space, + fill_value=fill, + batch_size=0, + one_hot_discrete=one_hot_discrete, + ) + else: + raise e + + # Convert discrete/multi-discrete components to one-hot vectors, if required. + if one_hot_discrete: + data = self._one_hot(data, self.space_struct) + return data + + def _set_int_index(self, new_data, idx, neg_index_as_lookback): + actual_idx = idx + # If index >= 0 -> Ignore lookback buffer. + # Otherwise, include lookback buffer. + if actual_idx >= 0 or neg_index_as_lookback: + actual_idx = self.lookback + actual_idx + # Negative indices mean: Go to left into lookback buffer starting from idx=0. + # But if we pass the lookback buffer, the index should be invalid and we will + # have to fill, if required. Invalidate the index by setting it to one larger + # than max. + if neg_index_as_lookback and actual_idx < 0: + actual_idx = len(self) + self.lookback + + try: + if self.finalized: + + def __set(s, n): + if self.space: + assert self.space.contains(n), n + s[actual_idx] = n + + tree.map_structure(__set, self.data, new_data) + else: + self.data[actual_idx] = new_data + except IndexError: + raise IndexError( + f"Cannot `set()` value at index {idx} (option " + f"neg_index_as_lookback={neg_index_as_lookback})! Out of range " + f"of buffer data." + ) + + def _interpret_slice( + self, + slice_, + neg_index_as_lookback, + len_self_plus_lookback=None, + ): + if len_self_plus_lookback is None: + len_self_plus_lookback = len(self) + self.lookback + + # Re-interpret slice bounds as absolute positions (>=0) within our + # internal data. + start = slice_.start + stop = slice_.stop + + # Start is None -> Exclude lookback buffer. + if start is None: + start = self.lookback + # Start is negative. + elif start < 0: + # `neg_index_as_lookback=True` -> User wants to index into the lookback + # range. 
+ if neg_index_as_lookback: + start = self.lookback + start + # Interpret index as counting "from end". + else: + start = len_self_plus_lookback + start + # Start is 0 or positive -> timestep right after lookback is interpreted as 0. + else: + start = self.lookback + start + + # Stop is None -> Set stop to very last index + 1 of our internal data. + if stop is None: + stop = len_self_plus_lookback + # Stop is negative. + elif stop < 0: + # `neg_index_as_lookback=True` -> User wants to index into the lookback + # range. Set to 0 (beginning of lookback buffer) if result is a negative + # index. + if neg_index_as_lookback: + stop = self.lookback + stop + # Interpret index as counting "from end". Set to 0 (beginning of actual + # episode) if result is a negative index. + else: + stop = len_self_plus_lookback + stop + # Stop is positive -> Add lookback range to it. + else: + stop = self.lookback + stop + + fill_left_count = fill_right_count = 0 + # Both start and stop are on left side. + if start < 0 and stop < 0: + fill_left_count = abs(start - stop) + fill_right_count = 0 + start = stop = 0 + # Both start and stop are on right side. + elif start >= len_self_plus_lookback and stop >= len_self_plus_lookback: + fill_right_count = abs(start - stop) + fill_left_count = 0 + start = stop = len_self_plus_lookback + # Set to 0 (beginning of actual episode) if result is a negative index. + elif start < 0: + fill_left_count = -start + start = 0 + elif stop >= len_self_plus_lookback: + fill_right_count = stop - len_self_plus_lookback + stop = len_self_plus_lookback + # Only `stop` might be < 0, when slice has negative step and start is > 0. 
+ elif stop < 0: + if start >= len_self_plus_lookback: + fill_left_count = start - len_self_plus_lookback + 1 + start = len_self_plus_lookback - 1 + fill_right_count = -stop - 1 + stop = -LARGE_INTEGER + + assert start >= 0 and (stop >= 0 or stop == -LARGE_INTEGER), (start, stop) + + step = slice_.step if slice_.step is not None else 1 + slice_ = slice(start, stop, step) + slice_len = max(0, (stop - start + (step - (1 if step > 0 else -1))) // step) + return slice_, slice_len, fill_left_count, fill_right_count + + def _one_hot(self, data, space_struct): + if space_struct is None: + raise ValueError( + f"Cannot `one_hot` data in `{type(self).__name__}` if a " + "gym.Space was NOT provided during construction!" + ) + + def _convert(dat_, space): + if isinstance(space, gym.spaces.Discrete): + return one_hot(dat_, depth=space.n) + elif isinstance(space, gym.spaces.MultiDiscrete): + return one_hot_multidiscrete(dat_, depths=space.nvec) + return dat_ + + if isinstance(data, list): + data = [ + tree.map_structure(_convert, dslice, space_struct) for dslice in data + ] + else: + data = tree.map_structure(_convert, data, space_struct) + return data diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/env/wrappers/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/env/wrappers/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..51e7e5d56bbb060212f659660df667969fed3f0a Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/env/wrappers/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/env/wrappers/__pycache__/dm_control_wrapper.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/env/wrappers/__pycache__/dm_control_wrapper.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..92d535bf0e3121899c9e955e4296ca2b8fed9ee6 Binary files /dev/null and 
b/.venv/lib/python3.11/site-packages/ray/rllib/env/wrappers/__pycache__/dm_control_wrapper.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/env/wrappers/__pycache__/dm_env_wrapper.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/env/wrappers/__pycache__/dm_env_wrapper.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..668795697bb8d792544203f305738bbb926b4931 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/env/wrappers/__pycache__/dm_env_wrapper.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/env/wrappers/__pycache__/pettingzoo_env.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/env/wrappers/__pycache__/pettingzoo_env.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ec5164b2df113a1cc2d9ef41ee699848de9fba5a Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/env/wrappers/__pycache__/pettingzoo_env.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/__pycache__/neural_computer.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/__pycache__/neural_computer.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..90e3fc94d1ecbee2265481bd285f8cefccdce9ab Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/__pycache__/neural_computer.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/policy/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/policy/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/policy/__pycache__/__init__.cpython-311.pyc 
b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/policy/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..994bdc3cc44b6ef9fd63d35700d6a25561eb91ea Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/policy/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/policy/__pycache__/random_policy.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/policy/__pycache__/random_policy.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..610bce5c1ba51f43d7361fa7407e91b445ce4d7e Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/policy/__pycache__/random_policy.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/policy/cliff_walking_wall_policy.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/policy/cliff_walking_wall_policy.py new file mode 100644 index 0000000000000000000000000000000000000000..c9a4758f81ea427f577c6e8004e4ffeb8f3bef06 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/policy/cliff_walking_wall_policy.py @@ -0,0 +1,100 @@ +# @OldAPIStack +import gymnasium as gym +from typing import Dict, Union, List, Tuple, Optional +import numpy as np + +from ray.rllib.policy.policy import Policy, ViewRequirement +from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.models.torch.torch_action_dist import TorchCategorical +from ray.rllib.utils.typing import AlgorithmConfigDict, TensorStructType, TensorType +from ray.rllib.utils.annotations import override +from ray.rllib.utils.debug import update_global_seed_if_necessary + + +class CliffWalkingWallPolicy(Policy): + """Optimal RLlib policy for the CliffWalkingWallEnv environment, defined in + 
ray/rllib/examples/env/cliff_walking_wall_env.py, with epsilon-greedy exploration. + + The policy takes a random action with probability epsilon, specified + by `config["epsilon"]`, and the optimal action with probability 1 - epsilon. + """ + + @override(Policy) + def __init__( + self, + observation_space: gym.Space, + action_space: gym.Space, + config: AlgorithmConfigDict, + ): + update_global_seed_if_necessary(seed=config.get("seed")) + super().__init__(observation_space, action_space, config) + + # Known optimal action dist for each of the 48 states and 4 actions + self.action_dist = np.zeros((48, 4), dtype=float) + # Starting state: go up + self.action_dist[36] = (1, 0, 0, 0) + # Cliff + Goal: never actually used, set to random + self.action_dist[37:] = (0.25, 0.25, 0.25, 0.25) + # Row 2; always go right + self.action_dist[24:36] = (0, 1, 0, 0) + # Row 0 and Row 1; go down or go right + self.action_dist[0:24] = (0, 0.5, 0.5, 0) + # Col 11; always go down, supercedes previous values + self.action_dist[[11, 23, 35]] = (0, 0, 1, 0) + assert np.allclose(self.action_dist.sum(-1), 1) + + # Epsilon-Greedy action selection + epsilon = config.get("epsilon", 0.0) + self.action_dist = self.action_dist * (1 - epsilon) + epsilon / 4 + assert np.allclose(self.action_dist.sum(-1), 1) + + # Attributes required for RLlib; note that while CliffWalkingWallPolicy + # inherits from Policy, it actually implements TorchPolicyV2. 
+ self.view_requirements[SampleBatch.ACTION_PROB] = ViewRequirement() + self.device = "cpu" + self.model = None + self.dist_class = TorchCategorical + + @override(Policy) + def compute_actions( + self, + obs_batch: Union[List[TensorStructType], TensorStructType], + state_batches: Optional[List[TensorType]] = None, + **kwargs, + ) -> Tuple[TensorType, List[TensorType], Dict[str, TensorType]]: + obs = np.array(obs_batch, dtype=int) + action_probs = self.action_dist[obs] + actions = np.zeros(len(obs), dtype=int) + for i in range(len(obs)): + actions[i] = np.random.choice(4, p=action_probs[i]) + return ( + actions, + [], + {SampleBatch.ACTION_PROB: action_probs[np.arange(len(obs)), actions]}, + ) + + @override(Policy) + def compute_log_likelihoods( + self, + actions: Union[List[TensorType], TensorType], + obs_batch: Union[List[TensorType], TensorType], + **kwargs, + ) -> TensorType: + obs = np.array(obs_batch, dtype=int) + actions = np.array(actions, dtype=int) + # Compute action probs for all possible actions + action_probs = self.action_dist[obs] + # Take the action_probs corresponding to the specified actions + action_probs = action_probs[np.arange(len(obs)), actions] + # Ignore RuntimeWarning thrown by np.log(0) if action_probs is 0 + with np.errstate(divide="ignore"): + return np.log(action_probs) + + def action_distribution_fn( + self, model, obs_batch: TensorStructType, **kwargs + ) -> Tuple[TensorType, type, List[TensorType]]: + obs = np.array(obs_batch[SampleBatch.OBS], dtype=int) + action_probs = self.action_dist[obs] + # Ignore RuntimeWarning thrown by np.log(0) if action_probs is 0 + with np.errstate(divide="ignore"): + return np.log(action_probs), TorchCategorical, None diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/policy/random_policy.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/policy/random_policy.py new file mode 100644 index 
0000000000000000000000000000000000000000..c410ba0ec464e73fe380741180e71b56c9967a5b --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/policy/random_policy.py @@ -0,0 +1,101 @@ +# @OldAPIStack +from gymnasium.spaces import Box +import numpy as np +import random +import tree # pip install dm_tree +from typing import ( + List, + Optional, + Union, +) + +from ray.rllib.policy.policy import Policy +from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.utils.annotations import override +from ray.rllib.utils.typing import ModelWeights, TensorStructType, TensorType + + +class RandomPolicy(Policy): + """Hand-coded policy that returns random actions.""" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # Whether for compute_actions, the bounds given in action_space + # should be ignored (default: False). This is to test action-clipping + # and any Env's reaction to bounds breaches. + if self.config.get("ignore_action_bounds", False) and isinstance( + self.action_space, Box + ): + self.action_space_for_sampling = Box( + -float("inf"), + float("inf"), + shape=self.action_space.shape, + dtype=self.action_space.dtype, + ) + else: + self.action_space_for_sampling = self.action_space + + @override(Policy) + def init_view_requirements(self): + super().init_view_requirements() + # Disable for_training and action attributes for SampleBatch.INFOS column + # since it can not be properly batched. + vr = self.view_requirements[SampleBatch.INFOS] + vr.used_for_training = False + vr.used_for_compute_actions = False + + @override(Policy) + def compute_actions( + self, + obs_batch: Union[List[TensorStructType], TensorStructType], + state_batches: Optional[List[TensorType]] = None, + prev_action_batch: Union[List[TensorStructType], TensorStructType] = None, + prev_reward_batch: Union[List[TensorStructType], TensorStructType] = None, + **kwargs, + ): + # Alternatively, a numpy array would work here as well. 
+ # e.g.: np.array([random.choice([0, 1])] * len(obs_batch)) + obs_batch_size = len(tree.flatten(obs_batch)[0]) + return ( + [self.action_space_for_sampling.sample() for _ in range(obs_batch_size)], + [], + {}, + ) + + @override(Policy) + def learn_on_batch(self, samples): + """No learning.""" + return {} + + @override(Policy) + def compute_log_likelihoods( + self, + actions, + obs_batch, + state_batches=None, + prev_action_batch=None, + prev_reward_batch=None, + **kwargs, + ): + return np.array([random.random()] * len(obs_batch)) + + @override(Policy) + def get_weights(self) -> ModelWeights: + """No weights to save.""" + return {} + + @override(Policy) + def set_weights(self, weights: ModelWeights) -> None: + """No weights to set.""" + pass + + @override(Policy) + def _get_dummy_batch_from_view_requirements(self, batch_size: int = 1): + return SampleBatch( + { + SampleBatch.OBS: tree.map_structure( + lambda s: s[None], self.observation_space.sample() + ), + } + ) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/cartpole_recording.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/cartpole_recording.py new file mode 100644 index 0000000000000000000000000000000000000000..42258ac46fe0b9236bf3008619ba83a5eee3a2ab --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/cartpole_recording.py @@ -0,0 +1,163 @@ +"""Example showing how to record expert data from a trained policy. + +This example: + - demonstrates how you can train a single-agent expert PPO Policy (RLModule) + and checkpoint it. + - shows how you can then record expert data from the trained PPO Policy to + disk during evaluation. 
+ +How to run this script +---------------------- +`python [script file name].py --checkpoint-at-end` + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + +Results to expect +----------------- +In the console output you can see that the episode return of 350.0 is reached +before the timestep stop criteria is touched. Afterwards evaluation starts and +runs 10 iterations while recording the data. The number of recorded experiences +might differ from evaluation run to evaluation run because evaluation +`EnvRunner`s sample episodes while recording timesteps and episodes contain +usually different numbers of timesteps. Note, this is different when recording +episodes - in this case each row is one episode. + ++-----------------------------+------------+----------------------+ +| Trial name | status | loc | +| | | | +|-----------------------------+------------+----------------------+ +| PPO_CartPole-v1_df83f_00000 | TERMINATED | 192.168.0.119:233661 | ++-----------------------------+------------+----------------------+ ++--------+------------------+------------------------+------------------------+ +| iter | total time (s) | num_training_step_ca | num_env_steps_sample | +| | | lls_per_iteration | d_lifetime | ++--------+------------------+------------------------+------------------------| +| 21 | 25.9162 | 1 | 84000 | ++--------+------------------+------------------------+------------------------+ + +... 
+ +Number of experiences recorded: 26644 +""" + +import ray + +from ray.rllib.algorithms.ppo import PPOConfig +from ray.rllib.core import COMPONENT_RL_MODULE +from ray.rllib.core.columns import Columns +from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig +from ray.rllib.utils.metrics import ( + ENV_RUNNER_RESULTS, + EPISODE_RETURN_MEAN, + EVALUATION_RESULTS, + NUM_ENV_STEPS_SAMPLED_LIFETIME, +) +from ray.rllib.utils.test_utils import add_rllib_example_script_args + +parser = add_rllib_example_script_args( + default_timesteps=200000, + default_reward=350.0, +) +parser.set_defaults(checkpoint_at_end=True, max_concurrent_trials=1) +# Use `parser` to add your own custom command line options to this script +# and (if needed) use their values to set up `config` below. +args = parser.parse_args() + +config = ( + PPOConfig() + .env_runners( + num_env_runners=5, + ) + .environment("CartPole-v1") + .rl_module( + model_config=DefaultModelConfig( + fcnet_hiddens=[32], + fcnet_activation="linear", + vf_share_layers=True, + ), + ) + .training( + lr=0.0003, + num_epochs=6, + vf_loss_coeff=0.01, + ) + .evaluation( + evaluation_num_env_runners=1, + evaluation_interval=1, + evaluation_parallel_to_training=True, + evaluation_config=PPOConfig.overrides(explore=False), + ) +) + +stop = { + f"{NUM_ENV_STEPS_SAMPLED_LIFETIME}": args.stop_timesteps, + f"{EVALUATION_RESULTS}/{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": ( + args.stop_reward + ), +} + + +if __name__ == "__main__": + from ray.rllib.utils.test_utils import run_rllib_example_script_experiment + + results = run_rllib_example_script_experiment(config, args, stop=stop) + + # Store the best checkpoint for recording. + best_checkpoint = results.get_best_result( + metric=f"{EVALUATION_RESULTS}/{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}", + mode="max", + ).checkpoint.path + + # Configure the algorithm for offline recording. 
+ config.offline_data( + output="local:///tmp/cartpole/", + # Store columnar (tabular) data. + output_write_episodes=False, + # Each file should hold 1,000 rows. + output_max_rows_per_file=1000, + output_write_remaining_data=True, + # LZ4-compress columns 'obs', 'new_obs', and 'actions' to + # save disk space and increase performance. Note, this means + # that you have to use `input_compress_columns` in the same + # way when using the data for training in `RLlib`. + output_compress_columns=[Columns.OBS, Columns.ACTIONS], + ) + # Change the evaluation settings to sample exactly 50 episodes + # per evaluation iteration and increase the number of evaluation + # env-runners to 5. + config.evaluation( + evaluation_num_env_runners=5, + evaluation_duration=50, + evaluation_duration_unit="episodes", + evaluation_interval=1, + evaluation_parallel_to_training=False, + evaluation_config=PPOConfig.overrides(explore=False), + ) + + # Build the algorithm for evaluation. + algo = config.build() + # Load the checkpoint stored above. + algo.restore_from_path( + best_checkpoint, + component=COMPONENT_RL_MODULE, + ) + + # Evaluate over 10 iterations and record the data. + for i in range(10): + print(f"Iteration: {i + 1}:\n") + res = algo.evaluate() + print(res) + + # Stop the algorithm. + algo.stop() + + # Check the number of rows in the dataset. + ds = ray.data.read_parquet("local:///tmp/cartpole") + print(f"Number of experiences recorded: {ds.count()}") diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/offline_rl_with_image_data.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/offline_rl_with_image_data.py new file mode 100644 index 0000000000000000000000000000000000000000..1a88aeeb323869353cc811f40e161f516aa49d44 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/offline_rl_with_image_data.py @@ -0,0 +1,112 @@ +"""Example showing how to customize an offline data pipeline. 
+ +This example: + - demonstrates how you can customized your offline data pipeline. + - shows how you can override the `OfflineData` to read raw image + data and transform it into `numpy ` arrays. + - explains how you can override the `OfflinePreLearner` to + transform data further into `SingleAgentEpisode` instances that + can be processes by the learner connector pipeline. + +How to run this script +---------------------- +`python [script file name].py --checkpoint-at-end` + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + +Results to expect +----------------- +2024-12-03 19:59:23,043 INFO streaming_executor.py:109 -- Execution plan +of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[ReadBinary] -> +TaskPoolMapOperator[Map(map_to_numpy)] -> LimitOperator[limit=128] +✔️ Dataset execution finished in 10.01 seconds: 100%|███████████████████ +███████████████████████████████████████████████████████████████████████| +3.00/3.00 [00:10<00:00, 3.34s/ row] +- ReadBinary->SplitBlocks(11): Tasks: 0; Queued blocks: 0; Resources: 0.0 +CPU, 0.0B object store: 100%|█████████████████████████████████████████| +3.00/3.00 [00:10<00:00, 3.34s/ row] +- Map(map_to_numpy): Tasks: 0; Queued blocks: 0; Resources: 0.0 CPU, +0.0B object store: 100%|███████████████████████████████████████████████████| +3.00/3.00 [00:10<00:00, 3.34s/ row] +- limit=128: Tasks: 0; Queued blocks: 0; Resources: 0.0 CPU, 3.0KB object +store: 100%|██████████████████████████████████████████████████████████| +3.00/3.00 [00:10<00:00, 3.34s/ row] +Batch: {'batch': [MultiAgentBatch({}, env_steps=3)]} +""" + +import gymnasium as gym +import 
numpy as np + +from ray.rllib.algorithms.bc import BCConfig +from ray.rllib.algorithms.bc.bc_catalog import BCCatalog +from ray.rllib.algorithms.bc.torch.bc_torch_rl_module import BCTorchRLModule +from ray.rllib.core.rl_module.rl_module import RLModuleSpec, DefaultModelConfig +from ray.rllib.core.rl_module.multi_rl_module import MultiRLModuleSpec +from ray.rllib.examples.offline_rl.classes.image_offline_data import ImageOfflineData +from ray.rllib.examples.offline_rl.classes.image_offline_prelearner import ( + ImageOfflinePreLearner, +) + +# Create an Algorithm configuration. +# TODO: Make this an actually running/learning example with RLunplugged +# data from S3 and add this to the CI. +config = ( + BCConfig() + .environment( + action_space=gym.spaces.Discrete(2), + observation_space=gym.spaces.Box(0, 255, (32, 32, 3), np.float32), + ) + .offline_data( + input_=["s3://anonymous@ray-example-data/batoidea/JPEGImages/"], + prelearner_class=ImageOfflinePreLearner, + ) +) + +# Specify an `RLModule` and wrap it with a `MultiRLModuleSpec`. Note, +# on `Learner`` side any `RLModule` is an `MultiRLModule`. +module_spec = MultiRLModuleSpec( + rl_module_specs={ + "default_policy": RLModuleSpec( + model_config=DefaultModelConfig( + conv_filters=[[16, 4, 2], [32, 4, 2], [64, 4, 2], [128, 4, 2]], + conv_activation="relu", + ), + inference_only=False, + module_class=BCTorchRLModule, + catalog_class=BCCatalog, + action_space=gym.spaces.Discrete(2), + observation_space=gym.spaces.Box(0, 255, (32, 32, 3), np.float32), + ), + }, +) + +# Construct your `OfflineData` class instance. +offline_data = ImageOfflineData(config) + +# Check, how the data is transformed. Note, the +# example dataset has only 3 such images. +batch = offline_data.data.take_batch(3) + +# Construct your `OfflinePreLearner`. 
+offline_prelearner = ImageOfflinePreLearner( + config=config, + learner=None, + spaces=( + config.observation_space, + config.action_space, + ), + module_spec=module_spec, +) + +# Transform the raw data to `MultiAgentBatch` data. +batch = offline_prelearner(batch) + +# Show the transformed batch. +print(f"Batch: {batch}") diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/pretrain_bc_single_agent_evaluate_as_multi_agent.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/pretrain_bc_single_agent_evaluate_as_multi_agent.py new file mode 100644 index 0000000000000000000000000000000000000000..d965add3355279dee9ad8301560dc091ed5c65e4 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/pretrain_bc_single_agent_evaluate_as_multi_agent.py @@ -0,0 +1,176 @@ +# @HybridAPIStack + +"""Example showing how to train a (SA) BC RLModule while evaluating in a MA setup. + +Here, SA=single-agent and MA=multi-agent. + +Note that the BC Algorithm - by default - runs on the hybrid API stack, using RLModules, +but not `ConnectorV2` and `SingleAgentEpisode` yet. + +This example: + - demonstrates how you can train a single-agent BC Policy (RLModule) from a JSON + file, which contains SampleBatch (expert or non-expert) data. + - shows how you can run evaluation in a multi-agent setup (for example vs one + or more heuristic policies), while training the BC Policy. + + +How to run this script +---------------------- +`python [script file name].py --checkpoint-at-end` + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. 
+ +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- +In the console output, you can see that the episode returns of the "main" policy on +the evaluation track keep increasing as BC manages to more and more clone the behavior +found in our (expert) JSON file. + +After 50-100 iterations, you should see the episode reward reach 450.0. +Note that the opponent (random) policy does not learn as it's a) not a trainable +RLModule and b) not being trained via the BCConfig. It's only used for evaluation +purposes here. + ++---------------------+------------+-----------------+--------+--------+ +| Trial name | status | loc | iter | ts | +|---------------------+------------+-----------------+--------+--------+ +| BC_None_ee65e_00000 | TERMINATED | 127.0.0.1:35031 | 93 | 203754 | ++---------------------+------------+-----------------+--------+--------+ ++----------------------+------------------------+ +| eps. return (main) | eps. 
return (random) | +|----------------------+------------------------| +| 452.4 | 28.3 | ++----------------------+------------------------+ +""" +import os +from pathlib import Path + +import gymnasium as gym + +from ray import tune +from ray.air.constants import TRAINING_ITERATION +from ray.rllib.algorithms.bc import BCConfig +from ray.rllib.examples.envs.classes.multi_agent import MultiAgentCartPole +from ray.rllib.examples._old_api_stack.policy.random_policy import RandomPolicy +from ray.rllib.policy.policy import PolicySpec +from ray.rllib.utils.metrics import ( + ENV_RUNNER_RESULTS, + EVALUATION_RESULTS, + NUM_ENV_STEPS_TRAINED, +) +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.train.constants import TIME_TOTAL_S +from ray.tune.registry import register_env + +parser = add_rllib_example_script_args( + default_reward=450.0, + default_timesteps=300000, +) +parser.set_defaults(num_agents=2) + + +if __name__ == "__main__": + args = parser.parse_args() + + register_env("multi_cart", lambda cfg: MultiAgentCartPole(cfg)) + dummy_env = gym.make("CartPole-v1") + + rllib_dir = Path(__file__).parent.parent.parent + print(f"rllib dir={rllib_dir}") + offline_file = os.path.join(rllib_dir, "tests/data/cartpole/large.json") + + base_config = ( + BCConfig() + # For offline RL, we do not specify an env here (b/c we don't want any env + # instances created on the EnvRunners). Instead, we'll provide observation- + # and action-spaces here for the RLModule to know its input- and output types. + .environment( + observation_space=dummy_env.observation_space, + action_space=dummy_env.action_space, + ) + .offline_data( + input_=offline_file, + # The number of iterations to be run per learner when in multi-learner + # mode in a single RLlib training iteration. Leave this to `None` to + # run an entire epoch on the dataset during a single RLlib training + # iteration. 
For single-learner mode, 1 is the only option. + dataset_num_iters_per_learner=1 if not args.num_learners else None, + ) + .multi_agent( + policies={"main"}, + policy_mapping_fn=lambda *a, **kw: "main", + ) + .evaluation( + evaluation_interval=1, + evaluation_num_env_runners=0, + evaluation_config=BCConfig.overrides( + # Evaluate on an actual env -> switch input back to "sampler". + input_="sampler", + # Do not explore during evaluation, but act greedily. + explore=False, + # Use a multi-agent setup for evaluation. + env="multi_cart", + env_config={"num_agents": args.num_agents}, + policies={ + "main": PolicySpec(), + "random": PolicySpec(policy_class=RandomPolicy), + }, + # Only control agent 0 with the main (trained) policy. + policy_mapping_fn=( + lambda aid, *a, **kw: "main" if aid == 0 else "random" + ), + # Note that we do NOT have to specify the `policies_to_train` here, + # b/c we are inside the evaluation config (no policy is trained during + # evaluation). The fact that the BCConfig above is "only" setup + # as single-agent makes it automatically only train the policy found in + # the BCConfig's `policies` field (which is "main"). + # policies_to_train=["main"], + ), + ) + ) + + policy_eval_returns = ( + f"{EVALUATION_RESULTS}/{ENV_RUNNER_RESULTS}/policy_reward_mean/" + ) + + stop = { + # Check for the "main" policy's episode return, not the combined one. + # The combined one is the sum of the "main" policy + the "random" one. + policy_eval_returns + "main": args.stop_reward, + NUM_ENV_STEPS_TRAINED: args.stop_timesteps, + TRAINING_ITERATION: args.stop_iters, + } + + run_rllib_example_script_experiment( + base_config, + args, + stop=stop, + success_metric={policy_eval_returns + "main": args.stop_reward}, + # We use a special progress reporter here to show the evaluation results (of the + # "main" policy). 
+ # In the following dict, the keys are the (possibly nested) keys that can be + # found in RLlib's (BC's) result dict, produced at every training iteration, and + # the values are the column names you would like to see in your console reports. + # Note that for nested result dict keys, you need to use slashes "/" to define + # the exact path. + progress_reporter=tune.CLIReporter( + metric_columns={ + TRAINING_ITERATION: "iter", + TIME_TOTAL_S: "total time (s)", + NUM_ENV_STEPS_TRAINED: "ts", + policy_eval_returns + "main": "eps. return (main)", + policy_eval_returns + "random": "eps. return (random)", + } + ), + ) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/saving_experiences.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/saving_experiences.py new file mode 100644 index 0000000000000000000000000000000000000000..27c76c264da98c74ff9ebd26408de8fc85dfedc8 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/saving_experiences.py @@ -0,0 +1,62 @@ +# @OldAPIStack + +"""Simple example of writing experiences to a file using JsonWriter.""" + +# __sphinx_doc_begin__ +import gymnasium as gym +import numpy as np +import os + +import ray._private.utils + +from ray.rllib.models.preprocessors import get_preprocessor +from ray.rllib.evaluation.sample_batch_builder import SampleBatchBuilder +from ray.rllib.offline.json_writer import JsonWriter + +if __name__ == "__main__": + batch_builder = SampleBatchBuilder() # or MultiAgentSampleBatchBuilder + writer = JsonWriter( + os.path.join(ray._private.utils.get_user_temp_dir(), "demo-out") + ) + + # You normally wouldn't want to manually create sample batches if a + # simulator is available, but let's do it anyways for example purposes: + env = gym.make("CartPole-v1") + + # RLlib uses preprocessors to implement transforms such as one-hot encoding + # and flattening of tuple and dict observations. 
For CartPole a no-op + # preprocessor is used, but this may be relevant for more complex envs. + prep = get_preprocessor(env.observation_space)(env.observation_space) + print("The preprocessor is", prep) + + for eps_id in range(100): + obs, info = env.reset() + prev_action = np.zeros_like(env.action_space.sample()) + prev_reward = 0 + terminated = truncated = False + t = 0 + while not terminated and not truncated: + action = env.action_space.sample() + new_obs, rew, terminated, truncated, info = env.step(action) + batch_builder.add_values( + t=t, + eps_id=eps_id, + agent_index=0, + obs=prep.transform(obs), + actions=action, + action_prob=1.0, # put the true action probability here + action_logp=0.0, + rewards=rew, + prev_actions=prev_action, + prev_rewards=prev_reward, + terminateds=terminated, + truncateds=truncated, + infos=info, + new_obs=prep.transform(new_obs), + ) + obs = new_obs + prev_action = action + prev_reward = rew + t += 1 + writer.write(batch_builder.build_and_reset()) +# __sphinx_doc_end__