diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/__pycache__/connector.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/__pycache__/connector.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..54bd4887b2e288b3cf93d6f4d8a05c34b02dd31b Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/__pycache__/connector.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/__pycache__/connector_v2.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/__pycache__/connector_v2.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dd72695dd31679152056b03ff0cecb320f6e5273 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/__pycache__/connector_v2.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/__pycache__/registry.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/__pycache__/registry.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..da7e068c6cd01cead1823ee5dffdc47e4b032451 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/__pycache__/registry.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/__pycache__/util.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/__pycache__/util.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f99ccf5452bc0f8bbb313153aab2cc043ca1730c Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/__pycache__/util.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/agent/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/agent/__init__.py new file mode 100644 index 
0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/agent/__pycache__/lambdas.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/agent/__pycache__/lambdas.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e182e49d60e5c1386967d594408c95350ff0a30b Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/agent/__pycache__/lambdas.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/agent/__pycache__/mean_std_filter.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/agent/__pycache__/mean_std_filter.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a49909abdd8c0096fb662557e589eb67de5ac92f Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/agent/__pycache__/mean_std_filter.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/agent/__pycache__/obs_preproc.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/agent/__pycache__/obs_preproc.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a8753675f53c1dc1e8aff3064348708513da73de Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/agent/__pycache__/obs_preproc.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/agent/__pycache__/state_buffer.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/agent/__pycache__/state_buffer.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a1af36a01fe22bccfa5b8b297d4f31adadb0c353 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/agent/__pycache__/state_buffer.cpython-311.pyc differ diff --git 
a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/agent/clip_reward.py b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/agent/clip_reward.py new file mode 100644 index 0000000000000000000000000000000000000000..8d6c89916c97ebd8b2ede36d840c37de2a602883 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/agent/clip_reward.py @@ -0,0 +1,56 @@ +from typing import Any + +import numpy as np + +from ray.rllib.connectors.connector import ( + AgentConnector, + ConnectorContext, +) +from ray.rllib.connectors.registry import register_connector +from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.utils.typing import AgentConnectorDataType +from ray.rllib.utils.annotations import OldAPIStack + + +@OldAPIStack +class ClipRewardAgentConnector(AgentConnector): + def __init__(self, ctx: ConnectorContext, sign=False, limit=None): + super().__init__(ctx) + assert ( + not sign or not limit + ), "should not enable both sign and limit reward clipping." + self.sign = sign + self.limit = limit + + def transform(self, ac_data: AgentConnectorDataType) -> AgentConnectorDataType: + d = ac_data.data + assert ( + type(d) is dict + ), "Single agent data must be of type Dict[str, TensorStructType]" + + if SampleBatch.REWARDS not in d: + # Nothing to clip. May happen for initial obs. 
+ return ac_data + + if self.sign: + d[SampleBatch.REWARDS] = np.sign(d[SampleBatch.REWARDS]) + elif self.limit: + d[SampleBatch.REWARDS] = np.clip( + d[SampleBatch.REWARDS], + a_min=-self.limit, + a_max=self.limit, + ) + return ac_data + + def to_state(self): + return ClipRewardAgentConnector.__name__, { + "sign": self.sign, + "limit": self.limit, + } + + @staticmethod + def from_state(ctx: ConnectorContext, params: Any): + return ClipRewardAgentConnector(ctx, **params) + + +register_connector(ClipRewardAgentConnector.__name__, ClipRewardAgentConnector) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/agent/env_sampling.py b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/agent/env_sampling.py new file mode 100644 index 0000000000000000000000000000000000000000..f0ba6f0a4384caccd6647edde520cb7a84f0dc65 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/agent/env_sampling.py @@ -0,0 +1,30 @@ +from typing import Any + +from ray.rllib.connectors.connector import ( + AgentConnector, + ConnectorContext, +) +from ray.rllib.connectors.registry import register_connector +from ray.rllib.utils.typing import AgentConnectorDataType +from ray.rllib.utils.annotations import OldAPIStack + + +@OldAPIStack +class EnvSamplingAgentConnector(AgentConnector): + def __init__(self, ctx: ConnectorContext, sign=False, limit=None): + super().__init__(ctx) + self.observation_space = ctx.observation_space + + def transform(self, ac_data: AgentConnectorDataType) -> AgentConnectorDataType: + # EnvSamplingAgentConnector is a no-op connector. 
+ return ac_data + + def to_state(self): + return EnvSamplingAgentConnector.__name__, {} + + @staticmethod + def from_state(ctx: ConnectorContext, params: Any): + return EnvSamplingAgentConnector(ctx, **params) + + +register_connector(EnvSamplingAgentConnector.__name__, EnvSamplingAgentConnector) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/agent/lambdas.py b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/agent/lambdas.py new file mode 100644 index 0000000000000000000000000000000000000000..05a714a0df982e36bce96c33ddfb6f6e9ce05188 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/agent/lambdas.py @@ -0,0 +1,86 @@ +from typing import Any, Callable, Type + +import numpy as np +import tree # dm_tree + +from ray.rllib.connectors.connector import ( + AgentConnector, + ConnectorContext, +) +from ray.rllib.connectors.registry import register_connector +from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.utils.typing import ( + AgentConnectorDataType, + AgentConnectorsOutput, +) +from ray.rllib.utils.annotations import OldAPIStack + + +@OldAPIStack +def register_lambda_agent_connector( + name: str, fn: Callable[[Any], Any] +) -> Type[AgentConnector]: + """A util to register any simple transforming function as an AgentConnector + + The only requirement is that fn should take a single data object and return + a single data object. + + Args: + name: Name of the resulting actor connector. + fn: The function that transforms env / agent data. + + Returns: + A new AgentConnector class that transforms data using fn. 
+ """ + + class LambdaAgentConnector(AgentConnector): + def transform(self, ac_data: AgentConnectorDataType) -> AgentConnectorDataType: + return AgentConnectorDataType( + ac_data.env_id, ac_data.agent_id, fn(ac_data.data) + ) + + def to_state(self): + return name, None + + @staticmethod + def from_state(ctx: ConnectorContext, params: Any): + return LambdaAgentConnector(ctx) + + LambdaAgentConnector.__name__ = name + LambdaAgentConnector.__qualname__ = name + + register_connector(name, LambdaAgentConnector) + + return LambdaAgentConnector + + +@OldAPIStack +def flatten_data(data: AgentConnectorsOutput): + assert isinstance( + data, AgentConnectorsOutput + ), "Single agent data must be of type AgentConnectorsOutput" + + raw_dict = data.raw_dict + sample_batch = data.sample_batch + + flattened = {} + for k, v in sample_batch.items(): + if k in [SampleBatch.INFOS, SampleBatch.ACTIONS] or k.startswith("state_out_"): + # Do not flatten infos, actions, and state_out_ columns. + flattened[k] = v + continue + if v is None: + # Keep the same column shape. + flattened[k] = None + continue + flattened[k] = np.array(tree.flatten(v)) + flattened = SampleBatch(flattened, is_training=False) + + return AgentConnectorsOutput(raw_dict, flattened) + + +# Agent connector to build and return a flattened observation SampleBatch +# in addition to the original input dict. 
+FlattenDataAgentConnector = OldAPIStack( + register_lambda_agent_connector("FlattenDataAgentConnector", flatten_data) +) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/agent/mean_std_filter.py b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/agent/mean_std_filter.py new file mode 100644 index 0000000000000000000000000000000000000000..b2079344a203d7e08970672fe8fefcaa2caa1bb1 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/agent/mean_std_filter.py @@ -0,0 +1,187 @@ +from typing import Any, List +from gymnasium.spaces import Discrete, MultiDiscrete + +import numpy as np +import tree + +from ray.rllib.connectors.agent.synced_filter import SyncedFilterAgentConnector +from ray.rllib.connectors.connector import AgentConnector +from ray.rllib.connectors.connector import ( + ConnectorContext, +) +from ray.rllib.connectors.registry import register_connector +from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.utils.annotations import OldAPIStack +from ray.rllib.utils.filter import Filter +from ray.rllib.utils.filter import MeanStdFilter, ConcurrentMeanStdFilter +from ray.rllib.utils.spaces.space_utils import get_base_struct_from_space +from ray.rllib.utils.typing import AgentConnectorDataType +from ray.rllib.utils.filter import RunningStat + + +@OldAPIStack +class MeanStdObservationFilterAgentConnector(SyncedFilterAgentConnector): + """A connector used to mean-std-filter observations. + + Incoming observations are filtered such that the output of this filter is on + average zero and has a standard deviation of 1. This filtering is applied + separately per element of the observation space. 
+ """ + + def __init__( + self, + ctx: ConnectorContext, + demean: bool = True, + destd: bool = True, + clip: float = 10.0, + ): + SyncedFilterAgentConnector.__init__(self, ctx) + # We simply use the old MeanStdFilter until non-connector env_runner is fully + # deprecated to avoid duplicate code + + filter_shape = tree.map_structure( + lambda s: ( + None + if isinstance(s, (Discrete, MultiDiscrete)) # noqa + else np.array(s.shape) + ), + get_base_struct_from_space(ctx.observation_space), + ) + self.filter = MeanStdFilter(filter_shape, demean=demean, destd=destd, clip=clip) + + def transform(self, ac_data: AgentConnectorDataType) -> AgentConnectorDataType: + d = ac_data.data + assert ( + type(d) is dict + ), "Single agent data must be of type Dict[str, TensorStructType]" + if SampleBatch.OBS in d: + d[SampleBatch.OBS] = self.filter( + d[SampleBatch.OBS], update=self._is_training + ) + if SampleBatch.NEXT_OBS in d: + d[SampleBatch.NEXT_OBS] = self.filter( + d[SampleBatch.NEXT_OBS], update=self._is_training + ) + + return ac_data + + def to_state(self): + # Flattening is deterministic + flattened_rs = tree.flatten(self.filter.running_stats) + flattened_buffer = tree.flatten(self.filter.buffer) + return MeanStdObservationFilterAgentConnector.__name__, { + "shape": self.filter.shape, + "no_preprocessor": self.filter.no_preprocessor, + "demean": self.filter.demean, + "destd": self.filter.destd, + "clip": self.filter.clip, + "running_stats": [s.to_state() for s in flattened_rs], + "buffer": [s.to_state() for s in flattened_buffer], + } + + # demean, destd, clip, and a state dict + @staticmethod + def from_state( + ctx: ConnectorContext, + params: List[Any] = None, + demean: bool = True, + destd: bool = True, + clip: float = 10.0, + ): + connector = MeanStdObservationFilterAgentConnector(ctx, demean, destd, clip) + if params: + connector.filter.shape = params["shape"] + connector.filter.no_preprocessor = params["no_preprocessor"] + connector.filter.demean = 
params["demean"] + connector.filter.destd = params["destd"] + connector.filter.clip = params["clip"] + + # Unflattening is deterministic + running_stats = [RunningStat.from_state(s) for s in params["running_stats"]] + connector.filter.running_stats = tree.unflatten_as( + connector.filter.shape, running_stats + ) + + # Unflattening is deterministic + buffer = [RunningStat.from_state(s) for s in params["buffer"]] + connector.filter.buffer = tree.unflatten_as(connector.filter.shape, buffer) + + return connector + + def reset_state(self) -> None: + """Creates copy of current state and resets accumulated state""" + if not self._is_training: + raise ValueError( + "State of {} can only be changed when trainin.".format(self.__name__) + ) + self.filter.reset_buffer() + + def apply_changes(self, other: "Filter", *args, **kwargs) -> None: + """Updates self with state from other filter.""" + # inline this as soon as we deprecate ordinary filter with non-connector + # env_runner + if not self._is_training: + raise ValueError( + "Changes can only be applied to {} when trainin.".format(self.__name__) + ) + return self.filter.apply_changes(other, *args, **kwargs) + + def copy(self) -> "Filter": + """Creates a new object with same state as self. + + This is a legacy Filter method that we need to keep around for now + + Returns: + A copy of self. 
+ """ + # inline this as soon as we deprecate ordinary filter with non-connector + # env_runner + return self.filter.copy() + + def sync(self, other: "AgentConnector") -> None: + """Copies all state from other filter to self.""" + # inline this as soon as we deprecate ordinary filter with non-connector + # env_runner + if not self._is_training: + raise ValueError( + "{} can only be synced when trainin.".format(self.__name__) + ) + return self.filter.sync(other.filter) + + +@OldAPIStack +class ConcurrentMeanStdObservationFilterAgentConnector( + MeanStdObservationFilterAgentConnector +): + """A concurrent version of the MeanStdObservationFilterAgentConnector. + + This version's filter has all operations wrapped by a threading.RLock. + It can therefore be safely used by multiple threads. + """ + + def __init__(self, ctx: ConnectorContext, demean=True, destd=True, clip=10.0): + SyncedFilterAgentConnector.__init__(self, ctx) + # We simply use the old MeanStdFilter until non-connector env_runner is fully + # deprecated to avoid duplicate code + + filter_shape = tree.map_structure( + lambda s: ( + None + if isinstance(s, (Discrete, MultiDiscrete)) # noqa + else np.array(s.shape) + ), + get_base_struct_from_space(ctx.observation_space), + ) + self.filter = ConcurrentMeanStdFilter( + filter_shape, demean=True, destd=True, clip=10.0 + ) + + +register_connector( + MeanStdObservationFilterAgentConnector.__name__, + MeanStdObservationFilterAgentConnector, +) +register_connector( + ConcurrentMeanStdObservationFilterAgentConnector.__name__, + ConcurrentMeanStdObservationFilterAgentConnector, +) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/agent/obs_preproc.py b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/agent/obs_preproc.py new file mode 100644 index 0000000000000000000000000000000000000000..f783bb6718cca79a26752e2c961f880968a1ce58 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/agent/obs_preproc.py @@ -0,0 +1,69 @@ 
+from typing import Any + +from ray.rllib.connectors.connector import ( + AgentConnector, + ConnectorContext, +) +from ray.rllib.connectors.registry import register_connector +from ray.rllib.models.preprocessors import get_preprocessor, NoPreprocessor +from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.utils.typing import AgentConnectorDataType +from ray.rllib.utils.annotations import OldAPIStack + + +@OldAPIStack +class ObsPreprocessorConnector(AgentConnector): + """A connector that wraps around existing RLlib observation preprocessors. + + This includes: + - OneHotPreprocessor for Discrete and Multi-Discrete spaces. + - GenericPixelPreprocessor and AtariRamPreprocessor for Atari spaces. + - TupleFlatteningPreprocessor and DictFlatteningPreprocessor for flattening + arbitrary nested input observations. + - RepeatedValuesPreprocessor for padding observations from RLlib Repeated + observation space. + """ + + def __init__(self, ctx: ConnectorContext): + super().__init__(ctx) + + if hasattr(ctx.observation_space, "original_space"): + # ctx.observation_space is the space this Policy deals with. + # We need to preprocess data from the original observation space here. 
+ obs_space = ctx.observation_space.original_space + else: + obs_space = ctx.observation_space + + self._preprocessor = get_preprocessor(obs_space)( + obs_space, ctx.config.get("model", {}) + ) + + def is_identity(self): + """Returns whether this preprocessor connector is a no-op preprocessor.""" + return isinstance(self._preprocessor, NoPreprocessor) + + def transform(self, ac_data: AgentConnectorDataType) -> AgentConnectorDataType: + d = ac_data.data + assert type(d) is dict, ( + "Single agent data must be of type Dict[str, TensorStructType] but is of " + "type {}".format(type(d)) + ) + + if SampleBatch.OBS in d: + d[SampleBatch.OBS] = self._preprocessor.transform(d[SampleBatch.OBS]) + if SampleBatch.NEXT_OBS in d: + d[SampleBatch.NEXT_OBS] = self._preprocessor.transform( + d[SampleBatch.NEXT_OBS] + ) + + return ac_data + + def to_state(self): + return ObsPreprocessorConnector.__name__, None + + @staticmethod + def from_state(ctx: ConnectorContext, params: Any): + return ObsPreprocessorConnector(ctx) + + +register_connector(ObsPreprocessorConnector.__name__, ObsPreprocessorConnector) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/agent/pipeline.py b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/agent/pipeline.py new file mode 100644 index 0000000000000000000000000000000000000000..898ac79b1c709429e4751eca7eb427b3afa26a4e --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/agent/pipeline.py @@ -0,0 +1,72 @@ +import logging +from typing import Any, List +from collections import defaultdict + +from ray.rllib.connectors.connector import ( + AgentConnector, + Connector, + ConnectorContext, + ConnectorPipeline, +) +from ray.rllib.connectors.registry import get_connector, register_connector +from ray.rllib.utils.typing import ActionConnectorDataType, AgentConnectorDataType +from ray.rllib.utils.annotations import OldAPIStack +from ray.util.timer import _Timer + + +logger = logging.getLogger(__name__) + + 
+@OldAPIStack +class AgentConnectorPipeline(ConnectorPipeline, AgentConnector): + def __init__(self, ctx: ConnectorContext, connectors: List[Connector]): + super().__init__(ctx, connectors) + self.timers = defaultdict(_Timer) + + def reset(self, env_id: str): + for c in self.connectors: + c.reset(env_id) + + def on_policy_output(self, output: ActionConnectorDataType): + for c in self.connectors: + c.on_policy_output(output) + + def __call__( + self, acd_list: List[AgentConnectorDataType] + ) -> List[AgentConnectorDataType]: + ret = acd_list + for c in self.connectors: + timer = self.timers[str(c)] + with timer: + ret = c(ret) + return ret + + def to_state(self): + children = [] + for c in self.connectors: + state = c.to_state() + assert isinstance(state, tuple) and len(state) == 2, ( + "Serialized connector state must be in the format of " + f"Tuple[name: str, params: Any]. Instead we got {state}" + f"for connector {c.__name__}." + ) + children.append(state) + return AgentConnectorPipeline.__name__, children + + @staticmethod + def from_state(ctx: ConnectorContext, params: List[Any]): + assert ( + type(params) is list + ), "AgentConnectorPipeline takes a list of connector params." 
+ connectors = [] + for state in params: + try: + name, subparams = state + connectors.append(get_connector(name, ctx, subparams)) + except Exception as e: + logger.error(f"Failed to de-serialize connector state: {state}") + raise e + return AgentConnectorPipeline(ctx, connectors) + + +register_connector(AgentConnectorPipeline.__name__, AgentConnectorPipeline) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/agent/state_buffer.py b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/agent/state_buffer.py new file mode 100644 index 0000000000000000000000000000000000000000..4516abd8bbe0ad47d6cc96baedb8909ff26b62fd --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/agent/state_buffer.py @@ -0,0 +1,120 @@ +from collections import defaultdict +import logging +import pickle +from typing import Any + +import numpy as np +from ray.rllib.utils.annotations import override +import tree # dm_tree + +from ray.rllib.connectors.connector import ( + AgentConnector, + Connector, + ConnectorContext, +) +from ray import cloudpickle +from ray.rllib.connectors.registry import register_connector +from ray.rllib.core.columns import Columns +from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.utils.spaces.space_utils import get_base_struct_from_space +from ray.rllib.utils.typing import ActionConnectorDataType, AgentConnectorDataType +from ray.rllib.utils.annotations import OldAPIStack + + +logger = logging.getLogger(__name__) + + +@OldAPIStack +class StateBufferConnector(AgentConnector): + def __init__(self, ctx: ConnectorContext, states: Any = None): + super().__init__(ctx) + + self._initial_states = ctx.initial_states + self._action_space_struct = get_base_struct_from_space(ctx.action_space) + + self._states = defaultdict(lambda: defaultdict(lambda: (None, None, None))) + self._enable_new_api_stack = False + # TODO(jungong) : we would not need this if policies are never stashed + # during the rollout of a single episode. 
+ if states: + try: + self._states = cloudpickle.loads(states) + except pickle.UnpicklingError: + # StateBufferConnector states are only needed for rare cases + # like stashing then restoring a policy during the rollout of + # a single episode. + # It is ok to ignore the error for most of the cases here. + logger.info( + "Can not restore StateBufferConnector states. This warning can " + "usually be ignore, unless it is from restoring a stashed policy." + ) + + @override(Connector) + def in_eval(self): + super().in_eval() + + def reset(self, env_id: str): + # States should not be carried over between episodes. + if env_id in self._states: + del self._states[env_id] + + def on_policy_output(self, ac_data: ActionConnectorDataType): + # Buffer latest output states for next input __call__. + self._states[ac_data.env_id][ac_data.agent_id] = ac_data.output + + def transform(self, ac_data: AgentConnectorDataType) -> AgentConnectorDataType: + d = ac_data.data + assert ( + type(d) is dict + ), "Single agent data must be of type Dict[str, TensorStructType]" + + env_id = ac_data.env_id + agent_id = ac_data.agent_id + assert ( + env_id is not None and agent_id is not None + ), f"StateBufferConnector requires env_id(f{env_id}) and agent_id(f{agent_id})" + + action, states, fetches = self._states[env_id][agent_id] + + if action is not None: + d[SampleBatch.ACTIONS] = action # Last action + else: + # Default zero action. + d[SampleBatch.ACTIONS] = tree.map_structure( + lambda s: np.zeros_like(s.sample(), s.dtype) + if hasattr(s, "dtype") + else np.zeros_like(s.sample()), + self._action_space_struct, + ) + + if states is None: + states = self._initial_states + if self._enable_new_api_stack: + if states: + d[Columns.STATE_OUT] = states + else: + for i, v in enumerate(states): + d["state_out_{}".format(i)] = v + + # Also add extra fetches if available. 
+ if fetches: + d.update(fetches) + + return ac_data + + def to_state(self): + # Note(jungong) : it is ok to use cloudpickle here for stats because: + # 1. self._states may contain arbitary data objects, and will be hard + # to serialize otherwise. + # 2. seriazlized states are only useful if a policy is stashed and + # restored during the rollout of a single episode. So it is ok to + # use cloudpickle for such non-persistent data bits. + states = cloudpickle.dumps(self._states) + return StateBufferConnector.__name__, states + + @staticmethod + def from_state(ctx: ConnectorContext, params: Any): + return StateBufferConnector(ctx, params) + + +register_connector(StateBufferConnector.__name__, StateBufferConnector) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/agent/synced_filter.py b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/agent/synced_filter.py new file mode 100644 index 0000000000000000000000000000000000000000..51c625d9aceec9401599ef87b42cf522704c5ea3 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/agent/synced_filter.py @@ -0,0 +1,52 @@ +from ray.rllib.connectors.connector import ( + AgentConnector, + ConnectorContext, +) +from ray.rllib.utils.annotations import OldAPIStack +from ray.rllib.utils.filter import Filter + + +@OldAPIStack +class SyncedFilterAgentConnector(AgentConnector): + """An agent connector that filters with synchronized parameters.""" + + def __init__(self, ctx: ConnectorContext, *args, **kwargs): + super().__init__(ctx) + if args or kwargs: + raise ValueError( + "SyncedFilterAgentConnector does not take any additional arguments, " + "but got args=`{}` and kwargs={}.".format(args, kwargs) + ) + + def apply_changes(self, other: "Filter", *args, **kwargs) -> None: + """Updates self with state from other filter.""" + # TODO: (artur) inline this as soon as we deprecate ordinary filter with + # non-connecto env_runner + return self.filter.apply_changes(other, *args, **kwargs) + + def 
copy(self) -> "Filter": + """Creates a new object with same state as self. + + This is a legacy Filter method that we need to keep around for now + + Returns: + A copy of self. + """ + # inline this as soon as we deprecate ordinary filter with non-connector + # env_runner + return self.filter.copy() + + def sync(self, other: "AgentConnector") -> None: + """Copies all state from other filter to self.""" + # TODO: (artur) inline this as soon as we deprecate ordinary filter with + # non-connector env_runner + return self.filter.sync(other.filter) + + def reset_state(self) -> None: + """Creates copy of current state and resets accumulated state""" + raise NotImplementedError + + def as_serializable(self) -> "Filter": + # TODO: (artur) inline this as soon as we deprecate ordinary filter with + # non-connector env_runner + return self.filter.as_serializable() diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/agent/view_requirement.py b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/agent/view_requirement.py new file mode 100644 index 0000000000000000000000000000000000000000..7f035bb97a92c7856ecc11e6dc7eac169a823a51 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/agent/view_requirement.py @@ -0,0 +1,135 @@ +from collections import defaultdict +from typing import Any + +from ray.rllib.connectors.connector import ( + AgentConnector, + ConnectorContext, +) +from ray.rllib.connectors.registry import register_connector +from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.utils.typing import ( + AgentConnectorDataType, + AgentConnectorsOutput, +) +from ray.rllib.utils.annotations import OldAPIStack +from ray.rllib.evaluation.collectors.agent_collector import AgentCollector + + +@OldAPIStack +class ViewRequirementAgentConnector(AgentConnector): + """This connector does 2 things: + 1. It filters data columns based on view_requirements for training and inference. + 2. 
It buffers the right amount of history for computing the sample batch for + action computation. + The output of this connector is AgentConnectorsOut, which basically is + a tuple of 2 things: + { + "raw_dict": {"obs": ...} + "sample_batch": SampleBatch + } + raw_dict, which contains raw data for the latest time slice, + can be used to construct a complete episode by Sampler for training purpose. + The "for_action" SampleBatch can be used to directly call the policy. + """ + + def __init__(self, ctx: ConnectorContext): + super().__init__(ctx) + + self._view_requirements = ctx.view_requirements + _enable_new_api_stack = False + + # a dict of env_id to a dict of agent_id to a list of agent_collector objects + self.agent_collectors = defaultdict( + lambda: defaultdict( + lambda: AgentCollector( + self._view_requirements, + max_seq_len=ctx.config["model"]["max_seq_len"], + intial_states=ctx.initial_states, + disable_action_flattening=ctx.config.get( + "_disable_action_flattening", False + ), + is_policy_recurrent=ctx.is_policy_recurrent, + # Note(jungong): We only leverage AgentCollector for building sample + # batches for computing actions. + # So regardless of whether this ViewRequirement connector is in + # training or inference mode, we should tell these AgentCollectors + # to behave in inference mode, so they don't accumulate episode data + # that is not useful for inference. + is_training=False, + _enable_new_api_stack=_enable_new_api_stack, + ) + ) + ) + + def reset(self, env_id: str): + if env_id in self.agent_collectors: + del self.agent_collectors[env_id] + + def transform(self, ac_data: AgentConnectorDataType) -> AgentConnectorDataType: + d = ac_data.data + assert ( + type(d) is dict + ), "Single agent data must be of type Dict[str, TensorStructType]" + + env_id = ac_data.env_id + agent_id = ac_data.agent_id + # TODO: we don't keep episode_id around so use env_id as episode_id ? 
+ episode_id = env_id if SampleBatch.EPS_ID not in d else d[SampleBatch.EPS_ID] + + assert env_id is not None and agent_id is not None, ( + f"ViewRequirementAgentConnector requires env_id({env_id}) " + "and agent_id({agent_id})" + ) + + assert ( + self._view_requirements + ), "ViewRequirements required by ViewRequirementAgentConnector" + + # Note(jungong) : we need to keep the entire input dict here. + # A column may be used by postprocessing (GAE) even if its + # view_requirement.used_for_training is False. + training_dict = d + + agent_collector = self.agent_collectors[env_id][agent_id] + + if SampleBatch.NEXT_OBS not in d: + raise ValueError(f"connector data {d} should contain next_obs.") + # TODO(avnishn; kourosh) Unsure how agent_index is necessary downstream + # since there is no mapping from agent_index to agent_id that exists. + # need to remove this from the SampleBatch later. + # fall back to using dummy index if no index is available + if SampleBatch.AGENT_INDEX in d: + agent_index = d[SampleBatch.AGENT_INDEX] + else: + try: + agent_index = float(agent_id) + except ValueError: + agent_index = -1 + if agent_collector.is_empty(): + agent_collector.add_init_obs( + episode_id=episode_id, + agent_index=agent_index, + env_id=env_id, + init_obs=d[SampleBatch.NEXT_OBS], + init_infos=d.get(SampleBatch.INFOS), + ) + else: + agent_collector.add_action_reward_next_obs(d) + sample_batch = agent_collector.build_for_inference() + + return_data = AgentConnectorDataType( + env_id, agent_id, AgentConnectorsOutput(training_dict, sample_batch) + ) + return return_data + + def to_state(self): + return ViewRequirementAgentConnector.__name__, None + + @staticmethod + def from_state(ctx: ConnectorContext, params: Any): + return ViewRequirementAgentConnector(ctx) + + +register_connector( + ViewRequirementAgentConnector.__name__, ViewRequirementAgentConnector +) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/learner/__init__.py 
b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/learner/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..96fd6cd51af65e54c933117976a3cc5a9976d42b --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/learner/__init__.py @@ -0,0 +1,43 @@ +from ray.rllib.connectors.common.add_observations_from_episodes_to_batch import ( + AddObservationsFromEpisodesToBatch, +) +from ray.rllib.connectors.common.add_states_from_episodes_to_batch import ( + AddStatesFromEpisodesToBatch, +) +from ray.rllib.connectors.common.add_time_dim_to_batch_and_zero_pad import ( + AddTimeDimToBatchAndZeroPad, +) +from ray.rllib.connectors.common.agent_to_module_mapping import AgentToModuleMapping +from ray.rllib.connectors.common.batch_individual_items import BatchIndividualItems +from ray.rllib.connectors.common.numpy_to_tensor import NumpyToTensor +from ray.rllib.connectors.learner.add_columns_from_episodes_to_train_batch import ( + AddColumnsFromEpisodesToTrainBatch, +) +from ray.rllib.connectors.learner.add_next_observations_from_episodes_to_train_batch import ( # noqa + AddNextObservationsFromEpisodesToTrainBatch, +) +from ray.rllib.connectors.learner.add_one_ts_to_episodes_and_truncate import ( + AddOneTsToEpisodesAndTruncate, +) +from ray.rllib.connectors.learner.compute_returns_to_go import ComputeReturnsToGo +from ray.rllib.connectors.learner.general_advantage_estimation import ( + GeneralAdvantageEstimation, +) +from ray.rllib.connectors.learner.learner_connector_pipeline import ( + LearnerConnectorPipeline, +) + +__all__ = [ + "AddColumnsFromEpisodesToTrainBatch", + "AddNextObservationsFromEpisodesToTrainBatch", + "AddObservationsFromEpisodesToBatch", + "AddOneTsToEpisodesAndTruncate", + "AddStatesFromEpisodesToBatch", + "AddTimeDimToBatchAndZeroPad", + "AgentToModuleMapping", + "BatchIndividualItems", + "ComputeReturnsToGo", + "GeneralAdvantageEstimation", + "LearnerConnectorPipeline", + "NumpyToTensor", +] diff --git 
a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/learner/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/learner/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f44ae9a28b8128ffc8801d606b17f008eec52057 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/learner/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/learner/__pycache__/add_columns_from_episodes_to_train_batch.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/learner/__pycache__/add_columns_from_episodes_to_train_batch.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..450004db37c94aebbb3810a555e947a29f680b9d Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/learner/__pycache__/add_columns_from_episodes_to_train_batch.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/learner/__pycache__/add_next_observations_from_episodes_to_train_batch.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/learner/__pycache__/add_next_observations_from_episodes_to_train_batch.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a70e5e7744be37b7115f22b0ce73964cca1320a8 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/learner/__pycache__/add_next_observations_from_episodes_to_train_batch.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/learner/__pycache__/compute_returns_to_go.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/learner/__pycache__/compute_returns_to_go.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..22ecd106766ddefa0220a902ff5c891e25271a84 Binary files /dev/null and 
b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/learner/__pycache__/compute_returns_to_go.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/learner/__pycache__/general_advantage_estimation.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/learner/__pycache__/general_advantage_estimation.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ce192b708d48a3943ed2c3043374db75d8cf2473 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/learner/__pycache__/general_advantage_estimation.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/module_to_env/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/module_to_env/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..34212d0373b67030538dc2575213f58e5738a805 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/module_to_env/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/module_to_env/__pycache__/get_actions.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/module_to_env/__pycache__/get_actions.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7e42bc291afbad229c998583677074d4ef24374f Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/module_to_env/__pycache__/get_actions.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/module_to_env/__pycache__/module_to_env_pipeline.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/module_to_env/__pycache__/module_to_env_pipeline.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..979480291c34353fd8f6c96674d792d130c9076c Binary files 
/dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/module_to_env/__pycache__/module_to_env_pipeline.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/module_to_env/__pycache__/normalize_and_clip_actions.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/module_to_env/__pycache__/normalize_and_clip_actions.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9e561477b22ad120d8115026b7b98909655c0554 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/module_to_env/__pycache__/normalize_and_clip_actions.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/module_to_env/__pycache__/remove_single_ts_time_rank_from_batch.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/module_to_env/__pycache__/remove_single_ts_time_rank_from_batch.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7d8b579f2e0b3141a4ce19662f95761f898a0d36 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/module_to_env/__pycache__/remove_single_ts_time_rank_from_batch.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/module_to_env/__pycache__/unbatch_to_individual_items.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/module_to_env/__pycache__/unbatch_to_individual_items.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7150ca4395f9d682a77f0c61f11a9d313580461f Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/module_to_env/__pycache__/unbatch_to_individual_items.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/models/__pycache__/__init__.cpython-311.pyc new file mode 
100644 index 0000000000000000000000000000000000000000..438a8992c4340a1382b2484cf70f5aae1de4691f Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/models/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/__pycache__/action_dist.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/models/__pycache__/action_dist.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..305e7c89062d95538df6060d0027859be99ccb34 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/models/__pycache__/action_dist.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/__pycache__/catalog.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/models/__pycache__/catalog.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f34762128f1e0d746f4aa98ec18da11af85a804b Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/models/__pycache__/catalog.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/__pycache__/distributions.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/models/__pycache__/distributions.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8dc8318074487d75a786b0ed2edde69adbc4aaa2 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/models/__pycache__/distributions.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/__pycache__/modelv2.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/models/__pycache__/modelv2.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b33fb7ba59e453fc2f5608f293fc651683196db8 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/models/__pycache__/modelv2.cpython-311.pyc differ diff --git 
a/.venv/lib/python3.11/site-packages/ray/rllib/models/__pycache__/preprocessors.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/models/__pycache__/preprocessors.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ad709c46246ac0d15694b434b72f8233d3add08d Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/models/__pycache__/preprocessors.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/__pycache__/repeated_values.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/models/__pycache__/repeated_values.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e533e95573e55328946077a84f9f8722f62e496e Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/models/__pycache__/repeated_values.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/__pycache__/utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/models/__pycache__/utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6f3fd043257e9963118f8ada6a4f1bad2a610077 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/models/__pycache__/utils.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b8536bd9866b17066ff454d9cee1714d49b0d320 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/__pycache__/attention_net.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/__pycache__/attention_net.cpython-311.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..7644bd62050737deb8ed1507ee56479882bc0747 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/__pycache__/attention_net.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/__pycache__/fcnet.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/__pycache__/fcnet.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c4a8a31d0527870f5fdbdf48f1b672ca4612943d Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/__pycache__/fcnet.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/__pycache__/recurrent_net.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/__pycache__/recurrent_net.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5c468118452f46723185aa3e184f689ba53a640a Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/__pycache__/recurrent_net.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/__pycache__/tf_action_dist.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/__pycache__/tf_action_dist.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e76bc524aae52a668972d85426ee1c3f5f3cead4 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/__pycache__/tf_action_dist.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/__pycache__/tf_distributions.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/__pycache__/tf_distributions.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8bb28debfc4e67540968d19a49c6779819a46336 Binary files /dev/null and 
b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/__pycache__/tf_distributions.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/__pycache__/tf_modelv2.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/__pycache__/tf_modelv2.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0a9bdcbb97ed640c72b31e80c8f77b3cb62f28ef Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/__pycache__/tf_modelv2.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/__pycache__/visionnet.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/__pycache__/visionnet.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2f5a6dfee8cce5714982dddd8bb358cf16170e41 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/__pycache__/visionnet.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/layers/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/layers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..6b840c42b17eff05c8e0c7d440a0f2963c9ed35a --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/layers/__init__.py @@ -0,0 +1,17 @@ +from ray.rllib.models.tf.layers.gru_gate import GRUGate +from ray.rllib.models.tf.layers.noisy_layer import NoisyLayer +from ray.rllib.models.tf.layers.relative_multi_head_attention import ( + PositionalEmbedding, + RelativeMultiHeadAttention, +) +from ray.rllib.models.tf.layers.skip_connection import SkipConnection +from ray.rllib.models.tf.layers.multi_head_attention import MultiHeadAttention + +__all__ = [ + "GRUGate", + "MultiHeadAttention", + "NoisyLayer", + "PositionalEmbedding", + "RelativeMultiHeadAttention", + "SkipConnection", +] diff --git 
a/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/layers/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/layers/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a4b6c23a1c4b4fc80b15a19b6266974636237db0 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/layers/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/layers/__pycache__/gru_gate.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/layers/__pycache__/gru_gate.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..493ee9bb0a1c354c8190738650729f2694a43aea Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/layers/__pycache__/gru_gate.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/layers/__pycache__/multi_head_attention.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/layers/__pycache__/multi_head_attention.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3e4e718b8171c2ebef53346f36e6635b3cb80b7f Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/layers/__pycache__/multi_head_attention.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/layers/__pycache__/noisy_layer.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/layers/__pycache__/noisy_layer.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3dfcce066cc81d706d359cdc64583f714af3e151 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/layers/__pycache__/noisy_layer.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/layers/__pycache__/relative_multi_head_attention.cpython-311.pyc 
b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/layers/__pycache__/relative_multi_head_attention.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..26d6464c3875a4ae6a800e15204b684d9df81a69 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/layers/__pycache__/relative_multi_head_attention.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/layers/__pycache__/skip_connection.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/layers/__pycache__/skip_connection.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..19c1521c15fa222a422c4be166b2a923dcaf1a7d Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/layers/__pycache__/skip_connection.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/layers/gru_gate.py b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/layers/gru_gate.py new file mode 100644 index 0000000000000000000000000000000000000000..a41b23bbf534a15d0d3c71333bcba1bb0c0a6d3b --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/layers/gru_gate.py @@ -0,0 +1,58 @@ +from ray.rllib.utils.framework import try_import_tf +from ray.rllib.utils.typing import TensorType, TensorShape +from ray.rllib.utils.deprecation import deprecation_warning +from ray.util import log_once + +tf1, tf, tfv = try_import_tf() + + +class GRUGate(tf.keras.layers.Layer if tf else object): + def __init__(self, init_bias: float = 0.0, **kwargs): + super().__init__(**kwargs) + self._init_bias = init_bias + if log_once("gru_gate"): + deprecation_warning( + old="rllib.models.tf.layers.GRUGate", + ) + + def build(self, input_shape: TensorShape): + h_shape, x_shape = input_shape + if x_shape[-1] != h_shape[-1]: + raise ValueError( + "Both inputs to GRUGate must have equal size in last axis!" 
+ ) + + dim = int(h_shape[-1]) + self._w_r = self.add_weight(shape=(dim, dim)) + self._w_z = self.add_weight(shape=(dim, dim)) + self._w_h = self.add_weight(shape=(dim, dim)) + + self._u_r = self.add_weight(shape=(dim, dim)) + self._u_z = self.add_weight(shape=(dim, dim)) + self._u_h = self.add_weight(shape=(dim, dim)) + + def bias_initializer(shape, dtype): + return tf.fill(shape, tf.cast(self._init_bias, dtype=dtype)) + + self._bias_z = self.add_weight(shape=(dim,), initializer=bias_initializer) + + def call(self, inputs: TensorType, **kwargs) -> TensorType: + # Pass in internal state first. + h, X = inputs + + r = tf.tensordot(X, self._w_r, axes=1) + tf.tensordot(h, self._u_r, axes=1) + r = tf.nn.sigmoid(r) + + z = ( + tf.tensordot(X, self._w_z, axes=1) + + tf.tensordot(h, self._u_z, axes=1) + - self._bias_z + ) + z = tf.nn.sigmoid(z) + + h_next = tf.tensordot(X, self._w_h, axes=1) + tf.tensordot( + (h * r), self._u_h, axes=1 + ) + h_next = tf.nn.tanh(h_next) + + return (1 - z) * h + z * h_next diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/layers/multi_head_attention.py b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/layers/multi_head_attention.py new file mode 100644 index 0000000000000000000000000000000000000000..595608989f0b7da66e640a041289ae646cb36ae4 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/layers/multi_head_attention.py @@ -0,0 +1,61 @@ +""" +[1] - Attention Is All You Need - Vaswani, Jones, Shazeer, Parmar, + Uszkoreit, Gomez, Kaiser - Google Brain/Research, U Toronto - 2017. 
+ https://arxiv.org/pdf/1706.03762.pdf +""" +from ray.rllib.utils.framework import try_import_tf +from ray.rllib.utils.typing import TensorType +from ray.rllib.utils.deprecation import deprecation_warning +from ray.util import log_once + +tf1, tf, tfv = try_import_tf() + + +class MultiHeadAttention(tf.keras.layers.Layer if tf else object): + """A multi-head attention layer described in [1].""" + + def __init__(self, out_dim: int, num_heads: int, head_dim: int, **kwargs): + super().__init__(**kwargs) + + # No bias or non-linearity. + self._num_heads = num_heads + self._head_dim = head_dim + self._qkv_layer = tf.keras.layers.Dense( + 3 * num_heads * head_dim, use_bias=False + ) + self._linear_layer = tf.keras.layers.TimeDistributed( + tf.keras.layers.Dense(out_dim, use_bias=False) + ) + if log_once("multi_head_attention"): + deprecation_warning( + old="rllib.models.tf.layers.MultiHeadAttention", + ) + + def call(self, inputs: TensorType) -> TensorType: + L = tf.shape(inputs)[1] # length of segment + H = self._num_heads # number of attention heads + D = self._head_dim # attention head dimension + + qkv = self._qkv_layer(inputs) + + queries, keys, values = tf.split(qkv, 3, -1) + queries = queries[:, -L:] # only query based on the segment + + queries = tf.reshape(queries, [-1, L, H, D]) + keys = tf.reshape(keys, [-1, L, H, D]) + values = tf.reshape(values, [-1, L, H, D]) + + score = tf.einsum("bihd,bjhd->bijh", queries, keys) + score = score / D**0.5 + + # causal mask of the same length as the sequence + mask = tf.sequence_mask(tf.range(1, L + 1), dtype=score.dtype) + mask = mask[None, :, :, None] + + masked_score = score * mask + 1e30 * (mask - 1.0) + wmat = tf.nn.softmax(masked_score, axis=2) + + out = tf.einsum("bijh,bjhd->bihd", wmat, values) + shape = tf.concat([tf.shape(out)[:2], [H * D]], axis=0) + out = tf.reshape(out, shape) + return self._linear_layer(out) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/layers/noisy_layer.py 
b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/layers/noisy_layer.py new file mode 100644 index 0000000000000000000000000000000000000000..5bc149d5de13beee0e77fcda069e321850507633 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/layers/noisy_layer.py @@ -0,0 +1,118 @@ +import numpy as np + +from ray.rllib.models.utils import get_activation_fn +from ray.rllib.utils.framework import ( + get_variable, + try_import_tf, + TensorType, + TensorShape, +) +from ray.rllib.utils.deprecation import deprecation_warning +from ray.util import log_once + +tf1, tf, tfv = try_import_tf() + + +class NoisyLayer(tf.keras.layers.Layer if tf else object): + r"""A Layer that adds learnable Noise to some previous layer's outputs. + + Consists of: + - a common dense layer: y = w^{T}x + b + - a noisy layer: y = (w + \epsilon_w*\sigma_w)^{T}x + + (b+\epsilon_b*\sigma_b) + , where \epsilon are random variables sampled from factorized normal + distributions and \sigma are trainable variables which are expected to + vanish along the training procedure. + """ + + def __init__( + self, prefix: str, out_size: int, sigma0: float, activation: str = "relu" + ): + """Initializes a NoisyLayer object. + + Args: + prefix: + out_size: Output size for Noisy Layer + sigma0: Initialization value for sigma_b (bias noise) + non_linear: Non-linear activation for Noisy Layer + """ + super().__init__() + self.prefix = prefix + self.out_size = out_size + # TF noise generation can be unreliable on GPU + # If generating the noise on the CPU, + # lowering sigma0 to 0.1 may be helpful + self.sigma0 = sigma0 # 0.5~GPU, 0.1~CPU + self.activation = activation + # Variables. + self.w = None # Weight matrix. + self.b = None # Biases. + self.sigma_w = None # Noise for weight matrix + self.sigma_b = None # Noise for biases. 
+ if log_once("noisy_layer"): + deprecation_warning( + old="rllib.models.tf.layers.NoisyLayer", + ) + + def build(self, input_shape: TensorShape): + in_size = int(input_shape[1]) + + self.sigma_w = get_variable( + value=tf.keras.initializers.RandomUniform( + minval=-1.0 / np.sqrt(float(in_size)), + maxval=1.0 / np.sqrt(float(in_size)), + ), + trainable=True, + tf_name=self.prefix + "_sigma_w", + shape=[in_size, self.out_size], + dtype=tf.float32, + ) + + self.sigma_b = get_variable( + value=tf.keras.initializers.Constant(self.sigma0 / np.sqrt(float(in_size))), + trainable=True, + tf_name=self.prefix + "_sigma_b", + shape=[self.out_size], + dtype=tf.float32, + ) + + self.w = get_variable( + value=tf.keras.initializers.GlorotUniform(), + tf_name=self.prefix + "_fc_w", + trainable=True, + shape=[in_size, self.out_size], + dtype=tf.float32, + ) + + self.b = get_variable( + value=tf.keras.initializers.Zeros(), + tf_name=self.prefix + "_fc_b", + trainable=True, + shape=[self.out_size], + dtype=tf.float32, + ) + + def call(self, inputs: TensorType) -> TensorType: + in_size = int(inputs.shape[1]) + epsilon_in = tf.random.normal(shape=[in_size]) + epsilon_out = tf.random.normal(shape=[self.out_size]) + epsilon_in = self._f_epsilon(epsilon_in) + epsilon_out = self._f_epsilon(epsilon_out) + epsilon_w = tf.matmul( + a=tf.expand_dims(epsilon_in, -1), b=tf.expand_dims(epsilon_out, 0) + ) + epsilon_b = epsilon_out + + action_activation = ( + tf.matmul(inputs, self.w + self.sigma_w * epsilon_w) + + self.b + + self.sigma_b * epsilon_b + ) + + fn = get_activation_fn(self.activation, framework="tf") + if fn is not None: + action_activation = fn(action_activation) + return action_activation + + def _f_epsilon(self, x: TensorType) -> TensorType: + return tf.math.sign(x) * tf.math.sqrt(tf.math.abs(x)) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/layers/relative_multi_head_attention.py 
b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/layers/relative_multi_head_attention.py new file mode 100644 index 0000000000000000000000000000000000000000..f88486ff20516c19fcebdab3718fc829591215fb --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/layers/relative_multi_head_attention.py @@ -0,0 +1,147 @@ +from typing import Optional + +from ray.rllib.utils.framework import try_import_tf +from ray.rllib.utils.typing import TensorType +from ray.rllib.utils.deprecation import deprecation_warning +from ray.util import log_once + +tf1, tf, tfv = try_import_tf() + + +class RelativeMultiHeadAttention(tf.keras.layers.Layer if tf else object): + """A RelativeMultiHeadAttention layer as described in [3]. + + Uses segment level recurrence with state reuse. + """ + + def __init__( + self, + out_dim: int, + num_heads: int, + head_dim: int, + input_layernorm: bool = False, + output_activation: Optional["tf.nn.activation"] = None, + **kwargs + ): + """Initializes a RelativeMultiHeadAttention keras Layer object. + + Args: + out_dim: The output dimensions of the multi-head attention + unit. + num_heads: The number of attention heads to use. + Denoted `H` in [2]. + head_dim: The dimension of a single(!) attention head within + a multi-head attention unit. Denoted as `d` in [3]. + input_layernorm: Whether to prepend a LayerNorm before + everything else. Should be True for building a GTrXL. + output_activation (Optional[tf.nn.activation]): Optional tf.nn + activation function. Should be relu for GTrXL. + **kwargs: + """ + if log_once("relative_multi_head_attention"): + deprecation_warning( + old="rllib.models.tf.layers.RelativeMultiHeadAttention", + ) + super().__init__(**kwargs) + + # No bias or non-linearity. + self._num_heads = num_heads + self._head_dim = head_dim + # 3=Query, key, and value inputs. 
+ self._qkv_layer = tf.keras.layers.Dense( + 3 * num_heads * head_dim, use_bias=False + ) + self._linear_layer = tf.keras.layers.TimeDistributed( + tf.keras.layers.Dense(out_dim, use_bias=False, activation=output_activation) + ) + + self._uvar = self.add_weight(shape=(num_heads, head_dim)) + self._vvar = self.add_weight(shape=(num_heads, head_dim)) + + # Constant (non-trainable) sinusoid rel pos encoding matrix, which + # depends on this incoming time dimension. + # For inference, we prepend the memory to the current timestep's + # input: Tau + 1. For training, we prepend the memory to the input + # sequence: Tau + T. + self._pos_embedding = PositionalEmbedding(out_dim) + self._pos_proj = tf.keras.layers.Dense(num_heads * head_dim, use_bias=False) + + self._input_layernorm = None + if input_layernorm: + self._input_layernorm = tf.keras.layers.LayerNormalization(axis=-1) + + def call( + self, inputs: TensorType, memory: Optional[TensorType] = None + ) -> TensorType: + T = tf.shape(inputs)[1] # length of segment (time) + H = self._num_heads # number of attention heads + d = self._head_dim # attention head dimension + + # Add previous memory chunk (as const, w/o gradient) to input. + # Tau (number of (prev) time slices in each memory chunk). + Tau = tf.shape(memory)[1] + inputs = tf.concat([tf.stop_gradient(memory), inputs], axis=1) + + # Apply the Layer-Norm. + if self._input_layernorm is not None: + inputs = self._input_layernorm(inputs) + + qkv = self._qkv_layer(inputs) + + queries, keys, values = tf.split(qkv, 3, -1) + # Cut out memory timesteps from query. + queries = queries[:, -T:] + + # Splitting up queries into per-head dims (d). 
+ queries = tf.reshape(queries, [-1, T, H, d]) + keys = tf.reshape(keys, [-1, Tau + T, H, d]) + values = tf.reshape(values, [-1, Tau + T, H, d]) + + R = self._pos_embedding(Tau + T) + R = self._pos_proj(R) + R = tf.reshape(R, [Tau + T, H, d]) + + # b=batch + # i and j=time indices (i=max-timesteps (inputs); j=Tau memory space) + # h=head + # d=head-dim (over which we will reduce-sum) + score = tf.einsum("bihd,bjhd->bijh", queries + self._uvar, keys) + pos_score = tf.einsum("bihd,jhd->bijh", queries + self._vvar, R) + score = score + self.rel_shift(pos_score) + score = score / d**0.5 + + # Causal mask of the same length as the sequence. + mask = tf.sequence_mask(tf.range(Tau + 1, Tau + T + 1), dtype=score.dtype) + mask = mask[None, :, :, None] + + masked_score = score * mask + 1e30 * (mask - 1.0) + wmat = tf.nn.softmax(masked_score, axis=2) + + out = tf.einsum("bijh,bjhd->bihd", wmat, values) + out = tf.reshape(out, tf.concat((tf.shape(out)[:2], [H * d]), axis=0)) + return self._linear_layer(out) + + @staticmethod + def rel_shift(x: TensorType) -> TensorType: + # Transposed version of the shift approach described in [3]. 
+ # https://github.com/kimiyoung/transformer-xl/blob/ + # 44781ed21dbaec88b280f74d9ae2877f52b492a5/tf/model.py#L31 + x_size = tf.shape(x) + + x = tf.pad(x, [[0, 0], [0, 0], [1, 0], [0, 0]]) + x = tf.reshape(x, [x_size[0], x_size[2] + 1, x_size[1], x_size[3]]) + x = x[:, 1:, :, :] + x = tf.reshape(x, x_size) + + return x + + +class PositionalEmbedding(tf.keras.layers.Layer if tf else object): + def __init__(self, out_dim, **kwargs): + super().__init__(**kwargs) + self.inverse_freq = 1 / (10000 ** (tf.range(0, out_dim, 2.0) / out_dim)) + + def call(self, seq_length): + pos_offsets = tf.cast(tf.range(seq_length - 1, -1, -1), tf.float32) + inputs = pos_offsets[:, None] * self.inverse_freq[None, :] + return tf.concat((tf.sin(inputs), tf.cos(inputs)), axis=-1) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/layers/skip_connection.py b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/layers/skip_connection.py new file mode 100644 index 0000000000000000000000000000000000000000..3ee1751caf36e4a760da3c60c08fe279400dcb12 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/layers/skip_connection.py @@ -0,0 +1,46 @@ +from typing import Optional, Any + +from ray.rllib.utils.framework import try_import_tf +from ray.rllib.utils.typing import TensorType +from ray.rllib.utils.deprecation import deprecation_warning +from ray.util import log_once + +tf1, tf, tfv = try_import_tf() + + +class SkipConnection(tf.keras.layers.Layer if tf else object): + """Skip connection layer. + + Adds the original input to the output (regular residual layer) OR uses + input as hidden state input to a given fan_in_layer. + """ + + def __init__(self, layer: Any, fan_in_layer: Optional[Any] = None, **kwargs): + """Initializes a SkipConnection keras layer object. + + Args: + layer (tf.keras.layers.Layer): Any layer processing inputs. 
+ fan_in_layer (Optional[tf.keras.layers.Layer]): An optional + layer taking two inputs: The original input and the output + of `layer`. + """ + if log_once("skip_connection"): + deprecation_warning( + old="rllib.models.tf.layers.SkipConnection", + ) + super().__init__(**kwargs) + self._layer = layer + self._fan_in_layer = fan_in_layer + + def call(self, inputs: TensorType, **kwargs) -> TensorType: + # del kwargs + outputs = self._layer(inputs, **kwargs) + # Residual case, just add inputs to outputs. + if self._fan_in_layer is None: + outputs = outputs + inputs + # Fan-in e.g. RNN: Call fan-in with `inputs` and `outputs`. + else: + # NOTE: In the GRU case, `inputs` is the state input. + outputs = self._fan_in_layer((inputs, outputs)) + + return outputs diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..abbe5ef604646d8de477335e18c0bc4c88363b2c --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/__init__.py @@ -0,0 +1,12 @@ +# from ray.rllib.models.torch.torch_modelv2 import TorchModelV2 +# from ray.rllib.models.torch.fcnet import FullyConnectedNetwork +# from ray.rllib.models.torch.recurrent_net import \ +# RecurrentNetwork +# from ray.rllib.models.torch.visionnet import VisionNetwork + +# __all__ = [ +# "FullyConnectedNetwork", +# "RecurrentNetwork", +# "TorchModelV2", +# "VisionNetwork", +# ] diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/__pycache__/misc.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/__pycache__/misc.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..daf29d6b7801d9d2f81fcd176f71fd815c7f5174 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/__pycache__/misc.cpython-311.pyc differ diff --git 
a/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/__pycache__/visionnet.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/__pycache__/visionnet.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f771135e2383b78690ea04c835cceaa79d8e0061 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/__pycache__/visionnet.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/attention_net.py b/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/attention_net.py new file mode 100644 index 0000000000000000000000000000000000000000..2382a4da1381a7dad748155498578009b13170e0 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/attention_net.py @@ -0,0 +1,457 @@ +""" +[1] - Attention Is All You Need - Vaswani, Jones, Shazeer, Parmar, + Uszkoreit, Gomez, Kaiser - Google Brain/Research, U Toronto - 2017. + https://arxiv.org/pdf/1706.03762.pdf +[2] - Stabilizing Transformers for Reinforcement Learning - E. Parisotto + et al. - DeepMind - 2019. https://arxiv.org/pdf/1910.06764.pdf +[3] - Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context. + Z. Dai, Z. Yang, et al. - Carnegie Mellon U - 2019. 
+ https://www.aclweb.org/anthology/P19-1285.pdf +""" +import gymnasium as gym +from gymnasium.spaces import Box, Discrete, MultiDiscrete +import numpy as np +import tree # pip install dm_tree +from typing import Dict, Optional, Union + +from ray.rllib.models.modelv2 import ModelV2 +from ray.rllib.models.torch.misc import SlimFC +from ray.rllib.models.torch.modules import ( + GRUGate, + RelativeMultiHeadAttention, + SkipConnection, +) +from ray.rllib.models.torch.recurrent_net import RecurrentNetwork +from ray.rllib.models.torch.torch_modelv2 import TorchModelV2 +from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.policy.view_requirement import ViewRequirement +from ray.rllib.utils.annotations import OldAPIStack, override +from ray.rllib.utils.framework import try_import_torch +from ray.rllib.utils.spaces.space_utils import get_base_struct_from_space +from ray.rllib.utils.torch_utils import flatten_inputs_to_1d_tensor, one_hot +from ray.rllib.utils.typing import ModelConfigDict, TensorType, List +from ray.rllib.utils.deprecation import deprecation_warning +from ray.util import log_once + +torch, nn = try_import_torch() + + +@OldAPIStack +class GTrXLNet(RecurrentNetwork, nn.Module): + """A GTrXL net Model described in [2]. + + This is still in an experimental phase. + Can be used as a drop-in replacement for LSTMs in PPO and IMPALA. + + To use this network as a replacement for an RNN, configure your Algorithm + as follows: + + Examples: + >> config["model"]["custom_model"] = GTrXLNet + >> config["model"]["max_seq_len"] = 10 + >> config["model"]["custom_model_config"] = { + >> num_transformer_units=1, + >> attention_dim=32, + >> num_heads=2, + >> memory_tau=50, + >> etc.. 
+ >> } + """ + + def __init__( + self, + observation_space: gym.spaces.Space, + action_space: gym.spaces.Space, + num_outputs: Optional[int], + model_config: ModelConfigDict, + name: str, + *, + num_transformer_units: int = 1, + attention_dim: int = 64, + num_heads: int = 2, + memory_inference: int = 50, + memory_training: int = 50, + head_dim: int = 32, + position_wise_mlp_dim: int = 32, + init_gru_gate_bias: float = 2.0 + ): + """Initializes a GTrXLNet. + + Args: + num_transformer_units: The number of Transformer repeats to + use (denoted L in [2]). + attention_dim: The input and output dimensions of one + Transformer unit. + num_heads: The number of attention heads to use in parallel. + Denoted as `H` in [3]. + memory_inference: The number of timesteps to concat (time + axis) and feed into the next transformer unit as inference + input. The first transformer unit will receive this number of + past observations (plus the current one), instead. + memory_training: The number of timesteps to concat (time + axis) and feed into the next transformer unit as training + input (plus the actual input sequence of len=max_seq_len). + The first transformer unit will receive this number of + past observations (plus the input sequence), instead. + head_dim: The dimension of a single(!) attention head within + a multi-head attention unit. Denoted as `d` in [3]. + position_wise_mlp_dim: The dimension of the hidden layer + within the position-wise MLP (after the multi-head attention + block within one Transformer unit). This is the size of the + first of the two layers within the PositionwiseFeedforward. The + second layer always has size=`attention_dim`. + init_gru_gate_bias: Initial bias values for the GRU gates + (two GRUs per Transformer unit, one after the MHA, one after + the position-wise MLP). 
+ """ + super().__init__( + observation_space, action_space, num_outputs, model_config, name + ) + + nn.Module.__init__(self) + + self.num_transformer_units = num_transformer_units + self.attention_dim = attention_dim + self.num_heads = num_heads + self.memory_inference = memory_inference + self.memory_training = memory_training + self.head_dim = head_dim + self.max_seq_len = model_config["max_seq_len"] + self.obs_dim = observation_space.shape[0] + + self.linear_layer = SlimFC(in_size=self.obs_dim, out_size=self.attention_dim) + + self.layers = [self.linear_layer] + + attention_layers = [] + # 2) Create L Transformer blocks according to [2]. + for i in range(self.num_transformer_units): + # RelativeMultiHeadAttention part. + MHA_layer = SkipConnection( + RelativeMultiHeadAttention( + in_dim=self.attention_dim, + out_dim=self.attention_dim, + num_heads=num_heads, + head_dim=head_dim, + input_layernorm=True, + output_activation=nn.ReLU, + ), + fan_in_layer=GRUGate(self.attention_dim, init_gru_gate_bias), + ) + + # Position-wise MultiLayerPerceptron part. + E_layer = SkipConnection( + nn.Sequential( + torch.nn.LayerNorm(self.attention_dim), + SlimFC( + in_size=self.attention_dim, + out_size=position_wise_mlp_dim, + use_bias=False, + activation_fn=nn.ReLU, + ), + SlimFC( + in_size=position_wise_mlp_dim, + out_size=self.attention_dim, + use_bias=False, + activation_fn=nn.ReLU, + ), + ), + fan_in_layer=GRUGate(self.attention_dim, init_gru_gate_bias), + ) + + # Build a list of all attanlayers in order. + attention_layers.extend([MHA_layer, E_layer]) + + # Create a Sequential such that all parameters inside the attention + # layers are automatically registered with this top-level model. + self.attention_layers = nn.Sequential(*attention_layers) + self.layers.extend(attention_layers) + + # Final layers if num_outputs not None. + self.logits = None + self.values_out = None + # Last value output. + self._value_out = None + # Postprocess GTrXL output with another hidden layer. 
+ if self.num_outputs is not None: + self.logits = SlimFC( + in_size=self.attention_dim, + out_size=self.num_outputs, + activation_fn=nn.ReLU, + ) + + # Value function used by all RLlib Torch RL implementations. + self.values_out = SlimFC( + in_size=self.attention_dim, out_size=1, activation_fn=None + ) + else: + self.num_outputs = self.attention_dim + + # Setup trajectory views (`memory-inference` x past memory outs). + for i in range(self.num_transformer_units): + space = Box(-1.0, 1.0, shape=(self.attention_dim,)) + self.view_requirements["state_in_{}".format(i)] = ViewRequirement( + "state_out_{}".format(i), + shift="-{}:-1".format(self.memory_inference), + # Repeat the incoming state every max-seq-len times. + batch_repeat_value=self.max_seq_len, + space=space, + ) + self.view_requirements["state_out_{}".format(i)] = ViewRequirement( + space=space, used_for_training=False + ) + + @override(ModelV2) + def forward( + self, input_dict, state: List[TensorType], seq_lens: TensorType + ) -> (TensorType, List[TensorType]): + assert seq_lens is not None + + # Add the needed batch rank (tf Models' Input requires this). + observations = input_dict[SampleBatch.OBS] + # Add the time dim to observations. + B = len(seq_lens) + T = observations.shape[0] // B + observations = torch.reshape( + observations, [-1, T] + list(observations.shape[1:]) + ) + + all_out = observations + memory_outs = [] + for i in range(len(self.layers)): + # MHA layers which need memory passed in. + if i % 2 == 1: + all_out = self.layers[i](all_out, memory=state[i // 2]) + # Either self.linear_layer (initial obs -> attn. dim layer) or + # MultiLayerPerceptrons. The output of these layers is always the + # memory for the next forward pass. + else: + all_out = self.layers[i](all_out) + memory_outs.append(all_out) + + # Discard last output (not needed as a memory since it's the last + # layer). 
+ memory_outs = memory_outs[:-1] + + if self.logits is not None: + out = self.logits(all_out) + self._value_out = self.values_out(all_out) + out_dim = self.num_outputs + else: + out = all_out + out_dim = self.attention_dim + + return torch.reshape(out, [-1, out_dim]), [ + torch.reshape(m, [-1, self.attention_dim]) for m in memory_outs + ] + + # TODO: (sven) Deprecate this once trajectory view API has fully matured. + @override(RecurrentNetwork) + def get_initial_state(self) -> List[np.ndarray]: + return [] + + @override(ModelV2) + def value_function(self) -> TensorType: + assert ( + self._value_out is not None + ), "Must call forward first AND must have value branch!" + return torch.reshape(self._value_out, [-1]) + + +class AttentionWrapper(TorchModelV2, nn.Module): + """GTrXL wrapper serving as interface for ModelV2s that set use_attention.""" + + def __init__( + self, + obs_space: gym.spaces.Space, + action_space: gym.spaces.Space, + num_outputs: int, + model_config: ModelConfigDict, + name: str, + ): + if log_once("deprecate_attention_wrapper_torch"): + deprecation_warning( + old="ray.rllib.models.torch.attention_net.AttentionWrapper" + ) + + nn.Module.__init__(self) + super().__init__(obs_space, action_space, None, model_config, name) + + self.use_n_prev_actions = model_config["attention_use_n_prev_actions"] + self.use_n_prev_rewards = model_config["attention_use_n_prev_rewards"] + + self.action_space_struct = get_base_struct_from_space(self.action_space) + self.action_dim = 0 + + for space in tree.flatten(self.action_space_struct): + if isinstance(space, Discrete): + self.action_dim += space.n + elif isinstance(space, MultiDiscrete): + self.action_dim += np.sum(space.nvec) + elif space.shape is not None: + self.action_dim += int(np.prod(space.shape)) + else: + self.action_dim += int(len(space)) + + # Add prev-action/reward nodes to input to LSTM. 
+ if self.use_n_prev_actions: + self.num_outputs += self.use_n_prev_actions * self.action_dim + if self.use_n_prev_rewards: + self.num_outputs += self.use_n_prev_rewards + + cfg = model_config + + self.attention_dim = cfg["attention_dim"] + + if self.num_outputs is not None: + in_space = gym.spaces.Box( + float("-inf"), float("inf"), shape=(self.num_outputs,), dtype=np.float32 + ) + else: + in_space = obs_space + + # Construct GTrXL sub-module w/ num_outputs=None (so it does not + # create a logits/value output; we'll do this ourselves in this wrapper + # here). + self.gtrxl = GTrXLNet( + in_space, + action_space, + None, + model_config, + "gtrxl", + num_transformer_units=cfg["attention_num_transformer_units"], + attention_dim=self.attention_dim, + num_heads=cfg["attention_num_heads"], + head_dim=cfg["attention_head_dim"], + memory_inference=cfg["attention_memory_inference"], + memory_training=cfg["attention_memory_training"], + position_wise_mlp_dim=cfg["attention_position_wise_mlp_dim"], + init_gru_gate_bias=cfg["attention_init_gru_gate_bias"], + ) + + # Set final num_outputs to correct value (depending on action space). + self.num_outputs = num_outputs + + # Postprocess GTrXL output with another hidden layer and compute + # values. + self._logits_branch = SlimFC( + in_size=self.attention_dim, + out_size=self.num_outputs, + activation_fn=None, + initializer=torch.nn.init.xavier_uniform_, + ) + self._value_branch = SlimFC( + in_size=self.attention_dim, + out_size=1, + activation_fn=None, + initializer=torch.nn.init.xavier_uniform_, + ) + + self.view_requirements = self.gtrxl.view_requirements + self.view_requirements["obs"].space = self.obs_space + + # Add prev-a/r to this model's view, if required. 
+ if self.use_n_prev_actions: + self.view_requirements[SampleBatch.PREV_ACTIONS] = ViewRequirement( + SampleBatch.ACTIONS, + space=self.action_space, + shift="-{}:-1".format(self.use_n_prev_actions), + ) + if self.use_n_prev_rewards: + self.view_requirements[SampleBatch.PREV_REWARDS] = ViewRequirement( + SampleBatch.REWARDS, shift="-{}:-1".format(self.use_n_prev_rewards) + ) + + @override(RecurrentNetwork) + def forward( + self, + input_dict: Dict[str, TensorType], + state: List[TensorType], + seq_lens: TensorType, + ) -> (TensorType, List[TensorType]): + assert seq_lens is not None + # Push obs through "unwrapped" net's `forward()` first. + wrapped_out, _ = self._wrapped_forward(input_dict, [], None) + + # Concat. prev-action/reward if required. + prev_a_r = [] + + # Prev actions. + if self.use_n_prev_actions: + prev_n_actions = input_dict[SampleBatch.PREV_ACTIONS] + # If actions are not processed yet (in their original form as + # have been sent to environment): + # Flatten/one-hot into 1D array. + if self.model_config["_disable_action_flattening"]: + # Merge prev n actions into flat tensor. + flat = flatten_inputs_to_1d_tensor( + prev_n_actions, + spaces_struct=self.action_space_struct, + time_axis=True, + ) + # Fold time-axis into flattened data. + flat = torch.reshape(flat, [flat.shape[0], -1]) + prev_a_r.append(flat) + # If actions are already flattened (but not one-hot'd yet!), + # one-hot discrete/multi-discrete actions here and concatenate the + # n most recent actions together. 
+ else: + if isinstance(self.action_space, Discrete): + for i in range(self.use_n_prev_actions): + prev_a_r.append( + one_hot( + prev_n_actions[:, i].float(), space=self.action_space + ) + ) + elif isinstance(self.action_space, MultiDiscrete): + for i in range( + 0, self.use_n_prev_actions, self.action_space.shape[0] + ): + prev_a_r.append( + one_hot( + prev_n_actions[ + :, i : i + self.action_space.shape[0] + ].float(), + space=self.action_space, + ) + ) + else: + prev_a_r.append( + torch.reshape( + prev_n_actions.float(), + [-1, self.use_n_prev_actions * self.action_dim], + ) + ) + # Prev rewards. + if self.use_n_prev_rewards: + prev_a_r.append( + torch.reshape( + input_dict[SampleBatch.PREV_REWARDS].float(), + [-1, self.use_n_prev_rewards], + ) + ) + + # Concat prev. actions + rewards to the "main" input. + if prev_a_r: + wrapped_out = torch.cat([wrapped_out] + prev_a_r, dim=1) + + # Then through our GTrXL. + input_dict["obs_flat"] = input_dict["obs"] = wrapped_out + + self._features, memory_outs = self.gtrxl(input_dict, state, seq_lens) + model_out = self._logits_branch(self._features) + return model_out, memory_outs + + @override(ModelV2) + def get_initial_state(self) -> Union[List[np.ndarray], List[TensorType]]: + return [ + torch.zeros( + self.gtrxl.view_requirements["state_in_{}".format(i)].space.shape + ) + for i in range(self.gtrxl.num_transformer_units) + ] + + @override(ModelV2) + def value_function(self) -> TensorType: + assert self._features is not None, "Must call forward() first!" 
+ return torch.reshape(self._value_branch(self._features), [-1]) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/complex_input_net.py b/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/complex_input_net.py new file mode 100644 index 0000000000000000000000000000000000000000..c5c81dba790c4f97f2c6ab5d2b765af0691005c8 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/complex_input_net.py @@ -0,0 +1,237 @@ +from gymnasium.spaces import Box, Discrete, MultiDiscrete +import numpy as np +import tree # pip install dm_tree + +from ray.rllib.models.torch.misc import ( + normc_initializer as torch_normc_initializer, + SlimFC, +) +from ray.rllib.models.catalog import ModelCatalog +from ray.rllib.models.modelv2 import ModelV2, restore_original_dimensions +from ray.rllib.models.torch.torch_modelv2 import TorchModelV2 +from ray.rllib.models.utils import get_filter_config +from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.utils.annotations import OldAPIStack, override +from ray.rllib.utils.framework import try_import_torch +from ray.rllib.utils.spaces.space_utils import flatten_space +from ray.rllib.utils.torch_utils import one_hot + +torch, nn = try_import_torch() + + +@OldAPIStack +class ComplexInputNetwork(TorchModelV2, nn.Module): + """TorchModelV2 concat'ing CNN outputs to flat input(s), followed by FC(s). + + Note: This model should be used for complex (Dict or Tuple) observation + spaces that have one or more image components. + + The data flow is as follows: + + `obs` (e.g. Tuple[img0, img1, discrete0]) -> `CNN0 + CNN1 + ONE-HOT` + `CNN0 + CNN1 + ONE-HOT` -> concat all flat outputs -> `out` + `out` -> (optional) FC-stack -> `out2` + `out2` -> action (logits) and value heads. 
+ """ + + def __init__(self, obs_space, action_space, num_outputs, model_config, name): + self.original_space = ( + obs_space.original_space + if hasattr(obs_space, "original_space") + else obs_space + ) + + self.processed_obs_space = ( + self.original_space + if model_config.get("_disable_preprocessor_api") + else obs_space + ) + + nn.Module.__init__(self) + TorchModelV2.__init__( + self, self.original_space, action_space, num_outputs, model_config, name + ) + + self.flattened_input_space = flatten_space(self.original_space) + + # Atari type CNNs or IMPALA type CNNs (with residual layers)? + # self.cnn_type = self.model_config["custom_model_config"].get( + # "conv_type", "atari") + + # Build the CNN(s) given obs_space's image components. + self.cnns = nn.ModuleDict() + self.one_hot = nn.ModuleDict() + self.flatten_dims = {} + self.flatten = nn.ModuleDict() + concat_size = 0 + for i, component in enumerate(self.flattened_input_space): + i = str(i) + # Image space. + if len(component.shape) == 3 and isinstance(component, Box): + config = { + "conv_filters": model_config["conv_filters"] + if "conv_filters" in model_config + else get_filter_config(component.shape), + "conv_activation": model_config.get("conv_activation"), + "post_fcnet_hiddens": [], + } + # if self.cnn_type == "atari": + self.cnns[i] = ModelCatalog.get_model_v2( + component, + action_space, + num_outputs=None, + model_config=config, + framework="torch", + name="cnn_{}".format(i), + ) + # TODO (sven): add IMPALA-style option. + # else: + # cnn = TorchImpalaVisionNet( + # component, + # action_space, + # num_outputs=None, + # model_config=config, + # name="cnn_{}".format(i)) + + concat_size += self.cnns[i].num_outputs + self.add_module("cnn_{}".format(i), self.cnns[i]) + # Discrete|MultiDiscrete inputs -> One-hot encode. 
+ elif isinstance(component, (Discrete, MultiDiscrete)): + if isinstance(component, Discrete): + size = component.n + else: + size = np.sum(component.nvec) + config = { + "fcnet_hiddens": model_config["fcnet_hiddens"], + "fcnet_activation": model_config.get("fcnet_activation"), + "post_fcnet_hiddens": [], + } + self.one_hot[i] = ModelCatalog.get_model_v2( + Box(-1.0, 1.0, (size,), np.float32), + action_space, + num_outputs=None, + model_config=config, + framework="torch", + name="one_hot_{}".format(i), + ) + concat_size += self.one_hot[i].num_outputs + self.add_module("one_hot_{}".format(i), self.one_hot[i]) + # Everything else (1D Box). + else: + size = int(np.prod(component.shape)) + config = { + "fcnet_hiddens": model_config["fcnet_hiddens"], + "fcnet_activation": model_config.get("fcnet_activation"), + "post_fcnet_hiddens": [], + } + self.flatten[i] = ModelCatalog.get_model_v2( + Box(-1.0, 1.0, (size,), np.float32), + action_space, + num_outputs=None, + model_config=config, + framework="torch", + name="flatten_{}".format(i), + ) + self.flatten_dims[i] = size + concat_size += self.flatten[i].num_outputs + self.add_module("flatten_{}".format(i), self.flatten[i]) + + # Optional post-concat FC-stack. + post_fc_stack_config = { + "fcnet_hiddens": model_config.get("post_fcnet_hiddens", []), + "fcnet_activation": model_config.get("post_fcnet_activation", "relu"), + } + self.post_fc_stack = ModelCatalog.get_model_v2( + Box(float("-inf"), float("inf"), shape=(concat_size,), dtype=np.float32), + self.action_space, + None, + post_fc_stack_config, + framework="torch", + name="post_fc_stack", + ) + + # Actions and value heads. + self.logits_layer = None + self.value_layer = None + self._value_out = None + + if num_outputs: + # Action-distribution head. + self.logits_layer = SlimFC( + in_size=self.post_fc_stack.num_outputs, + out_size=num_outputs, + activation_fn=None, + initializer=torch_normc_initializer(0.01), + ) + # Create the value branch model. 
+ self.value_layer = SlimFC( + in_size=self.post_fc_stack.num_outputs, + out_size=1, + activation_fn=None, + initializer=torch_normc_initializer(0.01), + ) + else: + self.num_outputs = concat_size + + @override(ModelV2) + def forward(self, input_dict, state, seq_lens): + if SampleBatch.OBS in input_dict and "obs_flat" in input_dict: + orig_obs = input_dict[SampleBatch.OBS] + else: + orig_obs = restore_original_dimensions( + input_dict[SampleBatch.OBS], self.processed_obs_space, tensorlib="torch" + ) + # Push observations through the different components + # (CNNs, one-hot + FC, etc..). + outs = [] + for i, component in enumerate(tree.flatten(orig_obs)): + i = str(i) + if i in self.cnns: + cnn_out, _ = self.cnns[i](SampleBatch({SampleBatch.OBS: component})) + outs.append(cnn_out) + elif i in self.one_hot: + if component.dtype in [ + torch.int8, + torch.int16, + torch.int32, + torch.int64, + torch.uint8, + ]: + one_hot_in = { + SampleBatch.OBS: one_hot( + component, self.flattened_input_space[int(i)] + ) + } + else: + one_hot_in = {SampleBatch.OBS: component} + one_hot_out, _ = self.one_hot[i](SampleBatch(one_hot_in)) + outs.append(one_hot_out) + else: + nn_out, _ = self.flatten[i]( + SampleBatch( + { + SampleBatch.OBS: torch.reshape( + component, [-1, self.flatten_dims[i]] + ) + } + ) + ) + outs.append(nn_out) + + # Concat all outputs and the non-image inputs. + out = torch.cat(outs, dim=1) + # Push through (optional) FC-stack (this may be an empty stack). + out, _ = self.post_fc_stack(SampleBatch({SampleBatch.OBS: out})) + + # No logits/value branches. + if self.logits_layer is None: + return out, [] + + # Logits- and value branches. 
+ logits, values = self.logits_layer(out), self.value_layer(out) + self._value_out = torch.reshape(values, [-1]) + return logits, [] + + @override(ModelV2) + def value_function(self): + return self._value_out diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/fcnet.py b/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/fcnet.py new file mode 100644 index 0000000000000000000000000000000000000000..2ba907a54ed09b207c62a161ab54a27f89a281f4 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/fcnet.py @@ -0,0 +1,160 @@ +import logging +import numpy as np +import gymnasium as gym + +from ray.rllib.models.torch.torch_modelv2 import TorchModelV2 +from ray.rllib.models.torch.misc import SlimFC, AppendBiasLayer, normc_initializer +from ray.rllib.utils.annotations import OldAPIStack, override +from ray.rllib.utils.framework import try_import_torch +from ray.rllib.utils.typing import Dict, TensorType, List, ModelConfigDict + +torch, nn = try_import_torch() + +logger = logging.getLogger(__name__) + + +@OldAPIStack +class FullyConnectedNetwork(TorchModelV2, nn.Module): + """Generic fully connected network.""" + + def __init__( + self, + obs_space: gym.spaces.Space, + action_space: gym.spaces.Space, + num_outputs: int, + model_config: ModelConfigDict, + name: str, + ): + TorchModelV2.__init__( + self, obs_space, action_space, num_outputs, model_config, name + ) + nn.Module.__init__(self) + + hiddens = list(model_config.get("fcnet_hiddens", [])) + list( + model_config.get("post_fcnet_hiddens", []) + ) + activation = model_config.get("fcnet_activation") + if not model_config.get("fcnet_hiddens", []): + activation = model_config.get("post_fcnet_activation") + no_final_linear = model_config.get("no_final_linear") + self.vf_share_layers = model_config.get("vf_share_layers") + self.free_log_std = model_config.get("free_log_std") + # Generate free-floating bias variables for the second half of + # the outputs. 
+ if self.free_log_std: + assert num_outputs % 2 == 0, ( + "num_outputs must be divisible by two", + num_outputs, + ) + num_outputs = num_outputs // 2 + + layers = [] + prev_layer_size = int(np.prod(obs_space.shape)) + self._logits = None + + # Create layers 0 to second-last. + for size in hiddens[:-1]: + layers.append( + SlimFC( + in_size=prev_layer_size, + out_size=size, + initializer=normc_initializer(1.0), + activation_fn=activation, + ) + ) + prev_layer_size = size + + # The last layer is adjusted to be of size num_outputs, but it's a + # layer with activation. + if no_final_linear and num_outputs: + layers.append( + SlimFC( + in_size=prev_layer_size, + out_size=num_outputs, + initializer=normc_initializer(1.0), + activation_fn=activation, + ) + ) + prev_layer_size = num_outputs + # Finish the layers with the provided sizes (`hiddens`), plus - + # iff num_outputs > 0 - a last linear layer of size num_outputs. + else: + if len(hiddens) > 0: + layers.append( + SlimFC( + in_size=prev_layer_size, + out_size=hiddens[-1], + initializer=normc_initializer(1.0), + activation_fn=activation, + ) + ) + prev_layer_size = hiddens[-1] + if num_outputs: + self._logits = SlimFC( + in_size=prev_layer_size, + out_size=num_outputs, + initializer=normc_initializer(0.01), + activation_fn=None, + ) + else: + self.num_outputs = ([int(np.prod(obs_space.shape))] + hiddens[-1:])[-1] + + # Layer to add the log std vars to the state-dependent means. + if self.free_log_std and self._logits: + self._append_free_log_std = AppendBiasLayer(num_outputs) + + self._hidden_layers = nn.Sequential(*layers) + + self._value_branch_separate = None + if not self.vf_share_layers: + # Build a parallel set of hidden layers for the value net. 
+ prev_vf_layer_size = int(np.prod(obs_space.shape)) + vf_layers = [] + for size in hiddens: + vf_layers.append( + SlimFC( + in_size=prev_vf_layer_size, + out_size=size, + activation_fn=activation, + initializer=normc_initializer(1.0), + ) + ) + prev_vf_layer_size = size + self._value_branch_separate = nn.Sequential(*vf_layers) + + self._value_branch = SlimFC( + in_size=prev_layer_size, + out_size=1, + initializer=normc_initializer(0.01), + activation_fn=None, + ) + # Holds the current "base" output (before logits layer). + self._features = None + # Holds the last input, in case value branch is separate. + self._last_flat_in = None + + @override(TorchModelV2) + def forward( + self, + input_dict: Dict[str, TensorType], + state: List[TensorType], + seq_lens: TensorType, + ) -> (TensorType, List[TensorType]): + obs = input_dict["obs_flat"].float() + self._last_flat_in = obs.reshape(obs.shape[0], -1) + self._features = self._hidden_layers(self._last_flat_in) + logits = self._logits(self._features) if self._logits else self._features + if self.free_log_std: + logits = self._append_free_log_std(logits) + return logits, state + + @override(TorchModelV2) + def value_function(self) -> TensorType: + assert self._features is not None, "must call forward() first" + if self._value_branch_separate: + out = self._value_branch( + self._value_branch_separate(self._last_flat_in) + ).squeeze(1) + else: + out = self._value_branch(self._features).squeeze(1) + return out diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/mingpt.py b/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/mingpt.py new file mode 100644 index 0000000000000000000000000000000000000000..4bf54aa2fe8e5c4a88c1dc9ba7ca700f9659405b --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/mingpt.py @@ -0,0 +1,303 @@ +# LICENSE: MIT +""" +Adapted from https://github.com/karpathy/minGPT + +Full definition of a GPT Language Model, all of it in this single file. 
+References: +1) the official GPT-2 TensorFlow implementation released by OpenAI: +https://github.com/openai/gpt-2/blob/master/src/model.py +2) huggingface/transformers PyTorch implementation: +https://github.com/huggingface/transformers/blob/main/src/transformers + /models/gpt2/modeling_gpt2.py +""" + +import math +from dataclasses import dataclass +from typing import Tuple + +import torch +import torch.nn as nn +from torch.nn import functional as F + +from ray.rllib.utils.annotations import DeveloperAPI +from ray.rllib.utils.deprecation import Deprecated + + +@DeveloperAPI +@dataclass +class GPTConfig: + # block size must be provided + block_size: int + + # transformer config + n_layer: int = 12 + n_head: int = 12 + n_embed: int = 768 + + # dropout config + embed_pdrop: float = 0.1 + resid_pdrop: float = 0.1 + attn_pdrop: float = 0.1 + + +@Deprecated(error=False) +class NewGELU(nn.Module): + """ + Implementation of the GELU activation function currently in Google BERT + repo (identical to OpenAI GPT). + Reference: Gaussian Error Linear Units (GELU) paper: + https://arxiv.org/abs/1606.08415 + """ + + def forward(self, x): + return ( + 0.5 + * x + * ( + 1.0 + + torch.tanh( + math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0)) + ) + ) + ) + + +@Deprecated(error=False) +class CausalSelfAttention(nn.Module): + """ + Vanilla multi-head masked self-attention layer with a projection at the end. + It is possible to use torch.nn.MultiheadAttention here but I am including an + explicit implementation here to show that there is nothing too scary here. 
+ """ + + def __init__(self, config: GPTConfig): + super().__init__() + assert config.n_embed % config.n_head == 0 + # key, query, value projections for all heads, but in a batch + self.c_attn = nn.Linear(config.n_embed, 3 * config.n_embed) + # output projection + self.c_proj = nn.Linear(config.n_embed, config.n_embed) + # regularization + self.attn_dropout = nn.Dropout(config.attn_pdrop) + self.resid_dropout = nn.Dropout(config.resid_pdrop) + # causal mask to ensure that attention is only applied to the left + # in the input sequence + self.register_buffer( + "bias", + torch.tril(torch.ones(config.block_size, config.block_size)).view( + 1, 1, config.block_size, config.block_size + ), + ) + self.n_head = config.n_head + self.n_embed = config.n_embed + + def forward(self, x, attention_masks=None): + # batch size, sequence length, embedding dimensionality (n_embed) + B, T, C = x.size() + + # calculate query, key, values for all heads in batch and move head + # forward to be the batch dim + q, k, v = self.c_attn(x).split(self.n_embed, dim=2) + # (B, nh, T, hs) + k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) + # (B, nh, T, hs) + q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) + # (B, nh, T, hs) + v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) + + # causal self-attention; Self-attend: + # (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T) + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float("-inf")) + if attention_masks is not None: + att = att + attention_masks + att = F.softmax(att, dim=-1) + att = self.attn_dropout(att) + y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs) + # re-assemble all head outputs side by side + y = y.transpose(1, 2).contiguous().view(B, T, C) + + # output projection + y = self.resid_dropout(self.c_proj(y)) + return y, att + + +@Deprecated(error=False) +class Block(nn.Module): + """an unassuming Transformer 
@Deprecated(error=False)
class Block(nn.Module):
    """A standard pre-LayerNorm Transformer block."""

    def __init__(self, config: GPTConfig):
        """Builds the attention and feed-forward sub-layers.

        Args:
            config: The GPTConfig carrying n_embed and dropout rates.
        """
        super().__init__()
        self.ln_1 = nn.LayerNorm(config.n_embed)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = nn.LayerNorm(config.n_embed)
        # Position-wise feed-forward sub-layer; kept as a ModuleDict so
        # parameter names match the GPT-2 checkpoint layout.
        self.mlp = nn.ModuleDict(
            dict(
                c_fc=nn.Linear(config.n_embed, 4 * config.n_embed),
                c_proj=nn.Linear(4 * config.n_embed, config.n_embed),
                act=NewGELU(),
                dropout=nn.Dropout(config.resid_pdrop),
            )
        )

    def forward(self, x, attention_masks=None):
        """Returns (block output, attention weights of the attn sub-layer)."""
        # Multi-head attention sub-layer + residual.
        x_att, att = self.attn(self.ln_1(x), attention_masks=attention_masks)
        x = x + x_att

        # Position-wise FFN sub-layer (fc -> GELU -> fc -> dropout) + residual.
        # Fix: apply ln_2 before the FFN. It was constructed but never used,
        # deviating from the GPT-2 / minGPT pre-LayerNorm reference.
        h = self.ln_2(x)
        x_ffn = self.mlp.dropout(self.mlp.c_proj(self.mlp.act(self.mlp.c_fc(h))))
        x = x + x_ffn
        return x, att


@Deprecated(error=False)
def configure_gpt_optimizer(
    model: nn.Module,
    learning_rate: float,
    weight_decay: float,
    betas: Tuple[float, float] = (0.9, 0.95),
    **kwargs,
) -> torch.optim.Optimizer:
    """Builds an AdamW optimizer with selective weight decay.

    Separates all model parameters into two buckets: those that get weight
    decay (Linear weights) and those that don't (all biases, plus LayerNorm
    and Embedding weights), then returns an AdamW over both groups.

    Args:
        model: The model whose parameters to optimize.
        learning_rate: The AdamW learning rate.
        weight_decay: Weight-decay coefficient for the decayed group.
        betas: AdamW beta coefficients.
        **kwargs: Additional AdamW keyword arguments.

    Returns:
        The configured torch.optim.AdamW optimizer.
    """
    decay = set()
    no_decay = set()
    whitelist_w_modules = (torch.nn.Linear,)
    blacklist_w_modules = (torch.nn.LayerNorm, torch.nn.Embedding)
    for mn, m in model.named_modules():
        for pn, p in m.named_parameters():
            fpn = "%s.%s" % (mn, pn) if mn else pn  # full param name
            # named_modules/named_parameters are recursive, so each tensor is
            # visited many times - but this way we always know the parent
            # module any tensor belongs to.
            if pn.endswith("bias"):
                # All biases are excluded from weight decay.
                no_decay.add(fpn)
            elif pn.endswith("weight") and isinstance(m, whitelist_w_modules):
                # Weights of whitelisted modules get weight decay.
                decay.add(fpn)
            elif pn.endswith("weight") and isinstance(m, blacklist_w_modules):
                # Weights of blacklisted modules do NOT get weight decay.
                no_decay.add(fpn)

    # Validate that every parameter landed in exactly one bucket.
    param_dict = {pn: p for pn, p in model.named_parameters()}
    inter_params = decay & no_decay
    union_params = decay | no_decay
    assert (
        len(inter_params) == 0
    ), f"parameters {str(inter_params)} made it into both decay/no_decay sets!"
    assert len(param_dict.keys() - union_params) == 0, (
        f"parameters {str(param_dict.keys() - union_params)} were not "
        f"separated into either decay/no_decay set!"
    )

    # Create the PyTorch optimizer object over the two groups.
    optim_groups = [
        {
            "params": [param_dict[pn] for pn in sorted(decay)],
            "weight_decay": weight_decay,
        },
        {
            "params": [param_dict[pn] for pn in sorted(no_decay)],
            "weight_decay": 0.0,
        },
    ]
    return torch.optim.AdamW(optim_groups, lr=learning_rate, betas=betas, **kwargs)
@Deprecated(error=False)
class GPT(nn.Module):
    """GPT Transformer model operating directly on input embeddings."""

    def __init__(self, config: GPTConfig):
        """Builds the dropout, Transformer blocks, and final LayerNorm.

        Args:
            config: The GPTConfig with block_size, n_layer, n_embed, etc.
        """
        super().__init__()
        assert config.block_size is not None
        self.block_size = config.block_size

        self.transformer = nn.ModuleDict(
            dict(
                drop=nn.Dropout(config.embed_pdrop),
                h=nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
                ln_f=nn.LayerNorm(config.n_embed),
            )
        )

        # Default init for all modules, plus the GPT-2 paper's scaled init
        # for residual projections.
        self.apply(self._init_weights)
        residual_std = 0.02 / math.sqrt(2 * config.n_layer)
        for name, param in self.named_parameters():
            if name.endswith("c_proj.weight"):
                torch.nn.init.normal_(param, mean=0.0, std=residual_std)

    def _init_weights(self, module):
        """Standard GPT-2 init for Linear / Embedding / LayerNorm modules."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
        elif isinstance(module, nn.LayerNorm):
            torch.nn.init.zeros_(module.bias)
            torch.nn.init.ones_(module.weight)

    def forward(self, input_embeds, attention_masks=None, return_attentions=False):
        """Forward pass through all Transformer blocks.

        Args:
            input_embeds: [batch_size x seq_len x n_embed] input embeddings.
            attention_masks: Optional [batch_size x seq_len] mask;
                0 = don't attend, 1 = attend.
            return_attentions: If True, also return per-block attention
                weights.

        Returns:
            The final hidden states [B, T, n_embed]; or a tuple
            (hidden states, list of attention weights) when
            `return_attentions` is True.
        """
        B, T, _ = input_embeds.size()
        assert T <= self.block_size, (
            f"Cannot forward sequence of length {T}, "
            f"block size is only {self.block_size}"
        )

        if attention_masks is not None:
            _B, _T = attention_masks.size()
            assert _B == B and _T == T
            # Broadcast the 2D mask to [B, 1, 1, T] so it applies across all
            # heads and query positions; the causal triangle is handled
            # inside each attention layer.
            attention_masks = attention_masks[:, None, None, :]
            # Convert {0, 1} to additive {-1e9, 0} pre-softmax biases, which
            # effectively removes masked positions from the softmax.
            attention_masks = attention_masks.to(dtype=input_embeds.dtype)
            attention_masks = (1.0 - attention_masks) * -1e9

        # Forward the GPT model itself.
        x = self.transformer.drop(input_embeds)

        atts = []
        for block in self.transformer.h:
            x, att = block(x, attention_masks=attention_masks)
            atts.append(att)
        x = self.transformer.ln_f(x)

        return (x, atts) if return_attentions else x
@DeveloperAPI
def same_padding_transpose_after_stride(
    strided_size: Tuple[int, int],
    kernel: Tuple[int, int],
    stride: Union[int, Tuple[int, int]],
) -> (Union[int, Tuple[int, int]], Tuple[int, int]):
    """Computes padding/output size matching TF Conv2DTranspose `same` padding.

    With padding="same", TensorFlow's Conv2DTranspose zero-pads the already
    strided image such that the output has exactly `input size * stride`
    pixels, no matter the kernel size. For example, a 4x4 input with stride=2
    becomes a 7x7 strided image; with kernel=5 this function returns padding
    (left=3, right=2, top=3, bottom=2) and an output size of 8x8.

    Args:
        strided_size: The size (width x height) of the already strided image.
        kernel: Kernel width x height (tuple of ints), or a single int if the
            kernel is square.
        stride: Stride width x height (tuple of ints), or a single int if the
            striding is square.

    Returns:
        Tuple consisting of 1) `padding`: a 4-tuple (left, right, top,
        bottom) to pad the input after(!) striding, usable with
        torch.nn.ZeroPad2d, and 2) the output shape after striding, padding,
        and the conv transpose layer.
    """
    # Normalize int shorthands into (width, height) pairs.
    if isinstance(kernel, int):
        k_w = k_h = kernel
    else:
        k_w, k_h = kernel
    if isinstance(stride, int):
        s_w = s_h = stride
    else:
        s_w, s_h = stride

    # Total 0-padding per axis; for odd totals the extra row/column goes on
    # the left/top side (matching TF).
    pad_total_w = k_w + s_w - 2
    pad_total_h = k_h + s_h - 2
    pad_right = pad_total_w // 2
    pad_left = pad_total_w - pad_right
    pad_bottom = pad_total_h // 2
    pad_top = pad_total_h - pad_bottom

    # Output size after padding and the transpose convolution.
    output_shape = (
        strided_size[0] + pad_total_w - k_w + 1,
        strided_size[1] + pad_total_h - k_h + 1,
    )

    return (pad_left, pad_right, pad_top, pad_bottom), output_shape
@DeveloperAPI
def valid_padding(
    in_size: Tuple[int, int],
    filter_size: Union[int, Tuple[int, int]],
    stride_size: Union[int, Tuple[int, int]],
) -> Tuple[int, int]:
    """Emulates TF Conv2D "valid" padding (no padding); returns output dims.

    Analogous to the "same" counterpart, but only the output image size is
    computed, since valid padding is always (0, 0, 0, 0).

    See www.tensorflow.org/versions/r0.12/api_docs/python/nn/convolution

    Args:
        in_size: Rows (height), columns (width) of the input.
        filter_size: Rows (height), columns (width) of the filter; an int
            means height == width.
        stride_size: Rows (height), columns (width) of the stride; an int
            means height == width.

    Returns:
        The output shape after padding and convolution.
    """
    in_height, in_width = in_size
    if isinstance(filter_size, int):
        filter_height = filter_width = filter_size
    else:
        filter_height, filter_width = filter_size
    if isinstance(stride_size, (int, float)):
        stride_height = stride_width = int(stride_size)
    else:
        stride_height, stride_width = int(stride_size[0]), int(stride_size[1])

    out_height = int(np.ceil((in_height - filter_height + 1) / float(stride_height)))
    out_width = int(np.ceil((in_width - filter_width + 1) / float(stride_width)))
    return (out_height, out_width)
@DeveloperAPI
class SlimConv2d(nn.Module):
    """Simple mock of tf.slim Conv2d."""

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel: Union[int, Tuple[int, int]],
        stride: Union[int, Tuple[int, int]],
        padding: Union[int, Tuple[int, int]],
        # Defaulting these to nn.[..] would break soft torch import.
        initializer: Any = "default",
        activation_fn: Any = "default",
        bias_init: float = 0,
    ):
        """Creates a standard Conv2d layer, similar to torch.nn.Conv2d.

        Args:
            in_channels: Number of input channels.
            out_channels: Number of output channels.
            kernel: Kernel size; an int means a square (x, x) kernel.
            stride: Cross-correlation stride; an int means equal strides.
            padding: Implicit zero-padding applied before the convolution;
                falsy values skip the padding layer entirely.
            initializer: Initializer for the kernel weights; "default"
                selects Xavier-uniform.
            activation_fn: Activation appended after the conv; "default"
                selects ReLU, None means no activation.
            bias_init: Constant used to initialize the conv bias.
        """
        super(SlimConv2d, self).__init__()
        seq = []
        # Optional zero-padding layer.
        if padding:
            seq.append(nn.ZeroPad2d(padding))
        # The actual Conv2D layer (with correct initialization logic).
        conv = nn.Conv2d(in_channels, out_channels, kernel, stride)
        if initializer:
            if initializer == "default":
                initializer = nn.init.xavier_uniform_
            initializer(conv.weight)
        nn.init.constant_(conv.bias, bias_init)
        seq.append(conv)
        # Resolve and append the activation (if any; default=ReLU).
        if isinstance(activation_fn, str):
            if activation_fn == "default":
                activation_fn = nn.ReLU
            else:
                activation_fn = get_activation_fn(activation_fn, "torch")
        if activation_fn is not None:
            seq.append(activation_fn())
        # Put everything in sequence.
        self._model = nn.Sequential(*seq)

    def forward(self, x: TensorType) -> TensorType:
        return self._model(x)
@DeveloperAPI
class SlimFC(nn.Module):
    """Simple PyTorch version of the `linear` function."""

    def __init__(
        self,
        in_size: int,
        out_size: int,
        initializer: Any = None,
        activation_fn: Any = None,
        use_bias: bool = True,
        bias_init: float = 0.0,
    ):
        """Creates a standard FC layer, similar to torch.nn.Linear.

        Args:
            in_size: Input size for the FC layer.
            out_size: Output size for the FC layer.
            initializer: Initializer for the layer weights (defaults to
                Xavier-uniform when None).
            activation_fn: Activation appended after the linear layer; may be
                a callable or an activation-name string.
            use_bias: Whether to add bias weights.
            bias_init: Constant used to initialize the bias weights.
        """
        super(SlimFC, self).__init__()
        seq = []
        # The actual nn.Linear layer (with correct initialization logic).
        linear = nn.Linear(in_size, out_size, bias=use_bias)
        if initializer is None:
            initializer = nn.init.xavier_uniform_
        initializer(linear.weight)
        if use_bias is True:
            nn.init.constant_(linear.bias, bias_init)
        seq.append(linear)
        # Resolve and append the activation (if any; default=None/linear).
        if isinstance(activation_fn, str):
            activation_fn = get_activation_fn(activation_fn, "torch")
        if activation_fn is not None:
            seq.append(activation_fn())
        # Put everything in sequence.
        self._model = nn.Sequential(*seq)

    def forward(self, x: TensorType) -> TensorType:
        return self._model(x)


@DeveloperAPI
class AppendBiasLayer(nn.Module):
    """Appends trainable bias columns to its input (for free_log_std)."""

    def __init__(self, num_bias_vars: int):
        super().__init__()
        # Assigning an nn.Parameter attribute registers it automatically; the
        # explicit register_parameter call mirrors the original code.
        self.log_std = torch.nn.Parameter(torch.as_tensor([0.0] * num_bias_vars))
        self.register_parameter("log_std", self.log_std)

    def forward(self, x: TensorType) -> TensorType:
        # Broadcast the bias row across the batch and append it column-wise.
        bias = self.log_std.unsqueeze(0).repeat([len(x), 1])
        return torch.cat([x, bias], axis=1)


@DeveloperAPI
class Reshape(nn.Module):
    """Standard module that reshapes/views a tensor."""

    def __init__(self, shape: List):
        super().__init__()
        self.shape = shape

    def forward(self, x):
        return x.view(*self.shape)
@OldAPIStack
class GRUGate(nn.Module):
    """Implements a gated recurrent unit for use in AttentionNet (GTrXL).

    Computes g = (1 - z) * h + z * h_next, where the update gate z has a
    constant `init_bias` subtracted from its pre-activation (this biases the
    gate toward passing the state through, stabilizing early training).
    """

    def __init__(self, dim: int, init_bias: float = 0.0, **kwargs):
        """Initializes a GRUGate instance.

        Args:
            dim: Dimension of the input (and internal state).
            init_bias: Constant subtracted from the update-gate (z)
                pre-activation to stabilize training.
                Fix: annotated as float (was int, but defaulted to 0.0).
        """
        super().__init__(**kwargs)
        # Input-to-gate weight matrices (Xavier-initialized). Assigning
        # nn.Parameter attributes registers them with the module
        # automatically, so no explicit register_parameter calls are needed.
        self._w_r = nn.Parameter(torch.zeros(dim, dim))
        self._w_z = nn.Parameter(torch.zeros(dim, dim))
        self._w_h = nn.Parameter(torch.zeros(dim, dim))
        nn.init.xavier_uniform_(self._w_r)
        nn.init.xavier_uniform_(self._w_z)
        nn.init.xavier_uniform_(self._w_h)

        # State-to-gate weight matrices.
        self._u_r = nn.Parameter(torch.zeros(dim, dim))
        self._u_z = nn.Parameter(torch.zeros(dim, dim))
        self._u_h = nn.Parameter(torch.zeros(dim, dim))
        nn.init.xavier_uniform_(self._u_r)
        nn.init.xavier_uniform_(self._u_z)
        nn.init.xavier_uniform_(self._u_h)

        # Trainable update-gate bias, initialized to `init_bias`.
        self._bias_z = nn.Parameter(torch.zeros(dim).fill_(init_bias))

    def forward(self, inputs: TensorType, **kwargs) -> TensorType:
        """Applies the GRU gate; `inputs` is a (state h, input X) tuple."""
        h, X = inputs

        # Reset gate.
        r = torch.sigmoid(
            torch.tensordot(X, self._w_r, dims=1)
            + torch.tensordot(h, self._u_r, dims=1)
        )

        # Update gate (note the subtracted bias).
        z = torch.sigmoid(
            torch.tensordot(X, self._w_z, dims=1)
            + torch.tensordot(h, self._u_z, dims=1)
            - self._bias_z
        )

        # Candidate state; the reset gate modulates the previous state.
        h_next = torch.tanh(
            torch.tensordot(X, self._w_h, dims=1)
            + torch.tensordot(h * r, self._u_h, dims=1)
        )

        return (1 - z) * h + z * h_next
+ https://arxiv.org/pdf/1706.03762.pdf +""" +from ray.rllib.utils.framework import try_import_torch +from ray.rllib.models.torch.misc import SlimFC +from ray.rllib.utils.annotations import OldAPIStack +from ray.rllib.utils.torch_utils import sequence_mask +from ray.rllib.utils.framework import TensorType + +torch, nn = try_import_torch() + + +@OldAPIStack +class MultiHeadAttention(nn.Module): + """A multi-head attention layer described in [1].""" + + def __init__( + self, in_dim: int, out_dim: int, num_heads: int, head_dim: int, **kwargs + ): + """ + in_dim: Dimension of input + out_dim: Dimension of output + num_heads: Number of attention heads + head_dim: Output dimension of each attention head + """ + super().__init__(**kwargs) + + # No bias or non-linearity. + self._num_heads = num_heads + self._head_dim = head_dim + self._qkv_layer = SlimFC( + in_size=in_dim, out_size=3 * num_heads * head_dim, use_bias=False + ) + + self._linear_layer = SlimFC( + in_size=num_heads * head_dim, out_size=out_dim, use_bias=False + ) + + def forward(self, inputs: TensorType) -> TensorType: + L = list(inputs.size())[1] # length of segment + H = self._num_heads # number of attention heads + D = self._head_dim # attention head dimension + + qkv = self._qkv_layer(inputs) + + queries, keys, values = torch.chunk(input=qkv, chunks=3, dim=-1) + queries = queries[:, -L:] # only query based on the segment + + queries = torch.reshape(queries, [-1, L, H, D]) + keys = torch.reshape(keys, [-1, L, H, D]) + values = torch.reshape(values, [-1, L, H, D]) + + score = torch.einsum("bihd,bjhd->bijh", queries, keys) + score = score / D**0.5 + + # causal mask of the same length as the sequence + mask = sequence_mask(torch.arange(1, L + 1), dtype=score.dtype) + mask = mask[None, :, :, None] + mask = mask.float() + + masked_score = score * mask + 1e30 * (mask - 1.0) + wmat = nn.functional.softmax(masked_score, dim=2) + + out = torch.einsum("bijh,bjhd->bihd", wmat, values) + shape = list(out.size())[:2] + 
class NoisyLayer(nn.Module):
    r"""A layer that adds learnable noise to some previous layer's outputs.

    Consists of:
    - a common dense layer: y = w^{T}x + b
    - a noisy layer: y = (w + \epsilon_w*\sigma_w)^{T}x +
      (b+\epsilon_b*\sigma_b)
    , where \epsilon are random variables sampled from factorized normal
    distributions and \sigma are trainable variables which are expected to
    vanish along the training procedure.
    """

    def __init__(
        self, in_size: int, out_size: int, sigma0: float, activation: str = "relu"
    ):
        """Initializes a NoisyLayer object.

        Args:
            in_size: Input size for the Noisy Layer.
            out_size: Output size for the Noisy Layer.
            sigma0: Initialization value for sigma_b (bias noise).
            activation: Non-linear activation (name) for the Noisy Layer.
        """
        super().__init__()

        self.in_size = in_size
        self.out_size = out_size
        self.sigma0 = sigma0
        self.activation = get_activation_fn(activation, framework="torch")
        if self.activation is not None:
            self.activation = self.activation()

        # Trainable noise scales: uniform init for the weight noise, a
        # constant sigma0-derived init for the bias noise.
        bound = 1.0 / np.sqrt(float(in_size))
        self.register_parameter(
            "sigma_w",
            nn.Parameter(
                torch.from_numpy(
                    np.random.uniform(
                        low=-bound, high=bound, size=[in_size, out_size]
                    )
                ).float()
            ),
        )
        self.register_parameter(
            "sigma_b",
            nn.Parameter(
                torch.from_numpy(
                    np.full(
                        shape=[out_size],
                        fill_value=sigma0 / np.sqrt(float(in_size)),
                    )
                ).float()
            ),
        )

        # The underlying dense-layer weights/bias.
        self.register_parameter(
            "w",
            nn.Parameter(
                torch.from_numpy(
                    np.full(
                        shape=[in_size, out_size],
                        fill_value=6 / np.sqrt(float(in_size) + float(out_size)),
                    )
                ).float()
            ),
        )
        self.register_parameter(
            "b", nn.Parameter(torch.from_numpy(np.zeros([out_size])).float())
        )

    def forward(self, inputs: TensorType) -> TensorType:
        """Computes the noisy linear transform (plus optional activation)."""
        # Factorized Gaussian noise: one vector per side; the weight noise is
        # their outer product.
        epsilon_in = self._f_epsilon(
            torch.normal(
                mean=torch.zeros([self.in_size]), std=torch.ones([self.in_size])
            ).to(inputs.device)
        )
        epsilon_out = self._f_epsilon(
            torch.normal(
                mean=torch.zeros([self.out_size]), std=torch.ones([self.out_size])
            ).to(inputs.device)
        )
        epsilon_w = torch.matmul(
            epsilon_in.unsqueeze(-1), other=epsilon_out.unsqueeze(0)
        )

        action_activation = (
            torch.matmul(inputs, self.w + self.sigma_w * epsilon_w)
            + self.b
            + self.sigma_b * epsilon_out
        )

        if self.activation is not None:
            action_activation = self.activation(action_activation)
        return action_activation

    def _f_epsilon(self, x: TensorType) -> TensorType:
        # f(x) = sign(x) * sqrt(|x|), per the NoisyNet factorized-noise paper.
        return torch.sign(x) * torch.sqrt(torch.abs(x))
@OldAPIStack
class RelativePositionEmbedding(nn.Module):
    """Creates a [seq_length x seq_length] matrix for rel. pos encoding.

    Denoted as Phi in [2] and [3]. Phi is the standard sinusoid encoding
    matrix.

    Args:
        seq_length: The max. sequence length (time axis).
        out_dim: The number of nodes to go into the first Tranformer
            layer with.

    Returns:
        torch.Tensor: The encoding matrix Phi.
    """

    def __init__(self, out_dim, **kwargs):
        super().__init__()
        self.out_dim = out_dim
        # Inverse frequencies for the sinusoid pairs; registered as a buffer
        # so it travels with the module across devices.
        freq_range = torch.arange(0, self.out_dim, 2.0)
        self.register_buffer(
            "inverse_freq", 1 / (10000 ** (freq_range / self.out_dim))
        )

    def forward(self, seq_length):
        # Positions counted backwards: seq_length-1 ... 0.
        positions = torch.arange(seq_length - 1, -1, -1.0, dtype=torch.float).to(
            self.inverse_freq.device
        )
        sinusoid = torch.einsum("i,j->ij", positions, self.inverse_freq)
        embeddings = torch.cat(
            [torch.sin(sinusoid), torch.cos(sinusoid)], dim=-1
        )
        # Insert a broadcastable batch axis in the middle.
        return embeddings[:, None, :]
class RelativeMultiHeadAttention(nn.Module):
    """A RelativeMultiHeadAttention layer as described in [3].

    Uses segment level recurrence with state reuse.
    """

    def __init__(
        self,
        in_dim: int,
        out_dim: int,
        num_heads: int,
        head_dim: int,
        input_layernorm: bool = False,
        output_activation: Union[str, callable] = None,
        **kwargs
    ):
        """Initializes a RelativeMultiHeadAttention nn.Module object.

        Args:
            in_dim: The input dimension.
            out_dim: The output dimension of this module. Also known as
                "attention dim".
            num_heads: The number of attention heads to use.
                Denoted `H` in [2].
            head_dim: The dimension of a single(!) attention head.
                Denoted `D` in [2].
            input_layernorm: Whether to prepend a LayerNorm before
                everything else. Should be True for building a GTrXL.
            output_activation (Union[str, callable]): Optional activation
                function or activation function specifier (str).
                Should be "relu" for GTrXL.
            **kwargs: Forwarded to nn.Module.
        """
        super().__init__(**kwargs)

        # No bias or non-linearity.
        self._num_heads = num_heads
        self._head_dim = head_dim

        # 3 = query, key, and value inputs, projected jointly.
        self._qkv_layer = SlimFC(
            in_size=in_dim, out_size=3 * num_heads * head_dim, use_bias=False
        )

        self._linear_layer = SlimFC(
            in_size=num_heads * head_dim,
            out_size=out_dim,
            use_bias=False,
            activation_fn=output_activation,
        )

        # Global content/positional bias vectors u and v from [3].
        self._uvar = nn.Parameter(torch.zeros(num_heads, head_dim))
        self._vvar = nn.Parameter(torch.zeros(num_heads, head_dim))
        nn.init.xavier_uniform_(self._uvar)
        nn.init.xavier_uniform_(self._vvar)
        self.register_parameter("_uvar", self._uvar)
        self.register_parameter("_vvar", self._vvar)

        self._pos_proj = SlimFC(
            in_size=in_dim, out_size=num_heads * head_dim, use_bias=False
        )
        self._rel_pos_embedding = RelativePositionEmbedding(out_dim)

        self._input_layernorm = None
        if input_layernorm:
            self._input_layernorm = torch.nn.LayerNorm(in_dim)

    def forward(self, inputs: TensorType, memory: TensorType = None) -> TensorType:
        """Runs relative multi-head attention over `inputs` (+ memory).

        Fix: `memory` defaulted to None but was dereferenced unconditionally
        (`memory.shape`), crashing when omitted. None now simply means "no
        memory chunk" (Tau = 0).
        """
        T = list(inputs.size())[1]  # length of segment (time)
        H = self._num_heads  # number of attention heads
        d = self._head_dim  # attention head dimension

        # Add previous memory chunk (as const, w/o gradient) to input.
        # Tau = number of (prev) time slices in each memory chunk.
        if memory is not None:
            Tau = list(memory.shape)[1]
            inputs = torch.cat((memory.detach(), inputs), dim=1)
        else:
            Tau = 0

        # Apply the Layer-Norm.
        if self._input_layernorm is not None:
            inputs = self._input_layernorm(inputs)

        qkv = self._qkv_layer(inputs)

        queries, keys, values = torch.chunk(input=qkv, chunks=3, dim=-1)
        # Cut out Tau memory timesteps from query.
        queries = queries[:, -T:]

        queries = torch.reshape(queries, [-1, T, H, d])
        keys = torch.reshape(keys, [-1, Tau + T, H, d])
        values = torch.reshape(values, [-1, Tau + T, H, d])

        # Projected relative position encodings.
        R = self._pos_proj(self._rel_pos_embedding(Tau + T))
        R = torch.reshape(R, [Tau + T, H, d])

        # b=batch
        # i and j=time indices (i=max-timesteps (inputs); j=Tau memory space)
        # h=head
        # d=head-dim (over which we will reduce-sum)
        score = torch.einsum("bihd,bjhd->bijh", queries + self._uvar, keys)
        pos_score = torch.einsum("bihd,jhd->bijh", queries + self._vvar, R)
        score = score + self.rel_shift(pos_score)
        score = score / d**0.5

        # Causal mask of the same length as the sequence.
        mask = sequence_mask(torch.arange(Tau + 1, Tau + T + 1), dtype=score.dtype).to(
            score.device
        )
        mask = mask[None, :, :, None]

        # Push masked-out positions to very large negatives pre-softmax.
        masked_score = score * mask + 1e30 * (mask.float() - 1.0)
        wmat = nn.functional.softmax(masked_score, dim=2)

        out = torch.einsum("bijh,bjhd->bihd", wmat, values)
        shape = list(out.shape)[:2] + [H * d]
        out = torch.reshape(out, shape)

        return self._linear_layer(out)

    @staticmethod
    def rel_shift(x: TensorType) -> TensorType:
        # Transposed version of the shift approach described in [3].
        # https://github.com/kimiyoung/transformer-xl/blob/
        # 44781ed21dbaec88b280f74d9ae2877f52b492a5/tf/model.py#L31
        x_size = list(x.shape)

        # Pad one zero column at the front of axis 2, fold it over axis 1,
        # drop the first row, and restore the original shape - this realizes
        # the relative-position shift without an explicit gather.
        x = torch.nn.functional.pad(x, (0, 0, 1, 0, 0, 0, 0, 0))
        x = torch.reshape(x, [x_size[0], x_size[2] + 1, x_size[1], x_size[3]])
        x = x[:, 1:, :, :]
        x = torch.reshape(x, x_size)

        return x
@override(ModelV2)
def forward(
    self,
    input_dict: Dict[str, TensorType],
    state: List[TensorType],
    seq_lens: TensorType,
) -> Tuple[TensorType, List[TensorType]]:
    """Adds a time dimension to the batch, then delegates to `forward_rnn()`.

    You should implement forward_rnn() in your subclass.

    Args:
        input_dict: Batch dict; only `"obs_flat"` is read here.
        state: List of RNN state tensors (one entry per state component).
        seq_lens: 1D tensor of true sequence lengths (len == batch size).

    Returns:
        Tuple of (model output reshaped to [B*T, num_outputs], new state list).
    """
    # Creating a __init__ function that acts as a passthrough and adding the
    # warning there led to errors, probably due to the multiple inheritance.
    # We encountered the same error if we add the Deprecated decorator. We
    # therefore add the deprecation warning here (emitted only once).
    if log_once("recurrent_network_tf"):
        deprecation_warning(
            old="ray.rllib.models.torch.recurrent_net.RecurrentNetwork"
        )
    flat_inputs = input_dict["obs_flat"].float()
    # Note that max_seq_len != input_dict.max_seq_len != seq_lens.max(),
    # as input_dict may have extra zero-padding beyond seq_lens.max().
    # `add_time_dimension` handles this.
    self.time_major = self.model_config.get("_time_major", False)
    inputs = add_time_dimension(
        flat_inputs,
        seq_lens=seq_lens,
        framework="torch",
        time_major=self.time_major,
    )
    output, new_state = self.forward_rnn(inputs, state, seq_lens)
    # Collapse the time dimension again: [B, T, num_outputs] -> [B*T, num_outputs].
    output = torch.reshape(output, [-1, self.num_outputs])
    return output, new_state
@OldAPIStack
class LSTMWrapper(RecurrentNetwork, nn.Module):
    """An LSTM wrapper serving as an interface for ModelV2s that set use_lstm."""

    def __init__(
        self,
        obs_space: gym.spaces.Space,
        action_space: gym.spaces.Space,
        num_outputs: int,
        model_config: ModelConfigDict,
        name: str,
    ):
        nn.Module.__init__(self)
        # Pass num_outputs=None to the wrapped model so it exposes its raw
        # feature size via self.num_outputs (read below as the LSTM input size).
        super(LSTMWrapper, self).__init__(
            obs_space, action_space, None, model_config, name
        )

        # At this point, self.num_outputs is the number of nodes coming
        # from the wrapped (underlying) model. In other words, self.num_outputs
        # is the input size for the LSTM layer.
        # If None, set it to the observation space.
        if self.num_outputs is None:
            self.num_outputs = int(np.prod(self.obs_space.shape))

        self.cell_size = model_config["lstm_cell_size"]
        self.time_major = model_config.get("_time_major", False)
        self.use_prev_action = model_config["lstm_use_prev_action"]
        self.use_prev_reward = model_config["lstm_use_prev_reward"]

        self.action_space_struct = get_base_struct_from_space(self.action_space)
        # Total flattened size of one (possibly nested) action.
        self.action_dim = 0

        for space in tree.flatten(self.action_space_struct):
            if isinstance(space, Discrete):
                self.action_dim += space.n
            elif isinstance(space, MultiDiscrete):
                self.action_dim += np.sum(space.nvec)
            elif space.shape is not None:
                self.action_dim += int(np.prod(space.shape))
            else:
                self.action_dim += int(len(space))

        # Add prev-action/reward nodes to input to LSTM.
        if self.use_prev_action:
            self.num_outputs += self.action_dim
        if self.use_prev_reward:
            self.num_outputs += 1

        # Define actual LSTM layer (with num_outputs being the nodes coming
        # from the wrapped (underlying) layer).
        self.lstm = nn.LSTM(
            self.num_outputs, self.cell_size, batch_first=not self.time_major
        )

        # Set self.num_outputs to the number of output nodes desired by the
        # caller of this constructor (it held the LSTM input size until here).
        self.num_outputs = num_outputs

        # Postprocess LSTM output with another hidden layer and compute values.
        self._logits_branch = SlimFC(
            in_size=self.cell_size,
            out_size=self.num_outputs,
            activation_fn=None,
            initializer=torch.nn.init.xavier_uniform_,
        )
        self._value_branch = SlimFC(
            in_size=self.cell_size,
            out_size=1,
            activation_fn=None,
            initializer=torch.nn.init.xavier_uniform_,
        )

        # __sphinx_doc_begin__
        # Add prev-a/r to this model's view, if required.
        if model_config["lstm_use_prev_action"]:
            self.view_requirements[SampleBatch.PREV_ACTIONS] = ViewRequirement(
                SampleBatch.ACTIONS, space=self.action_space, shift=-1
            )
        if model_config["lstm_use_prev_reward"]:
            self.view_requirements[SampleBatch.PREV_REWARDS] = ViewRequirement(
                SampleBatch.REWARDS, shift=-1
            )
        # __sphinx_doc_end__

    @override(RecurrentNetwork)
    def forward(
        self,
        input_dict: Dict[str, TensorType],
        state: List[TensorType],
        seq_lens: TensorType,
    ) -> Tuple[TensorType, List[TensorType]]:
        """Runs the wrapped model, appends prev-a/r, then runs the LSTM stack."""
        assert seq_lens is not None
        # Push obs through "unwrapped" net's `forward()` first.
        wrapped_out, _ = self._wrapped_forward(input_dict, [], None)

        # Concat. prev-action/reward if required.
        prev_a_r = []

        # Prev actions.
        if self.model_config["lstm_use_prev_action"]:
            prev_a = input_dict[SampleBatch.PREV_ACTIONS]
            # If actions are not processed yet (in their original form as
            # have been sent to environment):
            # Flatten/one-hot into 1D array.
            if self.model_config["_disable_action_flattening"]:
                prev_a_r.append(
                    flatten_inputs_to_1d_tensor(
                        prev_a, spaces_struct=self.action_space_struct, time_axis=False
                    )
                )
            # If actions are already flattened (but not one-hot'd yet!),
            # one-hot discrete/multi-discrete actions here.
            else:
                if isinstance(self.action_space, (Discrete, MultiDiscrete)):
                    prev_a = one_hot(prev_a.float(), self.action_space)
                else:
                    prev_a = prev_a.float()
                prev_a_r.append(torch.reshape(prev_a, [-1, self.action_dim]))
        # Prev rewards.
        if self.model_config["lstm_use_prev_reward"]:
            prev_a_r.append(
                torch.reshape(input_dict[SampleBatch.PREV_REWARDS].float(), [-1, 1])
            )

        # Concat prev. actions + rewards to the "main" input.
        if prev_a_r:
            wrapped_out = torch.cat([wrapped_out] + prev_a_r, dim=1)

        # Push everything through our LSTM (via RecurrentNetwork.forward()).
        input_dict["obs_flat"] = wrapped_out
        return super().forward(input_dict, state, seq_lens)

    @override(RecurrentNetwork)
    def forward_rnn(
        self, inputs: TensorType, state: List[TensorType], seq_lens: TensorType
    ) -> Tuple[TensorType, List[TensorType]]:
        # Don't show paddings to RNN(?)
        # TODO: (sven) For now, only allow, iff time_major=True to not break
        # anything retrospectively (time_major not supported previously).
        # max_seq_len = inputs.shape[0]
        # time_major = self.model_config["_time_major"]
        # if time_major and max_seq_len > 1:
        #     inputs = torch.nn.utils.rnn.pack_padded_sequence(
        #         inputs, seq_lens,
        #         batch_first=not time_major, enforce_sorted=False)
        # nn.LSTM expects state tensors with a leading num-layers dim of 1.
        self._features, [h, c] = self.lstm(
            inputs, [torch.unsqueeze(state[0], 0), torch.unsqueeze(state[1], 0)]
        )
        # Re-apply paddings.
        # if time_major and max_seq_len > 1:
        #     self._features, _ = torch.nn.utils.rnn.pad_packed_sequence(
        #         self._features,
        #         batch_first=not time_major)
        model_out = self._logits_branch(self._features)
        # Strip the num-layers dim again for the returned state.
        return model_out, [torch.squeeze(h, 0), torch.squeeze(c, 0)]

    @override(ModelV2)
    def get_initial_state(self) -> Union[List[np.ndarray], List[TensorType]]:
        # Place hidden states on same device as model (derive it from one of
        # our own weight tensors).
        linear = next(self._logits_branch._model.children())
        h = [
            linear.weight.new(1, self.cell_size).zero_().squeeze(0),
            linear.weight.new(1, self.cell_size).zero_().squeeze(0),
        ]
        return h

    @override(ModelV2)
    def value_function(self) -> TensorType:
        assert self._features is not None, "must call forward() first"
        return torch.reshape(self._value_branch(self._features), [-1])
@OldAPIStack
class TorchCategorical(TorchDistributionWrapper):
    """Wrapper class for PyTorch Categorical distribution.

    `inputs` are interpreted as unnormalized logits. An optional
    `temperature` flattens (>1.0) or sharpens (<1.0) the distribution via
    `softmax(logits / temperature)`.
    """

    @override(ActionDistribution)
    def __init__(
        self,
        inputs: List[TensorType],
        model: TorchModelV2 = None,
        temperature: float = 1.0,
    ):
        if temperature != 1.0:
            assert temperature > 0.0, "Categorical `temperature` must be > 0.0!"
            # Bug fix: use out-of-place division here. The previous in-place
            # `inputs /= temperature` mutated the caller's logits tensor
            # (a visible side effect) and raised on integer-typed inputs.
            inputs = inputs / temperature
        super().__init__(inputs, model)
        self.dist = torch.distributions.categorical.Categorical(logits=self.inputs)

    @override(ActionDistribution)
    def deterministic_sample(self) -> TensorType:
        # Greedy action: per-row argmax over class probabilities.
        self.last_sample = self.dist.probs.argmax(dim=1)
        return self.last_sample

    @staticmethod
    @override(ActionDistribution)
    def required_model_output_shape(
        action_space: gym.Space, model_config: ModelConfigDict
    ) -> Union[int, np.ndarray]:
        # One logit per discrete action.
        return action_space.n


@OldAPIStack
def get_torch_categorical_class_with_temperature(t: float):
    """TorchCategorical distribution class that has customized default temperature.

    Args:
        t: The default temperature baked into the returned class.

    Returns:
        A TorchCategorical subclass whose `temperature` defaults to `t`.
    """

    class TorchCategoricalWithTemperature(TorchCategorical):
        def __init__(self, inputs, model=None, temperature=t):
            super().__init__(inputs, model, temperature)

    return TorchCategoricalWithTemperature
@override(TorchDistributionWrapper)
def sample(self) -> TensorType:
    # Sample each sub-categorical independently; stack to [B, num-components].
    arr = [cat.sample() for cat in self.cats]
    sample_ = torch.stack(arr, dim=1)
    # Int-Box action space: restore the Box's original shape.
    if isinstance(self.action_space, gym.spaces.Box):
        sample_ = torch.reshape(sample_, [-1] + list(self.action_space.shape))
    self.last_sample = sample_
    return sample_

@override(ActionDistribution)
def deterministic_sample(self) -> TensorType:
    # Greedy: per-component argmax over class probabilities.
    arr = [torch.argmax(cat.probs, -1) for cat in self.cats]
    sample_ = torch.stack(arr, dim=1)
    if isinstance(self.action_space, gym.spaces.Box):
        sample_ = torch.reshape(sample_, [-1] + list(self.action_space.shape))
    self.last_sample = sample_
    return sample_

@override(TorchDistributionWrapper)
def logp(self, actions: TensorType) -> TensorType:
    # If a single (merged) tensor is provided, unstack it into one column
    # per sub-categorical.
    if isinstance(actions, torch.Tensor):
        if isinstance(self.action_space, gym.spaces.Box):
            actions = torch.reshape(
                actions, [-1, int(np.prod(self.action_space.shape))]
            )
        actions = torch.unbind(actions, dim=1)
    # Joint log-prob = sum of per-component log-probs (independence).
    logps = torch.stack([cat.log_prob(act) for cat, act in zip(self.cats, actions)])
    return torch.sum(logps, dim=0)

@override(ActionDistribution)
def multi_entropy(self) -> TensorType:
    # Per-component entropies, shape [B, num-components].
    return torch.stack([cat.entropy() for cat in self.cats], dim=1)

@override(TorchDistributionWrapper)
def entropy(self) -> TensorType:
    # Total entropy = sum over independent components.
    return torch.sum(self.multi_entropy(), dim=1)

@override(ActionDistribution)
def multi_kl(self, other: ActionDistribution) -> TensorType:
    # Per-component KL vs. the matching component of `other`.
    return torch.stack(
        [
            torch.distributions.kl.kl_divergence(cat, oth_cat)
            for cat, oth_cat in zip(self.cats, other.cats)
        ],
        dim=1,
    )

@override(TorchDistributionWrapper)
def kl(self, other: ActionDistribution) -> TensorType:
    # Total KL = sum over independent components.
    return torch.sum(self.multi_kl(other), dim=1)
@OldAPIStack
class TorchSlateMultiCategorical(TorchCategorical):
    """MultiCategorical distribution for MultiDiscrete action spaces.

    The action space must be uniform, meaning all nvec items have the same size,
    e.g. MultiDiscrete([10, 10, 10]), where 10 is the number of candidates to
    pick from and 3 is the slate size (pick 3 out of 10). When picking
    candidates, no candidate must be picked more than once.
    """

    def __init__(
        self,
        inputs: List[TensorType],
        model: TorchModelV2 = None,
        temperature: float = 1.0,
        action_space: Optional[gym.spaces.MultiDiscrete] = None,
        all_slates=None,
    ):
        assert temperature > 0.0, "Categorical `temperature` must be > 0.0!"
        # Allow softmax formula w/ temperature != 1.0:
        # Divide inputs by temperature.
        super().__init__(inputs / temperature, model)
        self.action_space = action_space
        # Assert uniformness of the action space (all discrete buckets have the
        # same size).
        assert isinstance(self.action_space, gym.spaces.MultiDiscrete) and all(
            n == self.action_space.nvec[0] for n in self.action_space.nvec
        )
        # Pre-enumerated candidate slates; indexed by the categorical sample.
        self.all_slates = all_slates

    @override(ActionDistribution)
    def deterministic_sample(self) -> TensorType:
        # Get a sample from the underlying Categorical (batch of ints).
        sample = super().deterministic_sample()
        # Use the sampled ints to pick the actual slates.
        return torch.take_along_dim(self.all_slates, sample.long(), dim=-1)

    @override(ActionDistribution)
    def logp(self, x: TensorType) -> TensorType:
        # TODO: Implement.
        # NOTE(review): placeholder — returns a constant 1.0 per batch row,
        # not a real log-prob; callers relying on logp get a dummy value.
        return torch.ones_like(self.inputs[:, 0])


@OldAPIStack
class TorchDiagGaussian(TorchDistributionWrapper):
    """Wrapper class for PyTorch Normal distribution (diagonal covariance).

    `inputs` is split along dim 1 into mean and log-std halves.
    """

    @override(ActionDistribution)
    def __init__(
        self,
        inputs: List[TensorType],
        model: TorchModelV2,
        *,
        action_space: Optional[gym.spaces.Space] = None
    ):
        super().__init__(inputs, model)
        mean, log_std = torch.chunk(self.inputs, 2, dim=1)
        self.log_std = log_std
        self.dist = torch.distributions.normal.Normal(mean, torch.exp(log_std))
        # Remember to squeeze action samples in case action space is Box(shape=()).
        self.zero_action_dim = action_space and action_space.shape == ()

    @override(TorchDistributionWrapper)
    def sample(self) -> TensorType:
        sample = super().sample()
        if self.zero_action_dim:
            return torch.squeeze(sample, dim=-1)
        return sample

    @override(ActionDistribution)
    def deterministic_sample(self) -> TensorType:
        # Greedy: the distribution's mean.
        self.last_sample = self.dist.mean
        return self.last_sample

    @override(TorchDistributionWrapper)
    def logp(self, actions: TensorType) -> TensorType:
        # Sum per-dimension log-probs -> joint log-prob of the diagonal Gaussian.
        return super().logp(actions).sum(-1)

    @override(TorchDistributionWrapper)
    def entropy(self) -> TensorType:
        return super().entropy().sum(-1)

    @override(TorchDistributionWrapper)
    def kl(self, other: ActionDistribution) -> TensorType:
        return super().kl(other).sum(-1)

    @staticmethod
    @override(ActionDistribution)
    def required_model_output_shape(
        action_space: gym.Space, model_config: ModelConfigDict
    ) -> Union[int, np.ndarray]:
        # Mean + log-std per action dimension.
        return np.prod(action_space.shape, dtype=np.int32) * 2
def __init__(
    self,
    inputs: List[TensorType],
    model: TorchModelV2,
    low: float = -1.0,
    high: float = 1.0,
):
    """Parameterizes the distribution via `inputs`.

    Args:
        low: The lowest possible sampling value
            (excluding this value).
        high: The highest possible sampling value
            (excluding this value).
    """
    super().__init__(inputs, model)
    # Split inputs into mean and log(std).
    mean, log_std = torch.chunk(self.inputs, 2, dim=-1)
    # Clip `scale` values (coming from NN) to reasonable values.
    log_std = torch.clamp(log_std, MIN_LOG_NN_OUTPUT, MAX_LOG_NN_OUTPUT)
    std = torch.exp(log_std)
    self.dist = torch.distributions.normal.Normal(mean, std)
    assert np.all(np.less(low, high))
    self.low = low
    self.high = high
    self.mean = mean
    self.std = std

@override(ActionDistribution)
def deterministic_sample(self) -> TensorType:
    # Greedy: squash the underlying Normal's mean into [low, high].
    self.last_sample = self._squash(self.dist.mean)
    return self.last_sample

@override(TorchDistributionWrapper)
def sample(self) -> TensorType:
    # Use the reparameterization version of `dist.sample` to allow for
    # the results to be backprop'able e.g. in a loss term.
    normal_sample = self.dist.rsample()
    self.last_sample = self._squash(normal_sample)
    return self.last_sample

@override(ActionDistribution)
def logp(self, x: TensorType) -> TensorType:
    # Unsquash values (from [low,high] to ]-inf,inf[)
    unsquashed_values = self._unsquash(x)
    # Get log prob of unsquashed values from our Normal.
    log_prob_gaussian = self.dist.log_prob(unsquashed_values)
    # For safety reasons, clamp somehow, only then sum up.
    log_prob_gaussian = torch.clamp(log_prob_gaussian, -100, 100)
    log_prob_gaussian = torch.sum(log_prob_gaussian, dim=-1)
    # Get log-prob for squashed Gaussian: subtract the tanh
    # change-of-variables correction term.
    unsquashed_values_tanhd = torch.tanh(unsquashed_values)
    log_prob = log_prob_gaussian - torch.sum(
        torch.log(1 - unsquashed_values_tanhd**2 + SMALL_NUMBER), dim=-1
    )
    return log_prob

def sample_logp(self):
    """Samples (reparameterized) and returns (actions, their log-probs)."""
    z = self.dist.rsample()
    actions = self._squash(z)
    return actions, torch.sum(
        self.dist.log_prob(z) - torch.log(1 - actions * actions + SMALL_NUMBER),
        dim=-1,
    )

@override(TorchDistributionWrapper)
def entropy(self) -> TensorType:
    # No closed form for the squashed distribution's entropy.
    raise ValueError("Entropy not defined for SquashedGaussian!")

@override(TorchDistributionWrapper)
def kl(self, other: ActionDistribution) -> TensorType:
    # No closed form for the squashed distribution's KL.
    raise ValueError("KL not defined for SquashedGaussian!")

def _squash(self, raw_values: TensorType) -> TensorType:
    # Map ]-inf, inf[ via tanh into [low, high].
    # Returned values are within [low, high] (including `low` and `high`).
    squashed = ((torch.tanh(raw_values) + 1.0) / 2.0) * (
        self.high - self.low
    ) + self.low
    return torch.clamp(squashed, self.low, self.high)

def _unsquash(self, values: TensorType) -> TensorType:
    # Inverse of `_squash`: [low, high] -> ]-inf, inf[ via atanh.
    normed_values = (values - self.low) / (self.high - self.low) * 2.0 - 1.0
    # Stabilize input to atanh.
    save_normed_values = torch.clamp(
        normed_values, -1.0 + SMALL_NUMBER, 1.0 - SMALL_NUMBER
    )
    unsquashed = torch.atanh(save_normed_values)
    return unsquashed

@staticmethod
@override(ActionDistribution)
def required_model_output_shape(
    action_space: gym.Space, model_config: ModelConfigDict
) -> Union[int, np.ndarray]:
    # Mean + log-std per action dimension.
    return np.prod(action_space.shape, dtype=np.int32) * 2
+ """ + + def __init__( + self, + inputs: List[TensorType], + model: TorchModelV2, + low: float = 0.0, + high: float = 1.0, + ): + super().__init__(inputs, model) + # Stabilize input parameters (possibly coming from a linear layer). + self.inputs = torch.clamp(self.inputs, log(SMALL_NUMBER), -log(SMALL_NUMBER)) + self.inputs = torch.log(torch.exp(self.inputs) + 1.0) + 1.0 + self.low = low + self.high = high + alpha, beta = torch.chunk(self.inputs, 2, dim=-1) + # Note: concentration0==beta, concentration1=alpha (!) + self.dist = torch.distributions.Beta(concentration1=alpha, concentration0=beta) + + @override(ActionDistribution) + def deterministic_sample(self) -> TensorType: + self.last_sample = self._squash(self.dist.mean) + return self.last_sample + + @override(TorchDistributionWrapper) + def sample(self) -> TensorType: + # Use the reparameterization version of `dist.sample` to allow for + # the results to be backprop'able e.g. in a loss term. + normal_sample = self.dist.rsample() + self.last_sample = self._squash(normal_sample) + return self.last_sample + + @override(ActionDistribution) + def logp(self, x: TensorType) -> TensorType: + unsquashed_values = self._unsquash(x) + return torch.sum(self.dist.log_prob(unsquashed_values), dim=-1) + + def _squash(self, raw_values: TensorType) -> TensorType: + return raw_values * (self.high - self.low) + self.low + + def _unsquash(self, values: TensorType) -> TensorType: + return (values - self.low) / (self.high - self.low) + + @staticmethod + @override(ActionDistribution) + def required_model_output_shape( + action_space: gym.Space, model_config: ModelConfigDict + ) -> Union[int, np.ndarray]: + return np.prod(action_space.shape, dtype=np.int32) * 2 + + +@OldAPIStack +class TorchDeterministic(TorchDistributionWrapper): + """Action distribution that returns the input values directly. + + This is similar to DiagGaussian with standard deviation zero (thus only + requiring the "mean" values as NN output). 
+ """ + + @override(ActionDistribution) + def deterministic_sample(self) -> TensorType: + return self.inputs + + @override(TorchDistributionWrapper) + def sampled_action_logp(self) -> TensorType: + return torch.zeros((self.inputs.size()[0],), dtype=torch.float32) + + @override(TorchDistributionWrapper) + def sample(self) -> TensorType: + return self.deterministic_sample() + + @staticmethod + @override(ActionDistribution) + def required_model_output_shape( + action_space: gym.Space, model_config: ModelConfigDict + ) -> Union[int, np.ndarray]: + return np.prod(action_space.shape, dtype=np.int32) + + +@OldAPIStack +class TorchMultiActionDistribution(TorchDistributionWrapper): + """Action distribution that operates on multiple, possibly nested actions.""" + + def __init__(self, inputs, model, *, child_distributions, input_lens, action_space): + """Initializes a TorchMultiActionDistribution object. + + Args: + inputs (torch.Tensor): A single tensor of shape [BATCH, size]. + model (TorchModelV2): The TorchModelV2 object used to produce + inputs for this distribution. + child_distributions (any[torch.Tensor]): Any struct + that contains the child distribution classes to use to + instantiate the child distributions from `inputs`. This could + be an already flattened list or a struct according to + `action_space`. + input_lens (any[int]): A flat list or a nested struct of input + split lengths used to split `inputs`. + action_space (Union[gym.spaces.Dict,gym.spaces.Tuple]): The complex + and possibly nested action space. 
+ """ + if not isinstance(inputs, torch.Tensor): + inputs = torch.from_numpy(inputs) + if isinstance(model, TorchModelV2): + inputs = inputs.to(next(model.parameters()).device) + super().__init__(inputs, model) + + self.action_space_struct = get_base_struct_from_space(action_space) + + self.input_lens = tree.flatten(input_lens) + flat_child_distributions = tree.flatten(child_distributions) + split_inputs = torch.split(inputs, self.input_lens, dim=1) + self.flat_child_distributions = tree.map_structure( + lambda dist, input_: dist(input_, model), + flat_child_distributions, + list(split_inputs), + ) + + @override(ActionDistribution) + def logp(self, x): + if isinstance(x, np.ndarray): + x = torch.Tensor(x) + # Single tensor input (all merged). + if isinstance(x, torch.Tensor): + split_indices = [] + for dist in self.flat_child_distributions: + if isinstance(dist, TorchCategorical): + split_indices.append(1) + elif ( + isinstance(dist, TorchMultiCategorical) + and dist.action_space is not None + ): + split_indices.append(int(np.prod(dist.action_space.shape))) + else: + sample = dist.sample() + # Cover Box(shape=()) case. + if len(sample.shape) == 1: + split_indices.append(1) + else: + split_indices.append(sample.size()[1]) + split_x = list(torch.split(x, split_indices, dim=1)) + # Structured or flattened (by single action component) input. + else: + split_x = tree.flatten(x) + + def map_(val, dist): + # Remove extra categorical dimension. + if isinstance(dist, TorchCategorical): + val = (torch.squeeze(val, dim=-1) if len(val.shape) > 1 else val).int() + return dist.logp(val) + + # Remove extra categorical dimension and take the logp of each + # component. 
+ flat_logps = tree.map_structure(map_, split_x, self.flat_child_distributions) + + return functools.reduce(lambda a, b: a + b, flat_logps) + + @override(ActionDistribution) + def kl(self, other): + kl_list = [ + d.kl(o) + for d, o in zip( + self.flat_child_distributions, other.flat_child_distributions + ) + ] + return functools.reduce(lambda a, b: a + b, kl_list) + + @override(ActionDistribution) + def entropy(self): + entropy_list = [d.entropy() for d in self.flat_child_distributions] + return functools.reduce(lambda a, b: a + b, entropy_list) + + @override(ActionDistribution) + def sample(self): + child_distributions = tree.unflatten_as( + self.action_space_struct, self.flat_child_distributions + ) + return tree.map_structure(lambda s: s.sample(), child_distributions) + + @override(ActionDistribution) + def deterministic_sample(self): + child_distributions = tree.unflatten_as( + self.action_space_struct, self.flat_child_distributions + ) + return tree.map_structure( + lambda s: s.deterministic_sample(), child_distributions + ) + + @override(TorchDistributionWrapper) + def sampled_action_logp(self): + p = self.flat_child_distributions[0].sampled_action_logp() + for c in self.flat_child_distributions[1:]: + p += c.sampled_action_logp() + return p + + @override(ActionDistribution) + def required_model_output_shape(self, action_space, model_config): + return np.sum(self.input_lens, dtype=np.int32) + + +@OldAPIStack +class TorchDirichlet(TorchDistributionWrapper): + """Dirichlet distribution for continuous actions that are between + [0,1] and sum to 1. + + e.g. actions that represent resource allocation.""" + + def __init__(self, inputs, model): + """Input is a tensor of logits. The exponential of logits is used to + parametrize the Dirichlet distribution as all parameters need to be + positive. An arbitrary small epsilon is added to the concentration + parameters to be zero due to numerical error. + + See issue #4440 for more details. 
+ """ + self.epsilon = torch.tensor(1e-7).to(inputs.device) + concentration = torch.exp(inputs) + self.epsilon + self.dist = torch.distributions.dirichlet.Dirichlet( + concentration=concentration, + validate_args=True, + ) + super().__init__(concentration, model) + + @override(ActionDistribution) + def deterministic_sample(self) -> TensorType: + self.last_sample = nn.functional.softmax(self.dist.concentration, dim=-1) + return self.last_sample + + @override(ActionDistribution) + def logp(self, x): + # Support of Dirichlet are positive real numbers. x is already + # an array of positive numbers, but we clip to avoid zeros due to + # numerical errors. + x = torch.max(x, self.epsilon) + x = x / torch.sum(x, dim=-1, keepdim=True) + return self.dist.log_prob(x) + + @override(ActionDistribution) + def entropy(self): + return self.dist.entropy() + + @staticmethod + @override(ActionDistribution) + def required_model_output_shape(action_space, model_config): + return np.prod(action_space.shape, dtype=np.int32) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/torch_distributions.py b/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/torch_distributions.py new file mode 100644 index 0000000000000000000000000000000000000000..f2165f1bca65dae9c6edcfb459ca7e5c15402578 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/torch_distributions.py @@ -0,0 +1,682 @@ +"""The main difference between this and the old ActionDistribution is that this one +has more explicit input args. So that the input format does not have to be guessed from +the code. This matches the design pattern of torch distribution which developers may +already be familiar with. 
+""" +import gymnasium as gym +import numpy as np +from typing import Dict, Iterable, List, Optional +import tree +import abc + + +from ray.rllib.models.distributions import Distribution +from ray.rllib.utils.annotations import override, DeveloperAPI +from ray.rllib.utils.framework import try_import_torch +from ray.rllib.utils.numpy import MAX_LOG_NN_OUTPUT, MIN_LOG_NN_OUTPUT, SMALL_NUMBER +from ray.rllib.utils.typing import TensorType, Union, Tuple + +torch, nn = try_import_torch() + + +@DeveloperAPI +class TorchDistribution(Distribution, abc.ABC): + """Wrapper class for torch.distributions.""" + + def __init__(self, *args, **kwargs): + super().__init__() + self._dist = self._get_torch_distribution(*args, **kwargs) + + @abc.abstractmethod + def _get_torch_distribution( + self, *args, **kwargs + ) -> "torch.distributions.Distribution": + """Returns the torch.distributions.Distribution object to use.""" + + @override(Distribution) + def logp(self, value: TensorType, **kwargs) -> TensorType: + return self._dist.log_prob(value, **kwargs) + + @override(Distribution) + def entropy(self) -> TensorType: + return self._dist.entropy() + + @override(Distribution) + def kl(self, other: "Distribution") -> TensorType: + return torch.distributions.kl.kl_divergence(self._dist, other._dist) + + @override(Distribution) + def sample( + self, + *, + sample_shape=None, + ) -> Union[TensorType, Tuple[TensorType, TensorType]]: + sample = self._dist.sample( + sample_shape if sample_shape is not None else torch.Size() + ) + return sample + + @override(Distribution) + def rsample( + self, + *, + sample_shape=None, + ) -> Union[TensorType, Tuple[TensorType, TensorType]]: + rsample = self._dist.rsample( + sample_shape if sample_shape is not None else torch.Size() + ) + return rsample + + +@DeveloperAPI +class TorchCategorical(TorchDistribution): + """Wrapper class for PyTorch Categorical distribution. 
+ + Creates a categorical distribution parameterized by either :attr:`probs` or + :attr:`logits` (but not both). + + Samples are integers from :math:`\{0, \ldots, K-1\}` where `K` is + ``probs.size(-1)``. + + If `probs` is 1-dimensional with length-`K`, each element is the relative + probability of sampling the class at that index. + + If `probs` is N-dimensional, the first N-1 dimensions are treated as a batch of + relative probability vectors. + + .. testcode:: + :skipif: True + + m = TorchCategorical(torch.tensor([ 0.25, 0.25, 0.25, 0.25 ])) + m.sample(sample_shape=(2,)) # equal probability of 0, 1, 2, 3 + + .. testoutput:: + + tensor([3, 4]) + + Args: + logits: Event log probabilities (unnormalized) + probs: The probablities of each event. + temperature: In case of using logits, this parameter can be used to determine + the sharpness of the distribution. i.e. + ``probs = softmax(logits / temperature)``. The temperature must be strictly + positive. A low value (e.g. 1e-10) will result in argmax sampling while a + larger value will result in uniform sampling. + """ + + @override(TorchDistribution) + def __init__( + self, + logits: "torch.Tensor" = None, + probs: "torch.Tensor" = None, + ) -> None: + # We assert this here because to_deterministic makes this assumption. + assert (probs is None) != ( + logits is None + ), "Exactly one out of `probs` and `logits` must be set!" + + self.probs = probs + self.logits = logits + super().__init__(logits=logits, probs=probs) + + # Build this distribution only if really needed (in `self.rsample()`). It's + # quite expensive according to cProfile. 
+ self._one_hot = None + + @override(TorchDistribution) + def _get_torch_distribution( + self, + logits: "torch.Tensor" = None, + probs: "torch.Tensor" = None, + ) -> "torch.distributions.Distribution": + return torch.distributions.categorical.Categorical(logits=logits, probs=probs) + + @staticmethod + @override(Distribution) + def required_input_dim(space: gym.Space, **kwargs) -> int: + assert isinstance(space, gym.spaces.Discrete) + return int(space.n) + + @override(Distribution) + def rsample(self, sample_shape=()): + if self._one_hot is None: + self._one_hot = torch.distributions.one_hot_categorical.OneHotCategorical( + logits=self.logits, probs=self.probs + ) + one_hot_sample = self._one_hot.sample(sample_shape) + return (one_hot_sample - self.probs).detach() + self.probs + + @classmethod + @override(Distribution) + def from_logits(cls, logits: TensorType, **kwargs) -> "TorchCategorical": + return TorchCategorical(logits=logits, **kwargs) + + def to_deterministic(self) -> "TorchDeterministic": + if self.probs is not None: + probs_or_logits = self.probs + else: + probs_or_logits = self.logits + + return TorchDeterministic(loc=torch.argmax(probs_or_logits, dim=-1)) + + +@DeveloperAPI +class TorchDiagGaussian(TorchDistribution): + """Wrapper class for PyTorch Normal distribution. + + Creates a normal distribution parameterized by :attr:`loc` and :attr:`scale`. In + case of multi-dimensional distribution, the variance is assumed to be diagonal. + + .. testcode:: + :skipif: True + + loc, scale = torch.tensor([0.0, 0.0]), torch.tensor([1.0, 1.0]) + m = TorchDiagGaussian(loc=loc, scale=scale) + m.sample(sample_shape=(2,)) # 2d normal dist with loc=0 and scale=1 + + .. testoutput:: + + tensor([[ 0.1046, -0.6120], [ 0.234, 0.556]]) + + .. testcode:: + :skipif: True + + # scale is None + m = TorchDiagGaussian(loc=torch.tensor([0.0, 1.0])) + m.sample(sample_shape=(2,)) # normally distributed with loc=0 and scale=1 + + .. 
testoutput:: + + tensor([0.1046, 0.6120]) + + + Args: + loc: mean of the distribution (often referred to as mu). If scale is None, the + second half of the `loc` will be used as the log of scale. + scale: standard deviation of the distribution (often referred to as sigma). + Has to be positive. + """ + + @override(TorchDistribution) + def __init__( + self, + loc: Union[float, "torch.Tensor"], + scale: Optional[Union[float, "torch.Tensor"]], + ): + self.loc = loc + super().__init__(loc=loc, scale=scale) + + def _get_torch_distribution(self, loc, scale) -> "torch.distributions.Distribution": + return torch.distributions.normal.Normal(loc, scale) + + @override(TorchDistribution) + def logp(self, value: TensorType) -> TensorType: + return super().logp(value).sum(-1) + + @override(TorchDistribution) + def entropy(self) -> TensorType: + return super().entropy().sum(-1) + + @override(TorchDistribution) + def kl(self, other: "TorchDistribution") -> TensorType: + return super().kl(other).sum(-1) + + @staticmethod + @override(Distribution) + def required_input_dim(space: gym.Space, **kwargs) -> int: + assert isinstance(space, gym.spaces.Box) + return int(np.prod(space.shape, dtype=np.int32) * 2) + + @classmethod + @override(Distribution) + def from_logits(cls, logits: TensorType, **kwargs) -> "TorchDiagGaussian": + loc, log_std = logits.chunk(2, dim=-1) + scale = log_std.exp() + return TorchDiagGaussian(loc=loc, scale=scale) + + def to_deterministic(self) -> "TorchDeterministic": + return TorchDeterministic(loc=self.loc) + + +@DeveloperAPI +class TorchSquashedGaussian(TorchDistribution): + @override(TorchDistribution) + def __init__( + self, + loc: Union[float, "torch.Tensor"], + scale: Optional[Union[float, "torch.Tensor"]] = 1.0, + low: float = -1.0, + high: float = 1.0, + ): + self.loc = loc + self.low = low + self.high = high + + super().__init__(loc=loc, scale=scale) + + def _get_torch_distribution(self, loc, scale) -> "torch.distributions.Distribution": + return 
torch.distributions.normal.Normal(loc, scale) + + @override(TorchDistribution) + def sample( + self, *, sample_shape=None + ) -> Union[TensorType, Tuple[TensorType, TensorType]]: + # Sample from the Normal distribution. + sample = super().sample( + sample_shape=sample_shape if sample_shape is not None else torch.Size() + ) + # Return the squashed sample. + return self._squash(sample) + + @override(TorchDistribution) + def rsample( + self, *, sample_shape=None + ) -> Union[TensorType, Tuple[TensorType, TensorType]]: + # Sample from the Normal distribution. + sample = super().rsample( + sample_shape=sample_shape if sample_shape is not None else torch.Size() + ) + # Return the squashed sample. + return self._squash(sample) + + @override(TorchDistribution) + def logp(self, value: TensorType, **kwargs) -> TensorType: + # Unsquash value. + value = self._unsquash(value) + # Get log-probabilities from Normal distribution. + logp = super().logp(value, **kwargs) + # Clip the log probabilities as a safeguard and sum. + logp = torch.clamp(logp, -100, 100).sum(-1) + # Return the log probabilities for squashed Normal. + value = torch.tanh(value) + return logp - torch.log(1 - value**2 + SMALL_NUMBER).sum(-1) + + @override(TorchDistribution) + def entropy(self) -> TensorType: + raise ValueError("ENtropy not defined for `TorchSquashedGaussian`.") + + @override(TorchDistribution) + def kl(self, other: Distribution) -> TensorType: + raise ValueError("KL not defined for `TorchSquashedGaussian`.") + + def _squash(self, sample: TensorType) -> TensorType: + # Rescale the sample to interval given by the bounds (including the bounds). + sample = ((torch.tanh(sample) + 1.0) / 2.0) * (self.high - self.low) + self.low + # Return a clipped sample to comply with the bounds. + return torch.clamp(sample, self.low, self.high) + + def _unsquash(self, sample: TensorType) -> TensorType: + # Rescale to [-1.0, 1.0]. 
+ sample = (sample - self.low) / (self.high - self.low) * 2.0 - 1.0 + # Stabilize input to atanh function. + sample = torch.clamp(sample, -1.0 + SMALL_NUMBER, 1.0 - SMALL_NUMBER) + return torch.atanh(sample) + + @staticmethod + @override(Distribution) + def required_input_dim(space: gym.Space, **kwargs) -> int: + assert isinstance(space, gym.spaces.Box), space + return int(np.prod(space.shape, dtype=np.int32) * 2) + + @classmethod + @override(TorchDistribution) + def from_logits( + cls, logits: TensorType, low: float = -1.0, high: float = 1.0, **kwargs + ) -> "TorchSquashedGaussian": + loc, log_std = logits.chunk(2, dim=-1) + # Clip the `scale` values (coming from the `RLModule.forward()`) to + # reasonable values. + log_std = torch.clamp(log_std, MIN_LOG_NN_OUTPUT, MAX_LOG_NN_OUTPUT) + scale = log_std.exp() + + # Assert that `low` is smaller than `high`. + assert np.all(np.less(low, high)) + # Return class instance. + return TorchSquashedGaussian(loc=loc, scale=scale, low=low, high=high) + + def to_deterministic(self) -> Distribution: + return TorchDeterministic(loc=self.loc) + + +@DeveloperAPI +class TorchDeterministic(Distribution): + """The distribution that returns the input values directly. + + This is similar to DiagGaussian with standard deviation zero (thus only + requiring the "mean" values as NN output). + + Note: entropy is always zero, ang logp and kl are not implemented. + + .. testcode:: + :skipif: True + + m = TorchDeterministic(loc=torch.tensor([0.0, 0.0])) + m.sample(sample_shape=(2,)) + + .. 
testoutput:: + + tensor([[ 0.0, 0.0], [ 0.0, 0.0]]) + + Args: + loc: the determinsitic value to return + """ + + @override(Distribution) + def __init__(self, loc: "torch.Tensor") -> None: + super().__init__() + self.loc = loc + + @override(Distribution) + def sample( + self, + *, + sample_shape=None, + **kwargs, + ) -> Union[TensorType, Tuple[TensorType, TensorType]]: + device = self.loc.device + dtype = self.loc.dtype + shape = ( + sample_shape if sample_shape is not None else torch.Size() + ) + self.loc.shape + return torch.ones(shape, device=device, dtype=dtype) * self.loc + + def rsample( + self, + *, + sample_shape: Tuple[int, ...] = None, + **kwargs, + ) -> Union[TensorType, Tuple[TensorType, TensorType]]: + raise NotImplementedError + + @override(Distribution) + def logp(self, value: TensorType, **kwargs) -> TensorType: + return torch.zeros_like(self.loc) + + @override(Distribution) + def entropy(self, **kwargs) -> TensorType: + raise RuntimeError(f"`entropy()` not supported for {self.__class__.__name__}.") + + @override(Distribution) + def kl(self, other: "Distribution", **kwargs) -> TensorType: + raise RuntimeError(f"`kl()` not supported for {self.__class__.__name__}.") + + @staticmethod + @override(Distribution) + def required_input_dim(space: gym.Space, **kwargs) -> int: + assert isinstance(space, gym.spaces.Box) + return int(np.prod(space.shape, dtype=np.int32)) + + @classmethod + @override(Distribution) + def from_logits(cls, logits: TensorType, **kwargs) -> "TorchDeterministic": + return TorchDeterministic(loc=logits) + + def to_deterministic(self) -> "TorchDeterministic": + return self + + +@DeveloperAPI +class TorchMultiCategorical(Distribution): + """MultiCategorical distribution for MultiDiscrete action spaces.""" + + @override(Distribution) + def __init__( + self, + categoricals: List[TorchCategorical], + ): + super().__init__() + self._cats = categoricals + + @override(Distribution) + def sample(self) -> TensorType: + arr = [cat.sample() for cat 
in self._cats] + sample_ = torch.stack(arr, dim=-1) + return sample_ + + @override(Distribution) + def rsample(self, sample_shape=()): + arr = [cat.rsample() for cat in self._cats] + sample_ = torch.stack(arr, dim=-1) + return sample_ + + @override(Distribution) + def logp(self, value: "torch.Tensor") -> TensorType: + value = torch.unbind(value, dim=-1) + logps = torch.stack([cat.logp(act) for cat, act in zip(self._cats, value)]) + return torch.sum(logps, dim=0) + + @override(Distribution) + def entropy(self) -> TensorType: + return torch.sum( + torch.stack([cat.entropy() for cat in self._cats], dim=-1), dim=-1 + ) + + @override(Distribution) + def kl(self, other: Distribution) -> TensorType: + kls = torch.stack( + [cat.kl(oth_cat) for cat, oth_cat in zip(self._cats, other._cats)], + dim=-1, + ) + return torch.sum(kls, dim=-1) + + @staticmethod + @override(Distribution) + def required_input_dim(space: gym.Space, **kwargs) -> int: + assert isinstance(space, gym.spaces.MultiDiscrete) + return int(np.sum(space.nvec)) + + @classmethod + @override(Distribution) + def from_logits( + cls, + logits: "torch.Tensor", + input_lens: List[int], + temperatures: List[float] = None, + **kwargs, + ) -> "TorchMultiCategorical": + """Creates this Distribution from logits (and additional arguments). + + If you wish to create this distribution from logits only, please refer to + `Distribution.get_partial_dist_cls()`. + + Args: + logits: The tensor containing logits to be separated by logit_lens. + child_distribution_cls_struct: A struct of Distribution classes that can + be instantiated from the given logits. + input_lens: A list of integers that indicate the length of the logits + vectors to be passed into each child distribution. + temperatures: A list of floats representing the temperature to use for + each Categorical distribution. If not provided, 1.0 is used for all. + **kwargs: Forward compatibility kwargs. 
+ """ + if not temperatures: + # If temperatures are not provided, use 1.0 for all actions. + temperatures = [1.0] * len(input_lens) + + assert ( + sum(input_lens) == logits.shape[-1] + ), "input_lens must sum to logits.shape[-1]" + assert len(input_lens) == len( + temperatures + ), "input_lens and temperatures must be same length" + + categoricals = [ + TorchCategorical(logits=logits) + for logits in torch.split(logits, input_lens, dim=-1) + ] + + return TorchMultiCategorical(categoricals=categoricals) + + def to_deterministic(self) -> "TorchDeterministic": + if self._cats[0].probs is not None: + probs_or_logits = nn.utils.rnn.pad_sequence( + [cat.logits.t() for cat in self._cats], padding_value=-torch.inf + ) + else: + probs_or_logits = nn.utils.rnn.pad_sequence( + [cat.logits.t() for cat in self._cats], padding_value=-torch.inf + ) + + return TorchDeterministic(loc=torch.argmax(probs_or_logits, dim=0)) + + +@DeveloperAPI +class TorchMultiDistribution(Distribution): + """Action distribution that operates on multiple, possibly nested actions.""" + + def __init__( + self, + child_distribution_struct: Union[Tuple, List, Dict], + ): + """Initializes a TorchMultiActionDistribution object. + + Args: + child_distribution_struct: A complex struct that contains the child + distribution instances that make up this multi-distribution. + """ + super().__init__() + self._original_struct = child_distribution_struct + self._flat_child_distributions = tree.flatten(child_distribution_struct) + + @override(Distribution) + def rsample( + self, + *, + sample_shape: Tuple[int, ...] 
= None, + **kwargs, + ) -> Union[TensorType, Tuple[TensorType, TensorType]]: + rsamples = [] + for dist in self._flat_child_distributions: + rsample = dist.rsample(sample_shape=sample_shape, **kwargs) + rsamples.append(rsample) + + rsamples = tree.unflatten_as(self._original_struct, rsamples) + + return rsamples + + @override(Distribution) + def logp(self, value: TensorType) -> TensorType: + # Different places in RLlib use this method with different inputs. + # We therefore need to handle a flattened and concatenated input, as well as + # a nested one. + # TODO(Artur): Deprecate tensor inputs, only allow nested structures. + if isinstance(value, torch.Tensor): + split_indices = [] + for dist in self._flat_child_distributions: + if isinstance(dist, TorchCategorical): + split_indices.append(1) + elif isinstance(dist, TorchMultiCategorical): + split_indices.append(len(dist._cats)) + else: + sample = dist.sample() + # Cover Box(shape=()) case. + if len(sample.shape) == 1: + split_indices.append(1) + else: + split_indices.append(sample.size()[1]) + split_value = list(torch.split(value, split_indices, dim=1)) + else: + split_value = tree.flatten(value) + + def map_(val, dist): + # Remove extra dimension if present. 
+ if ( + isinstance(dist, TorchCategorical) + and val.shape[-1] == 1 + and len(val.shape) > 1 + ): + val = torch.squeeze(val, dim=-1) + return dist.logp(val) + + flat_logps = tree.map_structure( + map_, split_value, self._flat_child_distributions + ) + + return sum(flat_logps) + + @override(Distribution) + def kl(self, other: Distribution) -> TensorType: + kl_list = [ + d.kl(o) + for d, o in zip( + self._flat_child_distributions, other._flat_child_distributions + ) + ] + return sum(kl_list) + + @override(Distribution) + def entropy(self): + entropy_list = [d.entropy() for d in self._flat_child_distributions] + return sum(entropy_list) + + @override(Distribution) + def sample(self): + child_distributions_struct = tree.unflatten_as( + self._original_struct, self._flat_child_distributions + ) + return tree.map_structure(lambda s: s.sample(), child_distributions_struct) + + @staticmethod + @override(Distribution) + def required_input_dim( + space: gym.Space, input_lens: List[int], as_list: bool = False, **kwargs + ) -> int: + if as_list: + return input_lens + else: + return sum(input_lens) + + @classmethod + @override(Distribution) + def from_logits( + cls, + logits: "torch.Tensor", + child_distribution_cls_struct: Union[Dict, Iterable], + input_lens: Union[Dict, List[int]], + **kwargs, + ) -> "TorchMultiDistribution": + """Creates this Distribution from logits (and additional arguments). + + If you wish to create this distribution from logits only, please refer to + `Distribution.get_partial_dist_cls()`. + + Args: + logits: The tensor containing logits to be separated by `input_lens`. + child_distribution_cls_struct: A struct of Distribution classes that can + be instantiated from the given logits. + child_distribution_cls_struct: A struct of Distribution classes that can + be instantiated from the given logits. + input_lens: A list or dict of integers that indicate the length of each + logit. 
If this is given as a dict, the structure should match the + structure of child_distribution_cls_struct. + **kwargs: Forward compatibility kwargs. + + Returns: + A TorchMultiActionDistribution object. + """ + logit_lens = tree.flatten(input_lens) + child_distribution_cls_list = tree.flatten(child_distribution_cls_struct) + split_logits = torch.split(logits, logit_lens, dim=-1) + + child_distribution_list = tree.map_structure( + lambda dist, input_: dist.from_logits(input_), + child_distribution_cls_list, + list(split_logits), + ) + + child_distribution_struct = tree.unflatten_as( + child_distribution_cls_struct, child_distribution_list + ) + + return TorchMultiDistribution( + child_distribution_struct=child_distribution_struct, + ) + + def to_deterministic(self) -> "TorchMultiDistribution": + flat_deterministic_dists = [ + dist.to_deterministic() for dist in self._flat_child_distributions + ] + deterministic_dists = tree.unflatten_as( + self._original_struct, flat_deterministic_dists + ) + return TorchMultiDistribution(deterministic_dists) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/torch_modelv2.py b/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/torch_modelv2.py new file mode 100644 index 0000000000000000000000000000000000000000..dd473c70de3ef91384306ef7c23e58314179ba27 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/torch_modelv2.py @@ -0,0 +1,80 @@ +import gymnasium as gym +from typing import Dict, List, Union + +from ray.rllib.models.modelv2 import ModelV2 +from ray.rllib.utils.annotations import OldAPIStack, override +from ray.rllib.utils.framework import try_import_torch +from ray.rllib.utils.typing import ModelConfigDict, TensorType + +_, nn = try_import_torch() + + +@OldAPIStack +class TorchModelV2(ModelV2): + """Torch version of ModelV2. 
+ + Note that this class by itself is not a valid model unless you + inherit from nn.Module and implement forward() in a subclass.""" + + def __init__( + self, + obs_space: gym.spaces.Space, + action_space: gym.spaces.Space, + num_outputs: int, + model_config: ModelConfigDict, + name: str, + ): + """Initialize a TorchModelV2. + + Here is an example implementation for a subclass + ``MyModelClass(TorchModelV2, nn.Module)``:: + + def __init__(self, *args, **kwargs): + TorchModelV2.__init__(self, *args, **kwargs) + nn.Module.__init__(self) + self._hidden_layers = nn.Sequential(...) + self._logits = ... + self._value_branch = ... + """ + if not isinstance(self, nn.Module): + raise ValueError( + "Subclasses of TorchModelV2 must also inherit from " + "nn.Module, e.g., MyModel(TorchModelV2, nn.Module)" + ) + + ModelV2.__init__( + self, + obs_space, + action_space, + num_outputs, + model_config, + name, + framework="torch", + ) + + # Dict to store per multi-gpu tower stats into. + # In PyTorch multi-GPU, we use a single TorchPolicy and copy + # it's Model(s) n times (1 copy for each GPU). When computing the loss + # on each tower, we cannot store the stats (e.g. `entropy`) inside the + # policy object as this would lead to race conditions between the + # different towers all accessing the same property at the same time. 
+ self.tower_stats = {} + + @override(ModelV2) + def variables( + self, as_dict: bool = False + ) -> Union[List[TensorType], Dict[str, TensorType]]: + p = list(self.parameters()) + if as_dict: + return {k: p[i] for i, k in enumerate(self.state_dict().keys())} + return p + + @override(ModelV2) + def trainable_variables( + self, as_dict: bool = False + ) -> Union[List[TensorType], Dict[str, TensorType]]: + if as_dict: + return { + k: v for k, v in self.variables(as_dict=True).items() if v.requires_grad + } + return [v for v in self.variables() if v.requires_grad] diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/visionnet.py b/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/visionnet.py new file mode 100644 index 0000000000000000000000000000000000000000..748ba5796e3bfb76cc0c04a4a2d14cffaede48ab --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/models/torch/visionnet.py @@ -0,0 +1,293 @@ +import numpy as np +from typing import Dict, List +import gymnasium as gym + +from ray.rllib.models.torch.torch_modelv2 import TorchModelV2 +from ray.rllib.models.torch.misc import ( + normc_initializer, + same_padding, + SlimConv2d, + SlimFC, +) +from ray.rllib.models.utils import get_activation_fn, get_filter_config +from ray.rllib.utils.annotations import OldAPIStack, override +from ray.rllib.utils.framework import try_import_torch +from ray.rllib.utils.typing import ModelConfigDict, TensorType + +torch, nn = try_import_torch() + + +@OldAPIStack +class VisionNetwork(TorchModelV2, nn.Module): + """Generic vision network.""" + + def __init__( + self, + obs_space: gym.spaces.Space, + action_space: gym.spaces.Space, + num_outputs: int, + model_config: ModelConfigDict, + name: str, + ): + if not model_config.get("conv_filters"): + model_config["conv_filters"] = get_filter_config(obs_space.shape) + + TorchModelV2.__init__( + self, obs_space, action_space, num_outputs, model_config, name + ) + nn.Module.__init__(self) + + activation = 
        # Conv-stack config (activation + filter specs [out_channels, kernel,
        # stride] per layer).
        activation = self.model_config.get("conv_activation")
        filters = self.model_config["conv_filters"]
        assert len(filters) > 0, "Must provide at least 1 entry in `conv_filters`!"

        # Post FC net config.
        post_fcnet_hiddens = model_config.get("post_fcnet_hiddens", [])
        post_fcnet_activation = get_activation_fn(
            model_config.get("post_fcnet_activation"), framework="torch"
        )

        no_final_linear = self.model_config.get("no_final_linear")
        vf_share_layers = self.model_config.get("vf_share_layers")

        # Whether the last layer is the output of a Flattened (rather than
        # a n x (1,1) Conv2D).
        self.last_layer_is_flattened = False
        self._logits = None

        layers = []
        # NOTE(review): assumes channels-last (H, W, C)-shaped observation
        # space (NHWC); inputs are permuted to NCHW in `forward()`.
        (w, h, in_channels) = obs_space.shape

        # Build all but the last conv layer with "same"-style padding, tracking
        # the spatial output size layer by layer.
        in_size = [w, h]
        for out_channels, kernel, stride in filters[:-1]:
            padding, out_size = same_padding(in_size, kernel, stride)
            layers.append(
                SlimConv2d(
                    in_channels,
                    out_channels,
                    kernel,
                    stride,
                    padding,
                    activation_fn=activation,
                )
            )
            in_channels = out_channels
            in_size = out_size

        out_channels, kernel, stride = filters[-1]

        # No final linear: Last layer has activation function and exits with
        # num_outputs nodes (this could be a 1x1 conv or a FC layer, depending
        # on `post_fcnet_...` settings).
        if no_final_linear and num_outputs:
            # If a post-FC stack follows, keep the conv's own channel count;
            # otherwise the conv itself must emit `num_outputs` channels.
            out_channels = out_channels if post_fcnet_hiddens else num_outputs
            layers.append(
                SlimConv2d(
                    in_channels,
                    out_channels,
                    kernel,
                    stride,
                    None,  # padding=valid
                    activation_fn=activation,
                )
            )

            # Add (optional) post-fc-stack after last Conv2D layer.
            layer_sizes = post_fcnet_hiddens[:-1] + (
                [num_outputs] if post_fcnet_hiddens else []
            )
            for i, out_size in enumerate(layer_sizes):
                layers.append(
                    SlimFC(
                        in_size=out_channels,
                        out_size=out_size,
                        activation_fn=post_fcnet_activation,
                        initializer=normc_initializer(1.0),
                    )
                )
                out_channels = out_size

        # Finish network normally (w/o overriding last layer size with
        # `num_outputs`), then add another linear one of size `num_outputs`.
        else:
            layers.append(
                SlimConv2d(
                    in_channels,
                    out_channels,
                    kernel,
                    stride,
                    None,  # padding=valid
                    activation_fn=activation,
                )
            )

            # num_outputs defined. Use that to create an exact
            # `num_output`-sized (1,1)-Conv2D.
            if num_outputs:
                # Spatial size after the final "valid"-padded conv above.
                in_size = [
                    np.ceil((in_size[0] - kernel[0]) / stride),
                    np.ceil((in_size[1] - kernel[1]) / stride),
                ]
                padding, _ = same_padding(in_size, [1, 1], [1, 1])
                if post_fcnet_hiddens:
                    layers.append(nn.Flatten())
                    in_size = out_channels
                    # Add (optional) post-fc-stack after last Conv2D layer.
                    for i, out_size in enumerate(post_fcnet_hiddens + [num_outputs]):
                        layers.append(
                            SlimFC(
                                in_size=in_size,
                                out_size=out_size,
                                activation_fn=post_fcnet_activation
                                if i < len(post_fcnet_hiddens) - 1
                                else None,
                                initializer=normc_initializer(1.0),
                            )
                        )
                        in_size = out_size
                    # Last layer is logits layer.
                    self._logits = layers.pop()

                else:
                    # Final (1,1)-conv that maps to exactly `num_outputs`
                    # channels; applied separately in `forward()`.
                    self._logits = SlimConv2d(
                        out_channels,
                        num_outputs,
                        [1, 1],
                        1,
                        padding,
                        activation_fn=None,
                    )

            # num_outputs not known -> Flatten, then set self.num_outputs
            # to the resulting number of nodes.
            else:
                self.last_layer_is_flattened = True
                layers.append(nn.Flatten())

        self._convs = nn.Sequential(*layers)

        # If our num_outputs still unknown, we need to do a test pass to
        # figure out the output dimensions. This could be the case, if we have
        # the Flatten layer at the end.
        if self.num_outputs is None:
            # Create a B=1 dummy sample and push it through out conv-net.
            dummy_in = (
                torch.from_numpy(self.obs_space.sample())
                .permute(2, 0, 1)
                .unsqueeze(0)
                .float()
            )
            dummy_out = self._convs(dummy_in)
            self.num_outputs = dummy_out.shape[1]

        # Build the value layers
        self._value_branch_separate = self._value_branch = None
        if vf_share_layers:
            # Shared trunk: a single FC head on top of the conv features.
            self._value_branch = SlimFC(
                out_channels, 1, initializer=normc_initializer(0.01), activation_fn=None
            )
        else:
            # Separate value conv-stack mirroring the policy conv-stack,
            # topped by a 1-channel (1,1)-conv.
            vf_layers = []
            (w, h, in_channels) = obs_space.shape
            in_size = [w, h]
            for out_channels, kernel, stride in filters[:-1]:
                padding, out_size = same_padding(in_size, kernel, stride)
                vf_layers.append(
                    SlimConv2d(
                        in_channels,
                        out_channels,
                        kernel,
                        stride,
                        padding,
                        activation_fn=activation,
                    )
                )
                in_channels = out_channels
                in_size = out_size

            out_channels, kernel, stride = filters[-1]
            vf_layers.append(
                SlimConv2d(
                    in_channels,
                    out_channels,
                    kernel,
                    stride,
                    None,
                    activation_fn=activation,
                )
            )

            vf_layers.append(
                SlimConv2d(
                    in_channels=out_channels,
                    out_channels=1,
                    kernel=1,
                    stride=1,
                    padding=None,
                    activation_fn=None,
                )
            )
            self._value_branch_separate = nn.Sequential(*vf_layers)

        # Holds the current "base" output (before logits layer).
        self._features = None

    @override(TorchModelV2)
    def forward(
        self,
        input_dict: Dict[str, TensorType],
        state: List[TensorType],
        seq_lens: TensorType,
    ) -> (TensorType, List[TensorType]):
        """Runs the conv stack; returns (logits, state).

        Caches features for a subsequent `value_function()` call.
        """
        self._features = input_dict["obs"].float()
        # Permute b/c data comes in as [B, dim, dim, channels]:
        self._features = self._features.permute(0, 3, 1, 2)
        conv_out = self._convs(self._features)
        # Store features to save forward pass when getting value_function out.
        if not self._value_branch_separate:
            self._features = conv_out

        if not self.last_layer_is_flattened:
            if self._logits:
                conv_out = self._logits(conv_out)
            if len(conv_out.shape) == 4:
                # The conv stack must have reduced spatial dims to 1x1 so the
                # output can be squeezed into [B, num_outputs].
                if conv_out.shape[2] != 1 or conv_out.shape[3] != 1:
                    raise ValueError(
                        "Given `conv_filters` ({}) do not result in a [B, {} "
                        "(`num_outputs`), 1, 1] shape (but in {})! Please "
                        "adjust your Conv2D stack such that the last 2 dims "
                        "are both 1.".format(
                            self.model_config["conv_filters"],
                            self.num_outputs,
                            list(conv_out.shape),
                        )
                    )
                logits = conv_out.squeeze(3)
                logits = logits.squeeze(2)
            else:
                logits = conv_out
            return logits, state
        else:
            return conv_out, state

    @override(TorchModelV2)
    def value_function(self) -> TensorType:
        """Returns a [B]-shaped value estimate from the cached features."""
        assert self._features is not None, "must call forward() first"
        if self._value_branch_separate:
            # `self._features` still holds the raw (permuted) observations here.
            value = self._value_branch_separate(self._features)
            value = value.squeeze(3)
            value = value.squeeze(2)
            return value.squeeze(1)
        else:
            if not self.last_layer_is_flattened:
                features = self._features.squeeze(3)
                features = features.squeeze(2)
            else:
                features = self._features
            return self._value_branch(features).squeeze(1)

    def _hidden_layers(self, obs: TensorType) -> TensorType:
        # Run the conv stack on a channels-last obs batch and squeeze the
        # (expected 1x1) spatial dims away.
        res = self._convs(obs.permute(0, 3, 1, 2))  # switch to channel-major
        res = res.squeeze(3)
        res = res.squeeze(2)
        return res