diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e730866595c116798a29cb83852fcc2ac0094ec0 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/action/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/action/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/action/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/action/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5839518d4863e03ad2f4c253a0fe798279cd0dca Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/action/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/action/__pycache__/clip.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/action/__pycache__/clip.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1ed977ab10163a9c5c0f8748a000d5662887991b Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/action/__pycache__/clip.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/action/__pycache__/immutable.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/action/__pycache__/immutable.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f010f33588579f2bc564d01233eedb12281f7bc0 Binary files /dev/null and 
b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/action/__pycache__/immutable.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/action/__pycache__/lambdas.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/action/__pycache__/lambdas.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a8190bece945b48612ac43b898171cce588479e9 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/action/__pycache__/lambdas.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/action/__pycache__/normalize.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/action/__pycache__/normalize.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..24e3f66443ac1e511446ea5ee46410fb5b6ee547 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/action/__pycache__/normalize.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/action/__pycache__/pipeline.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/action/__pycache__/pipeline.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..781bea611eaa5a122a4a13eff9a54cdb0a906a56 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/action/__pycache__/pipeline.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/action/clip.py b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/action/clip.py new file mode 100644 index 0000000000000000000000000000000000000000..da7c8b97bf927bf4d97c7feed8285faa55ac89cf --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/action/clip.py @@ -0,0 +1,41 @@ +from typing import Any + +from ray.rllib.connectors.connector import ( + ActionConnector, + ConnectorContext, +) +from 
ray.rllib.connectors.registry import register_connector +from ray.rllib.utils.spaces.space_utils import clip_action, get_base_struct_from_space +from ray.rllib.utils.typing import ActionConnectorDataType +from ray.rllib.utils.annotations import OldAPIStack + + +@OldAPIStack +class ClipActionsConnector(ActionConnector): + def __init__(self, ctx: ConnectorContext): + super().__init__(ctx) + + self._action_space_struct = get_base_struct_from_space(ctx.action_space) + + def transform(self, ac_data: ActionConnectorDataType) -> ActionConnectorDataType: + assert isinstance( + ac_data.output, tuple + ), "Action connector requires PolicyOutputType data." + + actions, states, fetches = ac_data.output + return ActionConnectorDataType( + ac_data.env_id, + ac_data.agent_id, + ac_data.input_dict, + (clip_action(actions, self._action_space_struct), states, fetches), + ) + + def to_state(self): + return ClipActionsConnector.__name__, None + + @staticmethod + def from_state(ctx: ConnectorContext, params: Any): + return ClipActionsConnector(ctx) + + +register_connector(ClipActionsConnector.__name__, ClipActionsConnector) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/action/immutable.py b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/action/immutable.py new file mode 100644 index 0000000000000000000000000000000000000000..3f5c8bbd197cb6345cf07e712f2477897e25766b --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/action/immutable.py @@ -0,0 +1,40 @@ +from typing import Any + +import tree # pip install dm_tree + +from ray.rllib.connectors.connector import ( + ActionConnector, + ConnectorContext, +) +from ray.rllib.connectors.registry import register_connector +from ray.rllib.utils.numpy import make_action_immutable +from ray.rllib.utils.typing import ActionConnectorDataType +from ray.rllib.utils.annotations import OldAPIStack + + +@OldAPIStack +class ImmutableActionsConnector(ActionConnector): + def transform(self, ac_data: 
ActionConnectorDataType) -> ActionConnectorDataType: + assert isinstance( + ac_data.output, tuple + ), "Action connector requires PolicyOutputType data." + + actions, states, fetches = ac_data.output + tree.traverse(make_action_immutable, actions, top_down=False) + + return ActionConnectorDataType( + ac_data.env_id, + ac_data.agent_id, + ac_data.input_dict, + (actions, states, fetches), + ) + + def to_state(self): + return ImmutableActionsConnector.__name__, None + + @staticmethod + def from_state(ctx: ConnectorContext, params: Any): + return ImmutableActionsConnector(ctx) + + +register_connector(ImmutableActionsConnector.__name__, ImmutableActionsConnector) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/action/lambdas.py b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/action/lambdas.py new file mode 100644 index 0000000000000000000000000000000000000000..3bf862dd834d57e8b77a677c5f6fa7df8755779e --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/action/lambdas.py @@ -0,0 +1,76 @@ +from typing import Any, Callable, Dict, Type + +from ray.rllib.connectors.connector import ( + ActionConnector, + ConnectorContext, +) +from ray.rllib.connectors.registry import register_connector +from ray.rllib.utils.numpy import convert_to_numpy +from ray.rllib.utils.typing import ( + ActionConnectorDataType, + PolicyOutputType, + StateBatches, + TensorStructType, +) +from ray.rllib.utils.annotations import OldAPIStack + + +@OldAPIStack +def register_lambda_action_connector( + name: str, fn: Callable[[TensorStructType, StateBatches, Dict], PolicyOutputType] +) -> Type[ActionConnector]: + """A util to register any function transforming PolicyOutputType as an ActionConnector. + + The only requirement is that fn should take actions, states, and fetches as input, + and return transformed actions, states, and fetches. + + Args: + name: Name of the resulting actor connector. + fn: The function that transforms PolicyOutputType. 
+ + Returns: + A new ActionConnector class that transforms PolicyOutputType using fn. + """ + + class LambdaActionConnector(ActionConnector): + def transform( + self, ac_data: ActionConnectorDataType + ) -> ActionConnectorDataType: + assert isinstance( + ac_data.output, tuple + ), "Action connector requires PolicyOutputType data." + + actions, states, fetches = ac_data.output + return ActionConnectorDataType( + ac_data.env_id, + ac_data.agent_id, + ac_data.input_dict, + fn(actions, states, fetches), + ) + + def to_state(self): + return name, None + + @staticmethod + def from_state(ctx: ConnectorContext, params: Any): + return LambdaActionConnector(ctx) + + LambdaActionConnector.__name__ = name + LambdaActionConnector.__qualname__ = name + + register_connector(name, LambdaActionConnector) + + return LambdaActionConnector + + +# Convert actions and states into numpy arrays if necessary. +ConvertToNumpyConnector = OldAPIStack( + register_lambda_action_connector( + "ConvertToNumpyConnector", + lambda actions, states, fetches: ( + convert_to_numpy(actions), + convert_to_numpy(states), + fetches, + ), + ), +) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/action/normalize.py b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/action/normalize.py new file mode 100644 index 0000000000000000000000000000000000000000..67c3731469a76556350cd327f276f82765a22cff --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/action/normalize.py @@ -0,0 +1,44 @@ +from typing import Any + +from ray.rllib.connectors.connector import ( + ActionConnector, + ConnectorContext, +) +from ray.rllib.connectors.registry import register_connector +from ray.rllib.utils.spaces.space_utils import ( + get_base_struct_from_space, + unsquash_action, +) +from ray.rllib.utils.typing import ActionConnectorDataType +from ray.rllib.utils.annotations import OldAPIStack + + +@OldAPIStack +class NormalizeActionsConnector(ActionConnector): + def __init__(self, ctx: 
ConnectorContext): + super().__init__(ctx) + + self._action_space_struct = get_base_struct_from_space(ctx.action_space) + + def transform(self, ac_data: ActionConnectorDataType) -> ActionConnectorDataType: + assert isinstance( + ac_data.output, tuple + ), "Action connector requires PolicyOutputType data." + + actions, states, fetches = ac_data.output + return ActionConnectorDataType( + ac_data.env_id, + ac_data.agent_id, + ac_data.input_dict, + (unsquash_action(actions, self._action_space_struct), states, fetches), + ) + + def to_state(self): + return NormalizeActionsConnector.__name__, None + + @staticmethod + def from_state(ctx: ConnectorContext, params: Any): + return NormalizeActionsConnector(ctx) + + +register_connector(NormalizeActionsConnector.__name__, NormalizeActionsConnector) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/action/pipeline.py b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/action/pipeline.py new file mode 100644 index 0000000000000000000000000000000000000000..a93fd3eb340ebd40fb5c7ec23b0bf5960735b992 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/action/pipeline.py @@ -0,0 +1,61 @@ +import logging +from typing import Any, List +from collections import defaultdict + +from ray.rllib.connectors.connector import ( + ActionConnector, + Connector, + ConnectorContext, + ConnectorPipeline, +) +from ray.rllib.connectors.registry import get_connector, register_connector +from ray.rllib.utils.annotations import OldAPIStack +from ray.rllib.utils.typing import ActionConnectorDataType +from ray.util.timer import _Timer + + +logger = logging.getLogger(__name__) + + +@OldAPIStack +class ActionConnectorPipeline(ConnectorPipeline, ActionConnector): + def __init__(self, ctx: ConnectorContext, connectors: List[Connector]): + super().__init__(ctx, connectors) + self.timers = defaultdict(_Timer) + + def __call__(self, ac_data: ActionConnectorDataType) -> ActionConnectorDataType: + for c in 
self.connectors: + timer = self.timers[str(c)] + with timer: + ac_data = c(ac_data) + return ac_data + + def to_state(self): + children = [] + for c in self.connectors: + state = c.to_state() + assert isinstance(state, tuple) and len(state) == 2, ( + "Serialized connector state must be in the format of " + f"Tuple[name: str, params: Any]. Instead we got {state}" + f"for connector {c.__name__}." + ) + children.append(state) + return ActionConnectorPipeline.__name__, children + + @staticmethod + def from_state(ctx: ConnectorContext, params: Any): + assert ( + type(params) is list + ), "ActionConnectorPipeline takes a list of connector params." + connectors = [] + for state in params: + try: + name, subparams = state + connectors.append(get_connector(name, ctx, subparams)) + except Exception as e: + logger.error(f"Failed to de-serialize connector state: {state}") + raise e + return ActionConnectorPipeline(ctx, connectors) + + +register_connector(ActionConnectorPipeline.__name__, ActionConnectorPipeline) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/agent/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/agent/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a3eddb42be53f95b6fa0d13494d746f110fba37a Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/agent/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/agent/__pycache__/clip_reward.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/agent/__pycache__/clip_reward.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7286021251ba9629eccf2b4fbd23cd69f293291e Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/agent/__pycache__/clip_reward.cpython-311.pyc differ diff --git 
a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/agent/__pycache__/env_sampling.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/agent/__pycache__/env_sampling.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b3384eb2840d4c8c6cce8539ffbaf57c1c7c188d Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/agent/__pycache__/env_sampling.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/agent/__pycache__/pipeline.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/agent/__pycache__/pipeline.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8a0862f46d8d07d38277d67e1f34834f079cda10 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/agent/__pycache__/pipeline.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/agent/__pycache__/synced_filter.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/agent/__pycache__/synced_filter.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..296c43114244772df4105befb5360ca3dcb11e00 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/agent/__pycache__/synced_filter.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/agent/__pycache__/view_requirement.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/agent/__pycache__/view_requirement.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0ca2390f5eec0d6a30e6153446c0906241f02d25 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/agent/__pycache__/view_requirement.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/__init__.py 
b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..cf28ba9ae9fbc2924b9882a5ff5d476a772a2697 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/__init__.py @@ -0,0 +1,22 @@ +from ray.rllib.connectors.common.add_observations_from_episodes_to_batch import ( + AddObservationsFromEpisodesToBatch, +) +from ray.rllib.connectors.common.add_states_from_episodes_to_batch import ( + AddStatesFromEpisodesToBatch, +) +from ray.rllib.connectors.common.add_time_dim_to_batch_and_zero_pad import ( + AddTimeDimToBatchAndZeroPad, +) +from ray.rllib.connectors.common.agent_to_module_mapping import AgentToModuleMapping +from ray.rllib.connectors.common.batch_individual_items import BatchIndividualItems +from ray.rllib.connectors.common.numpy_to_tensor import NumpyToTensor + + +__all__ = [ + "AddObservationsFromEpisodesToBatch", + "AddStatesFromEpisodesToBatch", + "AddTimeDimToBatchAndZeroPad", + "AgentToModuleMapping", + "BatchIndividualItems", + "NumpyToTensor", +] diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f3f8444307c76391aed886fd63d73ac52a2414a2 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/__pycache__/add_observations_from_episodes_to_batch.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/__pycache__/add_observations_from_episodes_to_batch.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f995a98ed27a3c6bbc6d9a39af8dec4f8039044f Binary files /dev/null and 
b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/__pycache__/add_observations_from_episodes_to_batch.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/__pycache__/add_states_from_episodes_to_batch.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/__pycache__/add_states_from_episodes_to_batch.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8aef8133e225593f94c9054b6ba21f4418713ebc Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/__pycache__/add_states_from_episodes_to_batch.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/__pycache__/add_time_dim_to_batch_and_zero_pad.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/__pycache__/add_time_dim_to_batch_and_zero_pad.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6a189e908db71fafccb06e2ced960aa195b7a6ea Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/__pycache__/add_time_dim_to_batch_and_zero_pad.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/__pycache__/agent_to_module_mapping.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/__pycache__/agent_to_module_mapping.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..eb5c5e4682429cb6e682657fe519e8fb6de5f61f Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/__pycache__/agent_to_module_mapping.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/__pycache__/batch_individual_items.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/__pycache__/batch_individual_items.cpython-311.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..000def7a4eb1a79b21d3198cbb38bb503b4859ad Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/__pycache__/batch_individual_items.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/__pycache__/frame_stacking.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/__pycache__/frame_stacking.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f258c70bf3405caffe84ee44a3f93f91390ac4d7 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/__pycache__/frame_stacking.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/__pycache__/module_to_agent_unmapping.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/__pycache__/module_to_agent_unmapping.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..133d91fbda8ab0617407d13e43ea03ce804121d3 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/__pycache__/module_to_agent_unmapping.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/__pycache__/numpy_to_tensor.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/__pycache__/numpy_to_tensor.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8e5a6030c2b226a254e44eb0c7c537b3a45c0892 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/__pycache__/numpy_to_tensor.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/__pycache__/tensor_to_numpy.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/__pycache__/tensor_to_numpy.cpython-311.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..71dc34c995da6f677f084ba30a8b29575cb2b607 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/__pycache__/tensor_to_numpy.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/add_observations_from_episodes_to_batch.py b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/add_observations_from_episodes_to_batch.py new file mode 100644 index 0000000000000000000000000000000000000000..54fab7b064c55fd00621cb0ebf052690da63a698 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/add_observations_from_episodes_to_batch.py @@ -0,0 +1,180 @@ +from typing import Any, Dict, List, Optional + +import gymnasium as gym + +from ray.rllib.core.columns import Columns +from ray.rllib.connectors.connector_v2 import ConnectorV2 +from ray.rllib.core.rl_module.rl_module import RLModule +from ray.rllib.utils.annotations import override +from ray.rllib.utils.typing import EpisodeType +from ray.util.annotations import PublicAPI + + +@PublicAPI(stability="alpha") +class AddObservationsFromEpisodesToBatch(ConnectorV2): + """Gets the last observation from a running episode and adds it to the batch. + + Note: This is one of the default env-to-module or Learner ConnectorV2 pieces that + are added automatically by RLlib into every env-to-module/Learner connector + pipeline, unless `config.add_default_connectors_to_env_to_module_pipeline` or + `config.add_default_connectors_to_learner_pipeline ` are set to + False. + + The default env-to-module connector pipeline is: + [ + [0 or more user defined ConnectorV2 pieces], + AddObservationsFromEpisodesToBatch, + AddTimeDimToBatchAndZeroPad, + AddStatesFromEpisodesToBatch, + AgentToModuleMapping, # only in multi-agent setups! 
+ BatchIndividualItems, + NumpyToTensor, + ] + The default Learner connector pipeline is: + [ + [0 or more user defined ConnectorV2 pieces], + AddObservationsFromEpisodesToBatch, + AddColumnsFromEpisodesToTrainBatch, + AddTimeDimToBatchAndZeroPad, + AddStatesFromEpisodesToBatch, + AgentToModuleMapping, # only in multi-agent setups! + BatchIndividualItems, + NumpyToTensor, + ] + + This ConnectorV2: + - Operates on a list of Episode objects (single- or multi-agent). + - Gets the most recent observation(s) from all the given episodes and adds them + to the batch under construction (as a list of individual observations). + - Does NOT alter any observations (or other data) in the given episodes. + - Can be used in EnvToModule and Learner connector pipelines. + + .. testcode:: + + import gymnasium as gym + import numpy as np + + from ray.rllib.connectors.common import AddObservationsFromEpisodesToBatch + from ray.rllib.env.single_agent_episode import SingleAgentEpisode + from ray.rllib.utils.test_utils import check + + # Create two dummy SingleAgentEpisodes, each containing 2 observations, + # 1 action and 1 reward (both are length=1). + obs_space = gym.spaces.Box(-1.0, 1.0, (2,), np.float32) + act_space = gym.spaces.Discrete(2) + + episodes = [SingleAgentEpisode( + observations=[obs_space.sample(), obs_space.sample()], + actions=[act_space.sample()], + rewards=[1.0], + len_lookback_buffer=0, + ) for _ in range(2)] + eps_1_last_obs = episodes[0].get_observations(-1) + eps_2_last_obs = episodes[1].get_observations(-1) + print(f"1st Episode's last obs is {eps_1_last_obs}") + print(f"2nd Episode's last obs is {eps_2_last_obs}") + + # Create an instance of this class. + connector = AddObservationsFromEpisodesToBatch() + + # Call the connector with the two created episodes. + # Note that this particular connector works without an RLModule, so we + # simplify here for the sake of this example. 
+ output_batch = connector( + rl_module=None, + batch={}, + episodes=episodes, + explore=True, + shared_data={}, + ) + # The output data should now contain the last observations of both episodes, + # in a "per-episode organized" fashion. + check( + output_batch, + { + "obs": { + (episodes[0].id_,): [eps_1_last_obs], + (episodes[1].id_,): [eps_2_last_obs], + }, + }, + ) + """ + + def __init__( + self, + input_observation_space: Optional[gym.Space] = None, + input_action_space: Optional[gym.Space] = None, + *, + as_learner_connector: bool = False, + **kwargs, + ): + """Initializes a AddObservationsFromEpisodesToBatch instance. + + Args: + as_learner_connector: Whether this connector is part of a Learner connector + pipeline, as opposed to a env-to-module pipeline. As a Learner + connector, it will add an entire Episode's observations (each timestep) + to the batch. + """ + super().__init__( + input_observation_space=input_observation_space, + input_action_space=input_action_space, + **kwargs, + ) + + self._as_learner_connector = as_learner_connector + + @override(ConnectorV2) + def __call__( + self, + *, + rl_module: RLModule, + batch: Dict[str, Any], + episodes: List[EpisodeType], + explore: Optional[bool] = None, + shared_data: Optional[dict] = None, + **kwargs, + ) -> Any: + # If "obs" already in data, early out. + if Columns.OBS in batch: + return batch + for i, sa_episode in enumerate( + self.single_agent_episode_iterator( + episodes, + # If Learner connector, get all episodes (for train batch). + # If EnvToModule, get only those ongoing episodes that just had their + # agent step (b/c those are the ones we need to compute actions for + # next). + agents_that_stepped_only=not self._as_learner_connector, + ) + ): + if self._as_learner_connector: + # TODO (sven): Resolve this hack by adding a new connector piece that + # performs this very task. 
+ if "_" not in sa_episode.id_: + sa_episode.id_ += "_" + str(i) + + self.add_n_batch_items( + batch, + Columns.OBS, + # Add all observations, except the very last one. + # For a terminated episode, this is the terminal observation that + # has no value for training. + # For a truncated episode, algorithms either add an extra NEXT_OBS + # column to the batch (ex. DQN) or extend the episode length by one + # (using a separate connector piece and this truncated last obs), + # then bootstrap the value estimation for that extra timestep. + items_to_add=sa_episode.get_observations(slice(0, len(sa_episode))), + num_items=len(sa_episode), + single_agent_episode=sa_episode, + ) + else: + assert not sa_episode.is_numpy + self.add_batch_item( + batch, + Columns.OBS, + item_to_add=sa_episode.get_observations(-1), + single_agent_episode=sa_episode, + ) + + return batch diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/add_states_from_episodes_to_batch.py b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/add_states_from_episodes_to_batch.py new file mode 100644 index 0000000000000000000000000000000000000000..9e211dd255728c0668832eb3fdd65ea94cd4e98a --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/add_states_from_episodes_to_batch.py @@ -0,0 +1,348 @@ +import math +from typing import Any, Dict, List, Optional + +import gymnasium as gym +import numpy as np +import tree # pip install dm_tree + +from ray.rllib.connectors.connector_v2 import ConnectorV2 +from ray.rllib.core import DEFAULT_MODULE_ID +from ray.rllib.core.columns import Columns +from ray.rllib.core.rl_module.multi_rl_module import MultiRLModule +from ray.rllib.core.rl_module.rl_module import RLModule +from ray.rllib.utils.annotations import override +from ray.rllib.utils.numpy import convert_to_numpy +from ray.rllib.utils.typing import EpisodeType +from ray.util.annotations import PublicAPI + + +@PublicAPI(stability="alpha") +class 
class AddStatesFromEpisodesToBatch(ConnectorV2):
    """Gets last STATE_OUT from running episode and adds it as STATE_IN to the batch.

    Note: This is one of the default env-to-module or Learner ConnectorV2 pieces that
    are added automatically by RLlib into every env-to-module/Learner connector
    pipeline, unless `config.add_default_connectors_to_env_to_module_pipeline` or
    `config.add_default_connectors_to_learner_pipeline` are set to
    False.

    The default env-to-module connector pipeline is:
    [
        [0 or more user defined ConnectorV2 pieces],
        AddObservationsFromEpisodesToBatch,
        AddTimeDimToBatchAndZeroPad,
        AddStatesFromEpisodesToBatch,
        AgentToModuleMapping,  # only in multi-agent setups!
        BatchIndividualItems,
        NumpyToTensor,
    ]
    The default Learner connector pipeline is:
    [
        [0 or more user defined ConnectorV2 pieces],
        AddObservationsFromEpisodesToBatch,
        AddColumnsFromEpisodesToTrainBatch,
        AddTimeDimToBatchAndZeroPad,
        AddStatesFromEpisodesToBatch,
        AgentToModuleMapping,  # only in multi-agent setups!
        BatchIndividualItems,
        NumpyToTensor,
    ]

    If the RLModule is stateful, the episodes' STATE_OUTS will be extracted
    and restructured under a new STATE_IN key.
    As a Learner connector, the resulting STATE_IN batch has the shape (B', ...).
    Here, B' is the sum of splits we have to do over the given episodes, such that each
    chunk is at most `max_seq_len` long (T-axis).
    As an EnvToModule connector, the resulting STATE_IN batch simply consists of n
    states coming from n vectorized environments/episodes.

    Also, all other data (observations, rewards, etc.. if applicable) will be properly
    reshaped into (B, T=max_seq_len (learner) or 1 (env-to-module), ...) and will be
    zero-padded, if necessary.

    This ConnectorV2:
    - Operates on a list of Episode objects.
    - Gets the most recent STATE_OUT from all the given episodes and adds them under
      the STATE_IN key to the batch under construction.
    - Does NOT alter any data in the given episodes.
    - Can be used in EnvToModule and Learner connector pipelines.

    .. testcode::

        from ray.rllib.connectors.common import AddStatesFromEpisodesToBatch
        from ray.rllib.core.columns import Columns
        from ray.rllib.env.single_agent_episode import SingleAgentEpisode
        from ray.rllib.utils.test_utils import check

        # Create a simple dummy class, pretending to be an RLModule with
        # `get_initial_state`, `is_stateful` and `model_config` property defined:
        class MyStateModule:
            # dummy config
            model_config = {"max_seq_len": 2}

            def is_stateful(self):
                return True

            def get_initial_state(self):
                return 0.0


        # Create an empty episode. The connector should use the RLModule's initial
        # state to populate STATE_IN for the next forward pass.
        episode = SingleAgentEpisode()

        rl_module = MyStateModule()
        rl_module_init_state = rl_module.get_initial_state()

        # Create an instance of this class (as an env-to-module connector).
        connector = AddStatesFromEpisodesToBatch(as_learner_connector=False)

        # Call the connector.
        output_batch = connector(
            rl_module=rl_module,
            batch={},
            episodes=[episode],
            shared_data={},
        )
        # The output data's STATE_IN key should now contain the RLModule's initial
        # state plus the one state out found in the episode in a "per-episode
        # organized" fashion.
        check(
            output_batch[Columns.STATE_IN],
            {
                (episode.id_,): [rl_module_init_state],
            },
        )

        # Create a SingleAgentEpisode containing 5 observations,
        # 4 actions, 4 rewards, and 4 STATE_OUTs.
        # The same connector should now use the episode-stored last STATE_OUT as
        # STATE_IN for the next forward pass.
        episode = SingleAgentEpisode(
            observations=[0, 1, 2, 3, 4],
            actions=[1, 2, 3, 4],
            rewards=[1.0, 2.0, 3.0, 4.0],
            # STATE_OUT in episode will show up under STATE_IN in the batch.
            extra_model_outputs={
                Columns.STATE_OUT: [-4.0, -3.0, -2.0, -1.0],
            },
            len_lookback_buffer=0,
        )

        # Call the connector.
        output_batch = connector(
            rl_module=rl_module,
            batch={},
            episodes=[episode],
            shared_data={},
        )
        # The output data's STATE_IN key should now contain the episode's last
        # STATE_OUT, NOT the RLModule's initial state in a "per-episode organized"
        # fashion.
        check(
            output_batch[Columns.STATE_IN],
            {
                # Expect the episode's last STATE_OUT.
                (episode.id_,): [-1.0],
            },
        )

        # Create a new connector as a learner connector with a RNN seq len of 2 (for
        # testing purposes only). Passing the same data through this learner
        # connector, we expect the STATE_IN data to contain a) the initial module
        # state and then every 2nd STATE_OUT stored in the episode.
        connector = AddStatesFromEpisodesToBatch(as_learner_connector=True)

        # Call the connector.
        output_batch = connector(
            rl_module=rl_module,
            batch={},
            episodes=[episode],
            shared_data={},
        )
        check(
            output_batch[Columns.STATE_IN],
            {
                # Expect initial module state + every 2nd STATE_OUT from episode, but
                # not the very last one (just like the very last observation, this
                # data is NOT passed through the forward_train, b/c there is nothing
                # to learn at that timestep, unless we need to compute e.g. bootstrap
                # value predictions).
                # Also note that the different STATE_IN timesteps are already present
                # as one batched item per episode in the list.
                (episode.id_,): [rl_module_init_state, -3.0],
            },
        )
    """

    def __init__(
        self,
        input_observation_space: Optional[gym.Space] = None,
        input_action_space: Optional[gym.Space] = None,
        *,
        as_learner_connector: bool = False,
        **kwargs,
    ):
        """Initializes an AddStatesFromEpisodesToBatch instance.

        Args:
            as_learner_connector: Whether this connector is part of a Learner
                connector pipeline, as opposed to an env-to-module pipeline. As a
                Learner connector, it will add one STATE_IN entry per max_seq_len
                chunk of each episode to the batch (instead of only the most recent
                state per ongoing episode).
        """
        super().__init__(
            input_observation_space=input_observation_space,
            input_action_space=input_action_space,
            **kwargs,
        )

        # Remember which pipeline we are in; this switches `__call__` between the
        # per-chunk (learner) and most-recent-state (env-to-module) code paths.
        self._as_learner_connector = as_learner_connector

    @override(ConnectorV2)
    def __call__(
        self,
        *,
        rl_module: RLModule,
        batch: Dict[str, Any],
        episodes: List[EpisodeType],
        explore: Optional[bool] = None,
        shared_data: Optional[dict] = None,
        **kwargs,
    ) -> Any:
        # If not stateful OR STATE_IN already in data, early out.
        if not rl_module.is_stateful() or Columns.STATE_IN in batch:
            return batch

        for sa_episode in self.single_agent_episode_iterator(
            episodes,
            # If Learner connector, get all episodes (for train batch).
            # If EnvToModule, get only those ongoing episodes that just had their
            # agent step (b/c those are the ones we need to compute actions for next).
            agents_that_stepped_only=not self._as_learner_connector,
        ):
            if self._as_learner_connector:
                # Multi-agent case: Extract correct single agent RLModule (to get its
                # individual state).
                if sa_episode.module_id is not None:
                    sa_module = rl_module[sa_episode.module_id]
                else:
                    sa_module = (
                        rl_module[DEFAULT_MODULE_ID]
                        if isinstance(rl_module, MultiRLModule)
                        else rl_module
                    )
                # This single-agent RLModule is NOT stateful -> Skip.
                if not sa_module.is_stateful():
                    continue

                max_seq_len = sa_module.model_config["max_seq_len"]

                # look_back_state.shape=([state-dim],)
                look_back_state = (
                    # Episode has a (reset) beginning -> Prepend initial
                    # state.
                    convert_to_numpy(sa_module.get_initial_state())
                    if sa_episode.t_started == 0
                    or (Columns.STATE_OUT not in sa_episode.extra_model_outputs)
                    # Episode starts somewhere in the middle (is a cut
                    # continuation chunk) -> Use previous chunk's last
                    # STATE_OUT as initial state.
                    else sa_episode.get_extra_model_outputs(
                        key=Columns.STATE_OUT,
                        indices=-1,
                        neg_index_as_lookback=True,
                    )
                )
                # If we have `"state_out"`s (e.g. from rollouts) use them for the
                # `"state_in"`s.
                if Columns.STATE_OUT in sa_episode.extra_model_outputs:
                    # state_outs.shape=(T,[state-dim])  T=episode len
                    state_outs = sa_episode.get_extra_model_outputs(
                        key=Columns.STATE_OUT
                    )
                # Otherwise, we have no `"state_out"` (e.g. because we are sampling
                # from offline data and the expert policy was not stateful).
                else:
                    # Then simply use the `look_back_state`, i.e. in this case the
                    # initial state as `"state_in"` in training.
                    if sa_episode.is_numpy:
                        # Tile the single state along a new T axis, once per episode
                        # timestep.
                        state_outs = tree.map_structure(
                            lambda a, _sae=sa_episode: np.repeat(
                                a[np.newaxis, ...], len(_sae), axis=0
                            ),
                            look_back_state,
                        )
                    else:
                        state_outs = [look_back_state for _ in range(len(sa_episode))]
                # Explanation:
                # B=episode len // max_seq_len
                # [::max_seq_len]: only keep every Tth state.
                # [:-1]: Shift state outs by one; ignore very last
                # STATE_OUT, but therefore add the lookback/init state at
                # the beginning.
                items_to_add = (
                    tree.map_structure(
                        lambda i, o, m=max_seq_len: np.concatenate([[i], o[:-1]])[::m],
                        look_back_state,
                        state_outs,
                    )
                    if sa_episode.is_numpy
                    else ([look_back_state] + state_outs[:-1])[::max_seq_len]
                )
                self.add_n_batch_items(
                    batch=batch,
                    column=Columns.STATE_IN,
                    items_to_add=items_to_add,
                    # One STATE_IN per (possibly zero-padded) max_seq_len chunk.
                    num_items=int(math.ceil(len(sa_episode) / max_seq_len)),
                    single_agent_episode=sa_episode,
                )
                # If the batch also carries NEXT_OBS, provide matching
                # NEXT_STATE_IN entries (states aligned with the chunk starts,
                # shifted by one step, e.g. for bootstrap value computations).
                if Columns.NEXT_OBS in batch:
                    items_to_add = (
                        tree.map_structure(
                            lambda i, m=max_seq_len: i[::m],
                            state_outs,
                        )
                        if sa_episode.is_numpy
                        else state_outs[::max_seq_len]
                    )
                    self.add_n_batch_items(
                        batch=batch,
                        column=Columns.NEXT_STATE_IN,
                        items_to_add=items_to_add,
                        num_items=int(math.ceil(len(sa_episode) / max_seq_len)),
                        single_agent_episode=sa_episode,
                    )

            else:
                # Env-to-module: episodes are still "live" (not numpy-converted).
                assert not sa_episode.is_numpy

                # Multi-agent case: Extract correct single agent RLModule (to get the
                # state for individually).
                sa_module = rl_module
                if sa_episode.module_id is not None:
                    sa_module = rl_module[sa_episode.module_id]
                # This single-agent RLModule is NOT stateful -> Skip.
                if not sa_module.is_stateful():
                    continue

                # Episode just started or has no `"state_out"` (e.g. in offline
                # sampling) -> Get initial state from our RLModule.
                if (sa_episode.t_started == 0 and len(sa_episode) == 0) or (
                    Columns.STATE_OUT not in sa_episode.extra_model_outputs
                ):
                    state = sa_module.get_initial_state()
                # Episode is already ongoing -> Use most recent STATE_OUT.
                else:
                    state = sa_episode.get_extra_model_outputs(
                        key=Columns.STATE_OUT,
                        indices=-1,
                    )
                self.add_batch_item(
                    batch,
                    Columns.STATE_IN,
                    item_to_add=state,
                    single_agent_episode=sa_episode,
                )

        return batch
@PublicAPI(stability="alpha")
class AddTimeDimToBatchAndZeroPad(ConnectorV2):
    """Adds an extra time dim (axis=1) to all data currently in the batch.

    Note: This is one of the default env-to-module or Learner ConnectorV2 pieces that
    are added automatically by RLlib into every env-to-module/Learner connector
    pipeline, unless `config.add_default_connectors_to_env_to_module_pipeline` or
    `config.add_default_connectors_to_learner_pipeline` are set to
    False.

    The default env-to-module connector pipeline is:
    [
        [0 or more user defined ConnectorV2 pieces],
        AddObservationsFromEpisodesToBatch,
        AddTimeDimToBatchAndZeroPad,
        AddStatesFromEpisodesToBatch,
        AgentToModuleMapping,  # only in multi-agent setups!
        BatchIndividualItems,
        NumpyToTensor,
    ]
    The default Learner connector pipeline is:
    [
        [0 or more user defined ConnectorV2 pieces],
        AddObservationsFromEpisodesToBatch,
        AddColumnsFromEpisodesToTrainBatch,
        AddTimeDimToBatchAndZeroPad,
        AddStatesFromEpisodesToBatch,
        AgentToModuleMapping,  # only in multi-agent setups!
        BatchIndividualItems,
        NumpyToTensor,
    ]

    If the RLModule is stateful, an extra time dim at axis=1 is added to all data in
    the batch.

    Also, all data (observations, rewards, etc.. if applicable) will be properly
    reshaped into (B, T=max_seq_len (learner) or 1 (env-to-module), ...) and will be
    zero-padded, if necessary.

    This ConnectorV2:
    - Operates on a list of Episode objects.
    - Adds a time dim at axis=1 to all columns already in the batch.
    - In case of a learner connector pipeline, zero-pads the data according to the
      module's `self.model_config["max_seq_len"]` setting and reshapes all data to
      (B, T, ...). The connector also adds SEQ_LENS information and loss mask
      information to the batch based on the added zero-padding.
    - Does NOT alter any data in the given episodes.
    - Can be used in EnvToModule and Learner connector pipelines.

    .. testcode::

        from ray.rllib.connectors.common import AddTimeDimToBatchAndZeroPad
        from ray.rllib.core.columns import Columns
        from ray.rllib.env.single_agent_episode import SingleAgentEpisode
        from ray.rllib.utils.test_utils import check


        # Create a simple dummy class, pretending to be an RLModule with
        # `get_initial_state`, `is_stateful` and `model_config` property defined:
        class MyStateModule:
            # dummy config
            model_config = {"max_seq_len": 3}

            def is_stateful(self):
                return True

            def get_initial_state(self):
                return 0.0


        # Create an already reset episode. Expect the connector to add a time-dim to
        # the reset observation.
        episode = SingleAgentEpisode(observations=[0])

        rl_module = MyStateModule()

        # Create an instance of this class (as an env-to-module connector).
        connector = AddTimeDimToBatchAndZeroPad(as_learner_connector=False)

        # Call the connector.
        output_batch = connector(
            rl_module=rl_module,
            batch={Columns.OBS: [0]},
            episodes=[episode],
            shared_data={},
        )
        # The output data's OBS key should now be reshaped to (B, T)
        check(output_batch[Columns.OBS], [[0]])

        # Create a SingleAgentEpisode containing 5 observations,
        # 4 actions and 4 rewards.
        episode = SingleAgentEpisode(
            observations=[0, 1, 2, 3, 4],
            actions=[1, 2, 3, 4],
            rewards=[1.0, 2.0, 3.0, 4.0],
            len_lookback_buffer=0,
        )

        # Call the connector.
        output_batch = connector(
            rl_module=rl_module,
            batch={Columns.OBS: [4]},
            episodes=[episode],
            shared_data={},
        )
        # The output data's OBS key should now have a time rank.
        check(
            # Expect the episode's last OBS.
            output_batch[Columns.OBS], [[4]],
        )

        # Create a new connector as a learner connector with a RNN seq len of 3 (for
        # testing purposes only). Passing the same data through this learner
        # connector, we expect the data to also be zero-padded.
        connector = AddTimeDimToBatchAndZeroPad(as_learner_connector=True)

        # Call the connector.
        output_batch = connector(
            rl_module=rl_module,
            batch={Columns.OBS: {(episode.id_,): [0, 1, 2, 3]}},
            episodes=[episode],
            shared_data={},
        )
        check(output_batch[Columns.OBS], {(episode.id_,): [[0, 1, 2], [3, 0, 0]]})
    """

    def __init__(
        self,
        input_observation_space: Optional[gym.Space] = None,
        input_action_space: Optional[gym.Space] = None,
        *,
        as_learner_connector: bool = False,
        **kwargs,
    ):
        """Initializes an AddTimeDimToBatchAndZeroPad instance.

        Args:
            as_learner_connector: Whether this connector is part of a Learner
                connector pipeline, as opposed to an env-to-module pipeline. As a
                Learner connector, it will zero-pad and reshape all batch data into
                (B, T=max_seq_len, ...) chunks (instead of adding a single-timestep
                time rank).
        """
        super().__init__(
            input_observation_space=input_observation_space,
            input_action_space=input_action_space,
            **kwargs,
        )

        # Remember which pipeline we are in; this switches `__call__` between the
        # zero-padding (learner) and single-timestep (env-to-module) code paths.
        self._as_learner_connector = as_learner_connector

    @override(ConnectorV2)
    def __call__(
        self,
        *,
        rl_module: RLModule,
        batch: Dict[str, Any],
        episodes: List[EpisodeType],
        explore: Optional[bool] = None,
        shared_data: Optional[dict] = None,
        **kwargs,
    ) -> Any:

        # If not stateful OR STATE_IN already in data, early out.
        if not rl_module.is_stateful() or Columns.STATE_IN in batch:
            return batch

        # Make all inputs (other than STATE_IN) have an additional T-axis.
        # Since data has not been batched yet (we are still operating on lists in the
        # batch), we add this time axis as 0 (not 1). When we batch, the batch axis
        # will be 0 and the time axis will be 1.
        # Also, let module-to-env pipeline know that we had added a single timestep
        # time rank to the data (to remove it again).
        if not self._as_learner_connector:
            for column in batch.keys():
                self.foreach_batch_item_change_in_place(
                    batch=batch,
                    column=column,
                    func=lambda item, eps_id, aid, mid: (
                        item
                        if mid is not None and not rl_module[mid].is_stateful()
                        # Expand on axis 0 (the to-be-time-dim) if item has not been
                        # batched yet, otherwise axis=1 (the time-dim).
                        else tree.map_structure(
                            lambda s: np.expand_dims(
                                s, axis=(1 if isinstance(s, BatchedNdArray) else 0)
                            ),
                            item,
                        )
                    ),
                )
            shared_data["_added_single_ts_time_rank"] = True
        else:
            # Before adding STATE_IN to the `data`, zero-pad existing data and batch
            # into max_seq_len chunks.
            for column, column_data in batch.copy().items():
                # Do not zero-pad INFOS column.
                if column == Columns.INFOS:
                    continue
                for key, item_list in column_data.items():
                    # Multi-agent case AND RLModule is not stateful -> Do not
                    # zero-pad for this model.
                    assert isinstance(key, tuple)
                    mid = None
                    if len(key) == 3:
                        eps_id, aid, mid = key
                        if not rl_module[mid].is_stateful():
                            continue
                    column_data[key] = split_and_zero_pad(
                        item_list,
                        max_seq_len=self._get_max_seq_len(rl_module, module_id=mid),
                    )
                    # TODO (sven): Remove this hint/hack once we are not relying on
                    #  SampleBatch anymore (which has to set its property
                    #  zero_padded=True when shuffling).
                    shared_data[
                        (
                            "_zero_padded_for_mid="
                            f"{mid if mid is not None else DEFAULT_MODULE_ID}"
                        )
                    ] = True

            for sa_episode in self.single_agent_episode_iterator(
                # If Learner connector, get all episodes (for train batch).
                # If EnvToModule, get only those ongoing episodes that just had their
                # agent step (b/c those are the ones we need to compute actions for
                # next).
                episodes,
                agents_that_stepped_only=False,
            ):
                # Multi-agent case: Extract correct single agent RLModule (to get its
                # individual state).
                if sa_episode.module_id is not None:
                    sa_module = rl_module[sa_episode.module_id]
                else:
                    sa_module = (
                        rl_module[DEFAULT_MODULE_ID]
                        if isinstance(rl_module, MultiRLModule)
                        else rl_module
                    )
                # This single-agent RLModule is NOT stateful -> Skip.
                if not sa_module.is_stateful():
                    continue

                max_seq_len = sa_module.model_config["max_seq_len"]

                # Also, create the loss mask (b/c of our now possibly zero-padded
                # data) as well as the seq_lens array and add these to `data` as
                # well.
                mask, seq_lens = create_mask_and_seq_lens(len(sa_episode), max_seq_len)
                self.add_n_batch_items(
                    batch=batch,
                    column=Columns.SEQ_LENS,
                    items_to_add=seq_lens,
                    num_items=len(seq_lens),
                    single_agent_episode=sa_episode,
                )
                if not shared_data.get("_added_loss_mask_for_valid_episode_ts"):
                    self.add_n_batch_items(
                        batch=batch,
                        column=Columns.LOSS_MASK,
                        items_to_add=mask,
                        num_items=len(mask),
                        single_agent_episode=sa_episode,
                    )

        return batch

    def _get_max_seq_len(self, rl_module, module_id=None):
        # Resolve the (single-agent) module whose `model_config` holds the
        # max_seq_len setting; error out with a helpful message if it is missing.
        if not isinstance(rl_module, MultiRLModule):
            mod = rl_module
        elif module_id:
            mod = rl_module[module_id]
        else:
            mod = next(iter(rl_module.values()))
        if "max_seq_len" not in mod.model_config:
            raise ValueError(
                "You are using a stateful RLModule and are not providing a "
                "'max_seq_len' key inside your `model_config`. You can set this "
                "dict and/or override keys in it via `config.rl_module("
                "model_config={'max_seq_len': [some int]})`."
            )
        return mod.model_config["max_seq_len"]
@PublicAPI(stability="alpha")
class AgentToModuleMapping(ConnectorV2):
    """ConnectorV2 that performs mapping of data from AgentID based to ModuleID based.

    Note: This is one of the default env-to-module or Learner ConnectorV2 pieces that
    are added automatically by RLlib into every env-to-module/Learner connector
    pipeline, unless `config.add_default_connectors_to_env_to_module_pipeline` or
    `config.add_default_connectors_to_learner_pipeline` are set to
    False.

    The default env-to-module connector pipeline is:
    [
        [0 or more user defined ConnectorV2 pieces],
        AddObservationsFromEpisodesToBatch,
        AddTimeDimToBatchAndZeroPad,
        AddStatesFromEpisodesToBatch,
        AgentToModuleMapping,  # only in multi-agent setups!
        BatchIndividualItems,
        NumpyToTensor,
    ]
    The default Learner connector pipeline is:
    [
        [0 or more user defined ConnectorV2 pieces],
        AddObservationsFromEpisodesToBatch,
        AddColumnsFromEpisodesToTrainBatch,
        AddTimeDimToBatchAndZeroPad,
        AddStatesFromEpisodesToBatch,
        AgentToModuleMapping,  # only in multi-agent setups!
        BatchIndividualItems,
        NumpyToTensor,
    ]

    This connector piece is only used by RLlib (as a default connector piece) in a
    multi-agent setup.

    Note that before the mapping, `data` is expected to have the following
    structure:
    [col0]:
        (eps_id0, ag0, mod0): [list of individual batch items]
        (eps_id0, ag1, mod2): [list of individual batch items]
        (eps_id1, ag0, mod1): [list of individual batch items]
    [col1]:
        etc..

    The target structure of the above `data` would then be:
    [mod0]:
        [col0]: [batched data -> batch_size_B will be the number of all items in the
        input data under col0 that have mod0 as their ModuleID]
        [col1]: [batched data]
    [mod1]:
        [col0]: etc.

    Mapping happens in the following stages:

    1) Under each column name, sort keys first by EpisodeID, then AgentID.
    2) Add ModuleID keys under each column name (no cost/extra memory) and map these
       new keys to empty lists.
       [col0] -> [mod0] -> []: Then push items that belong to mod0 into these lists.
    3) Perform batching on the per-module lists under each column:
       [col0] -> [mod0]: [...] <- now batched data (numpy array or struct of numpy
       arrays).
    4) Flip column names with ModuleIDs (no cost/extra memory):
       [mod0]:
           [col0]: [batched data]
       etc..

    Note that in order to unmap the resulting batch back into an AgentID based one,
    we have to store the env vector index AND AgentID of each module's batch item
    in an additionally returned `memorized_map_structure`.

    .. testcode::

        from ray.rllib.connectors.env_to_module import AgentToModuleMapping
        from ray.rllib.utils.test_utils import check

        batch = {
            "obs": {
                ("MA-EPS0", "agent0", "module0"): [0, 1, 2],
                ("MA-EPS0", "agent1", "module1"): [3, 4, 5],
            },
            "actions": {
                ("MA-EPS1", "agent2", "module0"): [8],
                ("MA-EPS0", "agent1", "module1"): [9],
            },
        }

        # Create our connector piece.
        connector = AgentToModuleMapping(
            rl_module_specs={"module0", "module1"},
            agent_to_module_mapping_fn=(
                lambda agent_id, eps: "module1" if agent_id == "agent1" else "module0"
            ),
        )

        # Call the connector (and thereby flip from AgentID based to ModuleID based
        # structure).
        output_batch = connector(
            rl_module=None,  # This particular connector works without an RLModule.
            batch=batch,
            episodes=[],  # This particular connector works without a list of
            # episodes.
            explore=True,
            shared_data={},
        )

        # `data` should now be mapped from ModuleIDs to module data.
        check(
            output_batch,
            {
                "module0": {
                    "obs": [0, 1, 2],
                    "actions": [8],
                },
                "module1": {
                    "obs": [3, 4, 5],
                    "actions": [9],
                },
            },
        )
    """

    @override(ConnectorV2)
    def recompute_output_observation_space(
        self,
        input_observation_space: gym.Space,
        input_action_space: gym.Space,
    ) -> gym.Space:
        # Map the per-agent observation space dict to a per-module space dict.
        return self._map_space_if_necessary(input_observation_space, "obs")

    @override(ConnectorV2)
    def recompute_output_action_space(
        self,
        input_observation_space: gym.Space,
        input_action_space: gym.Space,
    ) -> gym.Space:
        # Map the per-agent action space dict to a per-module space dict.
        return self._map_space_if_necessary(input_action_space, "act")

    def __init__(
        self,
        input_observation_space: Optional[gym.Space] = None,
        input_action_space: Optional[gym.Space] = None,
        *,
        rl_module_specs: Dict[ModuleID, RLModuleSpec],
        agent_to_module_mapping_fn,
    ):
        """Initializes an AgentToModuleMapping instance.

        Args:
            rl_module_specs: The ModuleID -> RLModuleSpec dict (or set of ModuleIDs)
                of all modules in the MultiRLModule.
            agent_to_module_mapping_fn: Callable mapping (AgentID, Episode) to the
                ModuleID that should handle that agent's data.
        """
        super().__init__(input_observation_space, input_action_space)

        self._rl_module_specs = rl_module_specs
        self._agent_to_module_mapping_fn = agent_to_module_mapping_fn

    @override(ConnectorV2)
    def __call__(
        self,
        *,
        rl_module: RLModule,
        batch: Dict[str, Any],
        episodes: List[EpisodeType],
        explore: Optional[bool] = None,
        shared_data: Optional[dict] = None,
        **kwargs,
    ) -> Any:
        # Current agent to module mapping function.
        # agent_to_module_mapping_fn = shared_data.get("agent_to_module_mapping_fn")
        # Store in shared data, which module IDs map to which episode/agent, such
        # that the module-to-env pipeline can map the data back to agents.
        memorized_map_structure = defaultdict(list)
        for column, agent_data in batch.items():
            # Skip columns that are already keyed by ModuleID.
            if rl_module is not None and column in rl_module:
                continue
            for eps_id, agent_id, module_id in agent_data.keys():
                memorized_map_structure[module_id].append((eps_id, agent_id))
            # TODO (sven): We should check that all columns have the same struct.
            # Only the first (non-module) column is needed to memorize the structure.
            break

        shared_data["memorized_map_structure"] = dict(memorized_map_structure)

        # Mapping from ModuleID to column data.
        data_by_module = {}

        # Iterating over each column in the original data:
        for column, agent_data in batch.items():
            # Column is already a ModuleID -> Merge its data as-is.
            if rl_module is not None and column in rl_module:
                if column in data_by_module:
                    data_by_module[column].update(agent_data)
                else:
                    data_by_module[column] = agent_data
                continue
            for (
                eps_id,
                agent_id,
                module_id,
            ), values_batch_or_list in agent_data.items():
                assert isinstance(values_batch_or_list, list)
                for value in values_batch_or_list:
                    if module_id not in data_by_module:
                        data_by_module[module_id] = {column: []}
                    elif column not in data_by_module[module_id]:
                        data_by_module[module_id][column] = []

                    # Append the data.
                    data_by_module[module_id][column].append(value)

        return data_by_module

    def _map_space_if_necessary(self, space: gym.Space, which: str = "obs"):
        # Analyze input observation space to check, whether the user has already
        # taken care of the agent to module mapping.
        if set(self._rl_module_specs) == set(space.spaces.keys()):
            return space

        # We need to take care of agent to module mapping. Figure out the resulting
        # observation space here.
        dummy_eps = MultiAgentEpisode()

        ret_space = {}
        for module_id in self._rl_module_specs:
            # Easy way out, user has provided space in the RLModule spec dict.
            if (
                isinstance(self._rl_module_specs, dict)
                and module_id in self._rl_module_specs
            ):
                if (
                    which == "obs"
                    and self._rl_module_specs[module_id].observation_space
                ):
                    ret_space[module_id] = self._rl_module_specs[
                        module_id
                    ].observation_space
                    continue
                elif which == "act" and self._rl_module_specs[module_id].action_space:
                    ret_space[module_id] = self._rl_module_specs[module_id].action_space
                    continue

            # Need to reverse map spaces (for the different agents) to certain
            # module IDs (using a dummy MultiAgentEpisode).
            one_space = next(iter(space.spaces.values()))
            # If all obs spaces are the same anyway, just use the first
            # single-agent space.
            if all(s == one_space for s in space.spaces.values()):
                ret_space[module_id] = one_space
            # Otherwise, we have to compare the ModuleID with all possible
            # AgentIDs and find the agent ID that matches.
            else:
                match_aid = None
                one_agent_for_module_found = False
                for aid in space.spaces.keys():
                    # Match: Assign spaces for this agentID to the PolicyID.
                    if self._agent_to_module_mapping_fn(aid, dummy_eps) == module_id:
                        # Make sure, different agents that map to the same
                        # policy don't have different spaces.
                        if (
                            module_id in ret_space
                            and space[aid] != ret_space[module_id]
                        ):
                            raise ValueError(
                                f"Two agents ({aid} and {match_aid}) in your "
                                "environment map to the same ModuleID (as per your "
                                "`agent_to_module_mapping_fn`), however, these agents "
                                "also have different observation spaces as per the env!"
                            )
                        ret_space[module_id] = space[aid]
                        match_aid = aid
                        one_agent_for_module_found = True
                # Still no space found for this module ID -> Error out.
                if not one_agent_for_module_found:
                    raise ValueError(
                        f"Could not find or derive any {which}-space for RLModule "
                        f"{module_id}! This can happen if your `config.rl_module(rl_"
                        f"module_spec=...)` does NOT contain space information for this"
                        " particular single-agent module AND your agent-to-module-"
                        "mapping function is stochastic (such that for some agent A, "
                        "more than one ModuleID might be returned somewhat randomly). "
                        f"Fix this error by providing {which}-space information using "
                        "`config.rl_module(rl_module_spec=MultiRLModuleSpec("
                        f"rl_module_specs={{'{module_id}': RLModuleSpec("
                        "observation_space=..., action_space=...)}}))"
                    )

        return gym.spaces.Dict(ret_space)
@PublicAPI(stability="alpha")
class BatchIndividualItems(ConnectorV2):
    """Batches individual data-items (in lists) into tensors (with batch dimension).

    Note: This is one of the default env-to-module or Learner ConnectorV2 pieces that
    are added automatically by RLlib into every env-to-module/Learner connector
    pipeline, unless `config.add_default_connectors_to_env_to_module_pipeline` or
    `config.add_default_connectors_to_learner_pipeline` are set to
    False.

    The default env-to-module connector pipeline is:
    [
        [0 or more user defined ConnectorV2 pieces],
        AddObservationsFromEpisodesToBatch,
        AddTimeDimToBatchAndZeroPad,
        AddStatesFromEpisodesToBatch,
        AgentToModuleMapping,  # only in multi-agent setups!
        BatchIndividualItems,
        NumpyToTensor,
    ]
    The default Learner connector pipeline is:
    [
        [0 or more user defined ConnectorV2 pieces],
        AddObservationsFromEpisodesToBatch,
        AddColumnsFromEpisodesToTrainBatch,
        AddTimeDimToBatchAndZeroPad,
        AddStatesFromEpisodesToBatch,
        AgentToModuleMapping,  # only in multi-agent setups!
        BatchIndividualItems,
        NumpyToTensor,
    ]

    This ConnectorV2:
    - Operates only on the input `data`, NOT the incoming list of episode objects
      (ignored).
    - In the single-agent case, `data` must already be a dict, structured as follows
      by prior connector pieces of the same pipeline:
      [col0] -> {[(eps_id,)]: [list of individual batch items]}
    - In the multi-agent case, `data` must already be a dict, structured as follows
      by prior connector pieces of the same pipeline (in particular the
      `AgentToModuleMapping` piece):
      [module_id] -> [col0] -> [list of individual batch items]
    - Translates the above data under the different columns (e.g. "obs") into final
      (batched) structures. For the single-agent case, the output `data` looks like
      this: [col0] -> [possibly complex struct of batches (at the leafs)].
      For the multi-agent case, the output `data` looks like this:
      [module_id] -> [col0] -> [possibly complex struct of batches (at the leafs)].

    .. testcode::

        from ray.rllib.connectors.common import BatchIndividualItems
        from ray.rllib.utils.test_utils import check

        single_agent_batch = {
            "obs": {
                # Note that at this stage, next-obs is not part of the data anymore..
                ("MA-EPS0",): [0, 1],
                ("MA-EPS1",): [2, 3],
            },
            "actions": {
                # .. so we have as many actions per episode as we have observations.
                ("MA-EPS0",): [4, 5],
                ("MA-EPS1",): [6, 7],
            },
        }

        # Create our (single-agent) connector piece.
        connector = BatchIndividualItems()

        # Call the connector (and thereby batch the individual items).
        output_batch = connector(
            rl_module=None,  # This particular connector works without an RLModule.
            batch=single_agent_batch,
            episodes=[],  # This particular connector works without a list of
            # episodes.
            explore=True,
            shared_data={},
        )

        # `output_batch` should now be batched (episode IDs should have been removed
        # from the struct).
        check(
            output_batch,
            {"obs": [0, 1, 2, 3], "actions": [4, 5, 6, 7]},
        )
    """

    def __init__(
        self,
        input_observation_space: Optional[gym.Space] = None,
        input_action_space: Optional[gym.Space] = None,
        *,
        multi_agent: bool = False,
        **kwargs,
    ):
        """Initializes a BatchIndividualItems instance.

        Args:
            multi_agent: Whether this is a connector operating on a multi-agent
                observation space mapping AgentIDs to individual agents'
                observations.
        """
        super().__init__(
            input_observation_space=input_observation_space,
            input_action_space=input_action_space,
            **kwargs,
        )
        # Switches `__call__` between the per-module (multi-agent) and the
        # per-episode-key (single-agent) batching code paths.
        self._multi_agent = multi_agent

    @override(ConnectorV2)
    def __call__(
        self,
        *,
        rl_module: RLModule,
        batch: Dict[str, Any],
        episodes: List[EpisodeType],
        explore: Optional[bool] = None,
        shared_data: Optional[dict] = None,
        **kwargs,
    ) -> Any:
        is_multi_rl_module = isinstance(rl_module, MultiRLModule)

        # Convert lists of individual items into properly batched data.
        for column, column_data in batch.copy().items():
            # Multi-agent case: This connector piece should only be used after(!)
            # the AgentToModuleMapping connector has already been applied, leading
            # to a batch structure of:
            # [module_id] -> [col0] -> [list of individual batch items]
            if is_multi_rl_module and column in rl_module:
                # Case, in which a column has already been properly batched before
                # this connector piece is called.
                if not self._multi_agent:
                    continue
                # If MA Off-Policy and independent sampling we need to overcome this
                # check.
                module_data = column_data
                for col, col_data in module_data.copy().items():
                    if isinstance(col_data, list) and col != Columns.INFOS:
                        module_data[col] = batch_fn(
                            col_data,
                            individual_items_already_have_batch_dim="auto",
                        )

            # Simple case: There is a list directly under `column`:
            # Batch the list.
            elif isinstance(column_data, list):
                batch[column] = batch_fn(
                    column_data,
                    individual_items_already_have_batch_dim="auto",
                )

            # Single-agent case: There is a dict under `column` mapping
            # `eps_id` to lists of items:
            # Concat all these lists, then batch.
            elif not self._multi_agent:
                # TODO: only really need this in non-Learner connector pipeline
                memorized_map_structure = []
                list_to_be_batched = []
                for (eps_id,) in column_data.keys():
                    for item in column_data[(eps_id,)]:
                        # Only record structure for OBS column.
                        if column == Columns.OBS:
                            memorized_map_structure.append(eps_id)
                        list_to_be_batched.append(item)
                # INFOS should not be batched (remain a list).
                batch[column] = (
                    list_to_be_batched
                    if column == Columns.INFOS
                    else batch_fn(
                        list_to_be_batched,
                        individual_items_already_have_batch_dim="auto",
                    )
                )
                # In a multi-RLModule setup, nest the (single-agent) column under
                # the default module ID.
                if is_multi_rl_module:
                    if DEFAULT_MODULE_ID not in batch:
                        batch[DEFAULT_MODULE_ID] = {}
                    batch[DEFAULT_MODULE_ID][column] = batch.pop(column)

                # Only record structure for OBS column.
                if column == Columns.OBS:
                    shared_data["memorized_map_structure"] = memorized_map_structure
            # Multi-agent case: But Module ID not found in our RLModule -> Ignore
            # this `module_id` entirely.
            # else:
            #     pass

        return batch
+ if self._multi_agent: + ret = {} + for agent_id, obs_space in input_observation_space.spaces.items(): + ret[agent_id] = self._convert_individual_space(obs_space) + return gym.spaces.Dict(ret) + else: + return self._convert_individual_space(input_observation_space) + + def __init__( + self, + input_observation_space: Optional[gym.Space] = None, + input_action_space: Optional[gym.Space] = None, + *, + num_frames: int = 1, + multi_agent: bool = False, + as_learner_connector: bool = False, + **kwargs, + ): + """Initializes a _FrameStackingConnector instance. + + Args: + num_frames: The number of observation frames to stack up (into a single + observation) for the RLModule's forward pass. + multi_agent: Whether this is a connector operating on a multi-agent + observation space mapping AgentIDs to individual agents' observations. + as_learner_connector: Whether this connector is part of a Learner connector + pipeline, as opposed to an env-to-module pipeline. + """ + super().__init__( + input_observation_space=input_observation_space, + input_action_space=input_action_space, + **kwargs, + ) + + self._multi_agent = multi_agent + self.num_frames = num_frames + self._as_learner_connector = as_learner_connector + + @override(ConnectorV2) + def __call__( + self, + *, + rl_module: RLModule, + batch: Dict[str, Any], + episodes: List[EpisodeType], + explore: Optional[bool] = None, + shared_data: Optional[dict] = None, + **kwargs, + ) -> Any: + # Learner connector pipeline. Episodes have been numpy'ized. + if self._as_learner_connector: + for sa_episode in self.single_agent_episode_iterator( + episodes, agents_that_stepped_only=False + ): + + def _map_fn(s, _sa_episode=sa_episode): + # Squeeze out last dim. + s = np.squeeze(s, axis=-1) + # Calculate new shape and strides + new_shape = (len(_sa_episode), self.num_frames) + s.shape[1:] + new_strides = (s.strides[0],) + s.strides + # Create a strided view of the array. 
+ return np.transpose( + np.lib.stride_tricks.as_strided( + s, shape=new_shape, strides=new_strides + ), + axes=[0, 2, 3, 1], + ) + + # Get all observations from the episode in one np array (except for + # the very last one, which is the final observation not needed for + # learning). + self.add_n_batch_items( + batch=batch, + column=Columns.OBS, + items_to_add=tree.map_structure( + _map_fn, + sa_episode.get_observations( + indices=slice(-self.num_frames + 1, len(sa_episode)), + neg_index_as_lookback=True, + fill=0.0, + ), + ), + num_items=len(sa_episode), + single_agent_episode=sa_episode, + ) + + # Env-to-module pipeline. Episodes still operate on lists. + else: + for sa_episode in self.single_agent_episode_iterator(episodes): + assert not sa_episode.is_numpy + # Get the list of observations to stack. + obs_stack = sa_episode.get_observations( + indices=slice(-self.num_frames, None), + fill=0.0, + ) + # Observation components are (w, h, 1) + # -> concatenate along axis=-1 to (w, h, [num_frames]). + stacked_obs = tree.map_structure( + lambda *s: np.concatenate(s, axis=2), + *obs_stack, + ) + self.add_batch_item( + batch=batch, + column=Columns.OBS, + item_to_add=stacked_obs, + single_agent_episode=sa_episode, + ) + + return batch + + def _convert_individual_space(self, obs_space): + # Some assumptions: Space is box AND last dim (the stacking one) is 1. 
+ assert isinstance(obs_space, gym.spaces.Box), obs_space + assert obs_space.shape[-1] == 1, obs_space + + return gym.spaces.Box( + low=np.repeat(obs_space.low, repeats=self.num_frames, axis=-1), + high=np.repeat(obs_space.high, repeats=self.num_frames, axis=-1), + shape=list(obs_space.shape)[:-1] + [self.num_frames], + dtype=obs_space.dtype, + ) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/module_to_agent_unmapping.py b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/module_to_agent_unmapping.py new file mode 100644 index 0000000000000000000000000000000000000000..fb3a2b1e954e98ee89a455c53d70304d2cd9b7f0 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/module_to_agent_unmapping.py @@ -0,0 +1,48 @@ +from collections import defaultdict +from typing import Any, Dict, List, Optional + +from ray.rllib.connectors.connector_v2 import ConnectorV2 +from ray.rllib.core.rl_module.rl_module import RLModule +from ray.rllib.env.multi_agent_episode import MultiAgentEpisode +from ray.rllib.utils.annotations import override +from ray.rllib.utils.typing import EpisodeType +from ray.util.annotations import PublicAPI + + +@PublicAPI(stability="alpha") +class ModuleToAgentUnmapping(ConnectorV2): + """Performs flipping of `data` from ModuleID- to AgentID based mapping. + + Before mapping: + data[module1] -> [col, e.g. ACTIONS] + -> [dict mapping episode-identifying tuples to lists of data] + data[module2] -> ... + + After mapping: + data[ACTIONS]: [dict mapping episode-identifying tuples to lists of data] + + Note that episode-identifying tuples have the form of: (episode_id,) in the + single-agent case and (ma_episode_id, agent_id, module_id) in the multi-agent + case. 
+ """ + + @override(ConnectorV2) + def __call__( + self, + *, + rl_module: RLModule, + batch: Dict[str, Any], + episodes: List[EpisodeType], + explore: Optional[bool] = None, + shared_data: Optional[dict] = None, + **kwargs, + ) -> Any: + # This Connector should only be used in a multi-agent setting. + assert isinstance(episodes[0], MultiAgentEpisode) + + agent_data = defaultdict(dict) + for module_id, module_data in batch.items(): + for column, values_dict in module_data.items(): + agent_data[column].update(values_dict) + + return dict(agent_data) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/numpy_to_tensor.py b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/numpy_to_tensor.py new file mode 100644 index 0000000000000000000000000000000000000000..7c0123c44990ec52e991004c259f32070ce05e73 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/numpy_to_tensor.py @@ -0,0 +1,125 @@ +from typing import Any, Dict, List, Optional + +import gymnasium as gym + +from ray.rllib.connectors.connector_v2 import ConnectorV2 +from ray.rllib.core import DEFAULT_MODULE_ID +from ray.rllib.core.columns import Columns +from ray.rllib.core.rl_module.multi_rl_module import MultiRLModule +from ray.rllib.core.rl_module.rl_module import RLModule +from ray.rllib.utils.annotations import override +from ray.rllib.utils.torch_utils import convert_to_torch_tensor +from ray.rllib.utils.typing import EpisodeType +from ray.util.annotations import PublicAPI + + +@PublicAPI(stability="alpha") +class NumpyToTensor(ConnectorV2): + """Converts numpy arrays across the entire input data into (framework) tensors. + + The framework information is received via the provided `rl_module` arg in the + `__call__()` method. 
+ + Note: This is one of the default env-to-module or Learner ConnectorV2 pieces that + are added automatically by RLlib into every env-to-module/Learner connector + pipeline, unless `config.add_default_connectors_to_env_to_module_pipeline` or + `config.add_default_connectors_to_learner_pipeline ` are set to + False. + + The default env-to-module connector pipeline is: + [ + [0 or more user defined ConnectorV2 pieces], + AddObservationsFromEpisodesToBatch, + AddTimeDimToBatchAndZeroPad, + AddStatesFromEpisodesToBatch, + AgentToModuleMapping, # only in multi-agent setups! + BatchIndividualItems, + NumpyToTensor, + ] + The default Learner connector pipeline is: + [ + [0 or more user defined ConnectorV2 pieces], + AddObservationsFromEpisodesToBatch, + AddColumnsFromEpisodesToTrainBatch, + AddTimeDimToBatchAndZeroPad, + AddStatesFromEpisodesToBatch, + AgentToModuleMapping, # only in multi-agent setups! + BatchIndividualItems, + NumpyToTensor, + ] + + This ConnectorV2: + - Loops through the input `data` and converts all found numpy arrays into + framework-specific tensors (possibly on a GPU). + """ + + def __init__( + self, + input_observation_space: Optional[gym.Space] = None, + input_action_space: Optional[gym.Space] = None, + *, + as_learner_connector: bool = False, + pin_mempory: Optional[bool] = None, + device: Optional[str] = None, + **kwargs, + ): + """Initializes a NumpyToTensor instance. + + Args: + as_learner_connector: Whether this ConnectorV2 piece is used inside a + LearnerConnectorPipeline or not. + pin_mempory: Whether to pin memory when creating (torch) tensors. + If None (default), pins memory if `as_learner_connector` is True, + otherwise doesn't pin memory. + device: An optional device to move the resulting tensors to. If not + provided, all data will be left on the CPU. 
+ **kwargs: + """ + super().__init__( + input_observation_space=input_observation_space, + input_action_space=input_action_space, + **kwargs, + ) + self._as_learner_connector = as_learner_connector + self._pin_memory = ( + pin_mempory if pin_mempory is not None else self._as_learner_connector + ) + self._device = device + + @override(ConnectorV2) + def __call__( + self, + *, + rl_module: RLModule, + batch: Dict[str, Any], + episodes: List[EpisodeType], + explore: Optional[bool] = None, + shared_data: Optional[dict] = None, + **kwargs, + ) -> Any: + is_single_agent = False + is_multi_rl_module = isinstance(rl_module, MultiRLModule) + # `data` already a ModuleID to batch mapping format. + if not (is_multi_rl_module and all(c in rl_module._rl_modules for c in batch)): + is_single_agent = True + batch = {DEFAULT_MODULE_ID: batch} + + for module_id, module_data in batch.copy().items(): + infos = module_data.pop(Columns.INFOS, None) + if rl_module.framework == "torch": + module_data = convert_to_torch_tensor( + module_data, pin_memory=self._pin_memory, device=self._device + ) + else: + raise ValueError( + "`NumpyToTensor`does NOT support frameworks other than torch!" + ) + if infos is not None: + module_data[Columns.INFOS] = infos + # Early out with data under(!) `DEFAULT_MODULE_ID`, b/c we are in plain + # single-agent mode. 
+ if is_single_agent: + return module_data + batch[module_id] = module_data + + return batch diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/tensor_to_numpy.py b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/tensor_to_numpy.py new file mode 100644 index 0000000000000000000000000000000000000000..f6bbb2669c7942087e2e9fc1817a9dd3d48e1280 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/common/tensor_to_numpy.py @@ -0,0 +1,26 @@ +from typing import Any, Dict, List, Optional + +from ray.rllib.connectors.connector_v2 import ConnectorV2 +from ray.rllib.core.rl_module.rl_module import RLModule +from ray.rllib.utils.annotations import override +from ray.rllib.utils.numpy import convert_to_numpy +from ray.rllib.utils.typing import EpisodeType +from ray.util.annotations import PublicAPI + + +@PublicAPI(stability="alpha") +class TensorToNumpy(ConnectorV2): + """Converts (framework) tensors across the entire input data into numpy arrays.""" + + @override(ConnectorV2) + def __call__( + self, + *, + rl_module: RLModule, + batch: Dict[str, Any], + episodes: List[EpisodeType], + explore: Optional[bool] = None, + shared_data: Optional[dict] = None, + **kwargs, + ) -> Any: + return convert_to_numpy(batch) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/env_to_module/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/env_to_module/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..715ac1ffbb62e4e943c15e8f782d16c1f812f7f6 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/env_to_module/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/env_to_module/__pycache__/env_to_module_pipeline.cpython-311.pyc 
b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/env_to_module/__pycache__/env_to_module_pipeline.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e50b62079f6d25e852b7534c8f92c03d5852b986 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/env_to_module/__pycache__/env_to_module_pipeline.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/env_to_module/__pycache__/flatten_observations.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/env_to_module/__pycache__/flatten_observations.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7462114c7996818b1e097a7d652225b1c1e48ed6 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/env_to_module/__pycache__/flatten_observations.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/env_to_module/__pycache__/frame_stacking.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/env_to_module/__pycache__/frame_stacking.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..461f918b04ff0006d7ead473a0805eb90a087d3b Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/env_to_module/__pycache__/frame_stacking.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/env_to_module/__pycache__/observation_preprocessor.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/env_to_module/__pycache__/observation_preprocessor.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..123cb6365fea6080141f671848a4ef62e844e83c Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/env_to_module/__pycache__/observation_preprocessor.cpython-311.pyc differ diff --git 
a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/env_to_module/__pycache__/write_observations_to_episodes.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/env_to_module/__pycache__/write_observations_to_episodes.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fd1cfa696c9d1e6d41219d61cd44ec3249b54048 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/env_to_module/__pycache__/write_observations_to_episodes.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/learner/__pycache__/add_one_ts_to_episodes_and_truncate.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/learner/__pycache__/add_one_ts_to_episodes_and_truncate.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9882468e2de537e0fd446fb838443e4261b98348 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/learner/__pycache__/add_one_ts_to_episodes_and_truncate.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/learner/__pycache__/frame_stacking.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/learner/__pycache__/frame_stacking.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1f2d3bf38e10580e0b5a97b2f53ebd3745a7c5f8 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/learner/__pycache__/frame_stacking.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/learner/__pycache__/learner_connector_pipeline.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/learner/__pycache__/learner_connector_pipeline.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bde8e25e06cbd9f44cf6fbbb11bccc61b5086b65 Binary files /dev/null and 
b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/learner/__pycache__/learner_connector_pipeline.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/learner/add_columns_from_episodes_to_train_batch.py b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/learner/add_columns_from_episodes_to_train_batch.py new file mode 100644 index 0000000000000000000000000000000000000000..6601b3b2011df21b870234e14f7654005bc9722d --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/learner/add_columns_from_episodes_to_train_batch.py @@ -0,0 +1,166 @@ +from typing import Any, Dict, List, Optional + +from ray.rllib.connectors.connector_v2 import ConnectorV2 +from ray.rllib.core.columns import Columns +from ray.rllib.core.rl_module.rl_module import RLModule +from ray.rllib.utils.annotations import override +from ray.rllib.utils.typing import EpisodeType +from ray.util.annotations import PublicAPI + + +@PublicAPI(stability="alpha") +class AddColumnsFromEpisodesToTrainBatch(ConnectorV2): + """Adds infos/actions/rewards/terminateds/... to train batch. + + Note: This is one of the default Learner ConnectorV2 pieces that are added + automatically by RLlib into every Learner connector pipeline, unless + `config.add_default_connectors_to_learner_pipeline` is set to False. + + The default Learner connector pipeline is: + [ + [0 or more user defined ConnectorV2 pieces], + AddObservationsFromEpisodesToBatch, + AddColumnsFromEpisodesToTrainBatch, + AddTimeDimToBatchAndZeroPad, + AddStatesFromEpisodesToBatch, + AgentToModuleMapping, # only in multi-agent setups! + BatchIndividualItems, + NumpyToTensor, + ] + + Does NOT add observations to train batch (these should have already been added + by another ConnectorV2 piece: `AddObservationsToTrainBatch` in the same pipeline). 
+ + If provided with `episodes` data, this connector piece makes sure that the final + train batch going into the RLModule for updating (`forward_train()` call) contains + at the minimum: + - Observations: From all episodes under the Columns.OBS key. + - Actions, rewards, terminal/truncation flags: From all episodes under the + respective keys. + - All data inside the episodes' `extra_model_outs` property, e.g. action logp and + action probs under the respective keys. + - Internal states: These will NOT be added to the batch by this connector piece + as this functionality is handled by a different default connector piece: + `AddStatesFromEpisodesToBatch`. + + If the user wants to customize their own data under the given keys (e.g. obs, + actions, ...), they can extract from the episodes or recompute from `data` + their own data and store it in `data` under those keys. In this case, the default + connector will not change the data under these keys and simply act as a + pass-through. + """ + + @override(ConnectorV2) + def __call__( + self, + *, + rl_module: RLModule, + batch: Optional[Dict[str, Any]], + episodes: List[EpisodeType], + explore: Optional[bool] = None, + shared_data: Optional[dict] = None, + **kwargs, + ) -> Any: + # Infos. + if Columns.INFOS not in batch: + for sa_episode in self.single_agent_episode_iterator( + episodes, + agents_that_stepped_only=False, + ): + self.add_n_batch_items( + batch, + Columns.INFOS, + items_to_add=sa_episode.get_infos(slice(0, len(sa_episode))), + num_items=len(sa_episode), + single_agent_episode=sa_episode, + ) + + # Actions. + if Columns.ACTIONS not in batch: + for sa_episode in self.single_agent_episode_iterator( + episodes, + agents_that_stepped_only=False, + ): + self.add_n_batch_items( + batch, + Columns.ACTIONS, + items_to_add=[ + sa_episode.get_actions(indices=ts) + for ts in range(len(sa_episode)) + ], + num_items=len(sa_episode), + single_agent_episode=sa_episode, + ) + # Rewards. 
+ if Columns.REWARDS not in batch: + for sa_episode in self.single_agent_episode_iterator( + episodes, + agents_that_stepped_only=False, + ): + self.add_n_batch_items( + batch, + Columns.REWARDS, + items_to_add=[ + sa_episode.get_rewards(indices=ts) + for ts in range(len(sa_episode)) + ], + num_items=len(sa_episode), + single_agent_episode=sa_episode, + ) + # Terminateds. + if Columns.TERMINATEDS not in batch: + for sa_episode in self.single_agent_episode_iterator( + episodes, + agents_that_stepped_only=False, + ): + self.add_n_batch_items( + batch, + Columns.TERMINATEDS, + items_to_add=( + [False] * (len(sa_episode) - 1) + [sa_episode.is_terminated] + if len(sa_episode) > 0 + else [] + ), + num_items=len(sa_episode), + single_agent_episode=sa_episode, + ) + # Truncateds. + if Columns.TRUNCATEDS not in batch: + for sa_episode in self.single_agent_episode_iterator( + episodes, + agents_that_stepped_only=False, + ): + self.add_n_batch_items( + batch, + Columns.TRUNCATEDS, + items_to_add=( + [False] * (len(sa_episode) - 1) + [sa_episode.is_truncated] + if len(sa_episode) > 0 + else [] + ), + num_items=len(sa_episode), + single_agent_episode=sa_episode, + ) + # Extra model outputs (except for STATE_OUT, which will be handled by another + # default connector piece). Also, like with all the fields above, skip + # those that the user already seemed to have populated via custom connector + # pieces. 
+ skip_columns = set(batch.keys()) | {Columns.STATE_IN, Columns.STATE_OUT} + for sa_episode in self.single_agent_episode_iterator( + episodes, + agents_that_stepped_only=False, + ): + for column in sa_episode.extra_model_outputs.keys(): + if column not in skip_columns: + self.add_n_batch_items( + batch, + column, + items_to_add=[ + sa_episode.get_extra_model_outputs(key=column, indices=ts) + for ts in range(len(sa_episode)) + ], + num_items=len(sa_episode), + single_agent_episode=sa_episode, + ) + + return batch diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/learner/add_next_observations_from_episodes_to_train_batch.py b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/learner/add_next_observations_from_episodes_to_train_batch.py new file mode 100644 index 0000000000000000000000000000000000000000..6efa3b706bf1f24e61f9791b273e7f11b08b5066 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/learner/add_next_observations_from_episodes_to_train_batch.py @@ -0,0 +1,103 @@ +from typing import Any, Dict, List, Optional + +from ray.rllib.core.columns import Columns +from ray.rllib.connectors.connector_v2 import ConnectorV2 +from ray.rllib.core.rl_module.rl_module import RLModule +from ray.rllib.utils.annotations import override +from ray.rllib.utils.typing import EpisodeType +from ray.util.annotations import PublicAPI + + +@PublicAPI(stability="alpha") +class AddNextObservationsFromEpisodesToTrainBatch(ConnectorV2): + """Adds the NEXT_OBS column with the correct episode observations to train batch. + + - Operates on a list of Episode objects. + - Gets all observation(s) from all the given episodes (except the very first ones) + and adds them to the batch under construction in the NEXT_OBS column (as a list of + individual observations). + - Does NOT alter any observations (or other data) in the given episodes. + - Can be used in Learner connector pipelines. + + .. 
testcode:: + + import gymnasium as gym + import numpy as np + + from ray.rllib.connectors.learner import ( + AddNextObservationsFromEpisodesToTrainBatch + ) + from ray.rllib.core.columns import Columns + from ray.rllib.env.single_agent_episode import SingleAgentEpisode + from ray.rllib.utils.test_utils import check + + # Create two dummy SingleAgentEpisodes, each containing 3 observations, + # 2 actions and 2 rewards (both episodes are length=2). + obs_space = gym.spaces.Box(-1.0, 1.0, (2,), np.float32) + act_space = gym.spaces.Discrete(2) + + episodes = [SingleAgentEpisode( + observations=[obs_space.sample(), obs_space.sample(), obs_space.sample()], + actions=[act_space.sample(), act_space.sample()], + rewards=[1.0, 2.0], + len_lookback_buffer=0, + ) for _ in range(2)] + eps_1_next_obses = episodes[0].get_observations([1, 2]) + eps_2_next_obses = episodes[1].get_observations([1, 2]) + print(f"1st Episode's next obses are {eps_1_next_obses}") + print(f"2nd Episode's next obses are {eps_2_next_obses}") + + # Create an instance of this class. + connector = AddNextObservationsFromEpisodesToTrainBatch() + + # Call the connector with the two created episodes. + # Note that this particular connector works without an RLModule, so we + # simplify here for the sake of this example. + output_data = connector( + rl_module=None, + batch={}, + episodes=episodes, + explore=True, + shared_data={}, + ) + # The output data should now contain the last observations of both episodes, + # in a "per-episode organized" fashion. + check( + output_data, + { + Columns.NEXT_OBS: { + (episodes[0].id_,): eps_1_next_obses, + (episodes[1].id_,): eps_2_next_obses, + }, + }, + ) + """ + + @override(ConnectorV2) + def __call__( + self, + *, + rl_module: RLModule, + batch: Dict[str, Any], + episodes: List[EpisodeType], + explore: Optional[bool] = None, + shared_data: Optional[dict] = None, + **kwargs, + ) -> Any: + # If "obs" already in `batch`, early out. 
+ if Columns.NEXT_OBS in batch: + return batch + + for sa_episode in self.single_agent_episode_iterator( + # This is a Learner-only connector -> Get all episodes (for train batch). + episodes, + agents_that_stepped_only=False, + ): + self.add_n_batch_items( + batch, + Columns.NEXT_OBS, + items_to_add=sa_episode.get_observations(slice(1, len(sa_episode) + 1)), + num_items=len(sa_episode), + single_agent_episode=sa_episode, + ) + return batch diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/connectors/learner/add_one_ts_to_episodes_and_truncate.py b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/learner/add_one_ts_to_episodes_and_truncate.py new file mode 100644 index 0000000000000000000000000000000000000000..fcd3703eeb855e1ab1fad584c1aa01b5f922f5cb --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/connectors/learner/add_one_ts_to_episodes_and_truncate.py @@ -0,0 +1,168 @@ +from typing import Any, Dict, List, Optional + +from ray.rllib.connectors.connector_v2 import ConnectorV2 +from ray.rllib.core.columns import Columns +from ray.rllib.core.rl_module.rl_module import RLModule +from ray.rllib.env.multi_agent_episode import MultiAgentEpisode +from ray.rllib.utils.annotations import override +from ray.rllib.utils.postprocessing.episodes import add_one_ts_to_episodes_and_truncate +from ray.rllib.utils.typing import EpisodeType +from ray.util.annotations import PublicAPI + + +@PublicAPI(stability="alpha") +class AddOneTsToEpisodesAndTruncate(ConnectorV2): + """Adds an artificial timestep to all incoming episodes at the end. + + In detail: The last observations, infos, actions, and all `extra_model_outputs` + will be duplicated and appended to each episode's data. An extra 0.0 reward + will be appended to the episode's rewards. The episode's timestep will be + increased by 1. Also, adds the truncated=True flag to each episode if the + episode is not already done (terminated or truncated). 
+ + Useful for value function bootstrapping, where it is required to compute a + forward pass for the very last timestep within the episode, + i.e. using the following input dict: { + obs=[final obs], + state=[final state output], + prev. reward=[final reward], + etc.. + } + + .. testcode:: + + from ray.rllib.connectors.learner import AddOneTsToEpisodesAndTruncate + from ray.rllib.env.single_agent_episode import SingleAgentEpisode + from ray.rllib.utils.test_utils import check + + # Create 2 episodes (both to be extended by one timestep). + episode1 = SingleAgentEpisode( + observations=[0, 1, 2], + actions=[0, 1], + rewards=[0.0, 1.0], + terminated=False, + truncated=False, + len_lookback_buffer=0, + ).to_numpy() + check(len(episode1), 2) + check(episode1.is_truncated, False) + + episode2 = SingleAgentEpisode( + observations=[0, 1, 2, 3, 4, 5], + actions=[0, 1, 2, 3, 4], + rewards=[0.0, 1.0, 2.0, 3.0, 4.0], + terminated=True, # a terminated episode + truncated=False, + len_lookback_buffer=0, + ).to_numpy() + check(len(episode2), 5) + check(episode2.is_truncated, False) + check(episode2.is_terminated, True) + + # Create an instance of this class. + connector = AddOneTsToEpisodesAndTruncate() + + # Call the connector. + shared_data = {} + _ = connector( + rl_module=None, # Connector used here does not require RLModule. + batch={}, + episodes=[episode1, episode2], + shared_data=shared_data, + ) + # Check on the episodes. Both of them should now be 1 timestep longer. 
+ check(len(episode1), 3) + check(episode1.is_truncated, True) + check(len(episode2), 6) + check(episode2.is_truncated, False) + check(episode2.is_terminated, True) + """ + + @override(ConnectorV2) + def __call__( + self, + *, + rl_module: RLModule, + batch: Dict[str, Any], + episodes: List[EpisodeType], + explore: Optional[bool] = None, + shared_data: Optional[dict] = None, + **kwargs, + ) -> Any: + # Build the loss mask to make sure the extra added timesteps do not influence + # the final loss and fix the terminateds and truncateds in the batch. + + # For proper v-trace execution, the rules must be as follows: + # Legend: + # T: terminal=True + # R: truncated=True + # B0: bootstrap with value 0 (also: terminal=True) + # Bx: bootstrap with some vf-computed value (also: terminal=True) + + # batch: - - - - - - - T B0- - - - - R Bx- - - - R Bx + # mask : t t t t t t t t f t t t t t t f t t t t t f + + # TODO (sven): Same situation as in TODO below, but for multi-agent episode. + # Maybe add a dedicated connector piece for this task? + # We extend the MultiAgentEpisode's ID by a running number here to make sure + # we treat each MAEpisode chunk as separate (for potentially upcoming v-trace + # and LSTM zero-padding) and don't mix data from different chunks. + if isinstance(episodes[0], MultiAgentEpisode): + for i, ma_episode in enumerate(episodes): + ma_episode.id_ += "_" + str(i) + # Also change the underlying single-agent episode's + # `multi_agent_episode_id` properties. + for sa_episode in ma_episode.agent_episodes.values(): + sa_episode.multi_agent_episode_id = ma_episode.id_ + + for i, sa_episode in enumerate( + self.single_agent_episode_iterator(episodes, agents_that_stepped_only=False) + ): + # TODO (sven): This is a little bit of a hack: By extending the Episode's + # ID, we make sure that each episode chunk in `episodes` is treated as a + # separate episode in the `self.add_n_batch_items` below. Some algos (e.g. 
+ # APPO) may have >1 episode chunks from the same episode (same ID) in the + # training data, thus leading to a malformatted batch in case of + # RNN-triggered zero-padding of the train batch. + # For example, if e1 (id=a len=4) and e2 (id=a len=5) are two chunks of the + # same episode in `episodes`, the resulting batch would have an additional + # timestep in the middle of the episode's "row": + # { "obs": { + # ("a", <- eps ID): [0, 1, 2, 3 <- len=4, [additional 1 ts (bad)], + # 0, 1, 2, 3, 4 <- len=5, [additional 1 ts]] + # }} + sa_episode.id_ += "_" + str(i) + + len_ = len(sa_episode) + + # Extend all episodes by one ts. + add_one_ts_to_episodes_and_truncate([sa_episode]) + + loss_mask = [True for _ in range(len_)] + [False] + self.add_n_batch_items( + batch, + Columns.LOSS_MASK, + loss_mask, + len_ + 1, + sa_episode, + ) + + terminateds = ( + [False for _ in range(len_ - 1)] + + [bool(sa_episode.is_terminated)] + + [True] # extra timestep + ) + self.add_n_batch_items( + batch, + Columns.TERMINATEDS, + terminateds, + len_ + 1, + sa_episode, + ) + + # Signal to following connector pieces that the loss-mask which masks out + # invalid episode ts (for the extra added ts at the end) has already been + # added to `data`. 
class ComputeReturnsToGo(ConnectorV2):
    """Learner ConnectorV2 piece computing discounted returns-to-go per timestep.

    This ConnectorV2:
    - Operates on a list of Episode objects (single- or multi-agent).
    - Should be used only in the Learner pipeline as a preparation for an upcoming
      loss computation that requires the discounted returns to go (until the end
      of the episode).
    - For each agent, for each episode, and at each timestep t, computes the sum
      of discounted rewards from t until the end of the episode and writes the
      results into a new batch column: ``Columns.RETURNS_TO_GO``.
    """

    def __init__(
        self,
        input_observation_space=None,
        input_action_space=None,
        *,
        gamma,
    ):
        """Initializes a ComputeReturnsToGo instance.

        Args:
            gamma: The discount factor gamma.
        """
        super().__init__(input_observation_space, input_action_space)
        self.gamma = gamma

    def __call__(
        self,
        *,
        rl_module: MultiRLModule,
        episodes: List[EpisodeType],
        batch: Dict[str, Any],
        **kwargs,
    ):
        """Adds ``Columns.RETURNS_TO_GO`` for every single-agent episode.

        Returns:
            The (in-place) updated `batch`.
        """
        # Bug fix/robustness: the module-level `import scipy` does NOT reliably
        # expose the `scipy.signal` subpackage (SciPy subpackages must be
        # imported explicitly on some versions). Import it here to avoid a
        # possible AttributeError at call time.
        import scipy.signal

        for sa_episode in self.single_agent_episode_iterator(
            episodes, agents_that_stepped_only=False
        ):
            # Discounted reverse cumulative sum via an IIR filter:
            # y[t] = x[t] + gamma * y[t-1], applied to the reversed rewards.
            rewards_reversed = sa_episode.get_rewards()[::-1]
            discounted_cumsum_reversed = scipy.signal.lfilter(
                [1], [1, -self.gamma], rewards_reversed
            )
            # Reverse back to restore the original time order.
            discounted_returns = discounted_cumsum_reversed[::-1]

            # Add the results to the batch under the RETURNS_TO_GO column.
            self.add_n_batch_items(
                batch=batch,
                column=Columns.RETURNS_TO_GO,
                items_to_add=discounted_returns,
                num_items=len(sa_episode),
                single_agent_episode=sa_episode,
            )

        return batch
class GeneralAdvantageEstimation(ConnectorV2):
    """Learner ConnectorV2 piece computing GAE advantages and value targets.

    This ConnectorV2:
    - Operates on a list of Episode objects (single- or multi-agent).
    - Should be used only in the Learner pipeline and as one of its last pieces
      (it requires the batch for the value-function forward passes to already be
      complete).
    - Requires the incoming episodes to already be elongated by one artificial
      timestep at the end (last obs, actions, states, etc. repeated, last
      reward=0.0, etc.), which allows combining the per-timestep value
      computations with the necessary "bootstrap" value computations at the
      episode (chunk) truncation points. Use the `ray.rllib.connectors.learner.
      add_one_ts_to_episodes_and_truncate.AddOneTsToEpisodesAndTruncate`
      connector piece to add that extra timestep.

    The GAE computation reuses the arriving `batch` as the forward batch for the
    value function, extracts the bootstrap values (at the artificially added
    timesteps) and all other value predictions (all other timesteps), performs
    GAE, and writes the results back into `batch` (under
    Postprocessing.ADVANTAGES and Postprocessing.VALUE_TARGETS).
    """

    def __init__(
        self,
        input_observation_space=None,
        input_action_space=None,
        *,
        gamma,
        lambda_,
    ):
        """Initializes a GeneralAdvantageEstimation instance.

        Args:
            gamma: The discount factor gamma.
            lambda_: The lambda parameter for General Advantage Estimation
                (GAE). Defines the exponential weight used between actually
                measured rewards vs value function estimates over multiple time
                steps. Specifically, `lambda_` balances short-term, low-variance
                estimates with longer-term, high-variance returns. A `lambda_`
                of 0.0 makes the GAE rely only on immediate rewards (and vf
                predictions from there on, reducing variance, but increasing
                bias), while a `lambda_` of 1.0 only incorporates vf predictions
                at the truncation points of the given episodes or episode chunks
                (reducing bias but increasing variance).
        """
        super().__init__(input_observation_space, input_action_space)
        self.gamma = gamma
        self.lambda_ = lambda_

        # Lazily created numpy-to-tensor connector that translates the GAE
        # results (advantages and vf targets) back into framework tensors.
        self._numpy_to_tensor_connector = None

    @override(ConnectorV2)
    def __call__(
        self,
        *,
        rl_module: MultiRLModule,
        episodes: List[EpisodeType],
        batch: Dict[str, Any],
        **kwargs,
    ):
        # Device on which to place all GAE result tensors (advantages and value
        # targets) once computed.
        device = None

        # Flatten out all single-agent episodes (in order).
        sa_episodes = list(
            self.single_agent_episode_iterator(episodes, agents_that_stepped_only=False)
        )

        # One value-net forward pass per module; modules that are not in `batch`
        # or do not implement ValueFunctionAPI yield None.
        # TODO (sven): We need to check here in the pipeline already, whether a
        #  module should even be updated or not (which we usually do after(!)
        #  the Learner pipeline). This is an open TODO to move this filter into
        #  a connector as well. For now, we'll just check, whether `mid` is in
        #  batch and skip if it isn't.
        vf_preds = rl_module.foreach_module(
            func=lambda mid, module: (
                module.compute_values(batch[mid])
                if mid in batch and isinstance(module, ValueFunctionAPI)
                else None
            ),
            return_dict=True,
        )

        # Perform each module's GAE computation.
        for module_id, vf_tensor in vf_preds.items():
            # None -> this RLModule does not implement ValueFunctionAPI; skip.
            if vf_tensor is None:
                continue

            module = rl_module[module_id]
            device = vf_tensor.device
            # All GAE math below happens in numpy.
            vf_np = convert_to_numpy(vf_tensor)

            # Lengths of the single-agent episodes belonging to this module.
            episode_lens = [
                len(eps) for eps in sa_episodes if eps.module_id in [None, module_id]
            ]

            # Strip any zero-padding again, if applicable, before computing
            # the value targets.
            vf_np = unpad_data_if_necessary(episode_lens, vf_np)
            value_targets = compute_value_targets(
                values=vf_np,
                rewards=unpad_data_if_necessary(
                    episode_lens,
                    convert_to_numpy(batch[module_id][Columns.REWARDS]),
                ),
                terminateds=unpad_data_if_necessary(
                    episode_lens,
                    convert_to_numpy(batch[module_id][Columns.TERMINATEDS]),
                ),
                truncateds=unpad_data_if_necessary(
                    episode_lens,
                    convert_to_numpy(batch[module_id][Columns.TRUNCATEDS]),
                ),
                gamma=self.gamma,
                lambda_=self.lambda_,
            )
            assert value_targets.shape[0] == sum(episode_lens)

            # Advantages = value targets minus value predictions. The vf-preds
            # themselves are not needed in the loss (e.g. DefaultPPORLModule
            # recomputes them in each `forward_train` anyway) and are dropped.
            advantages = value_targets - vf_np
            # Standardize advantages for more stable and better weighted policy
            # gradient computations.
            advantages = (advantages - advantages.mean()) / max(
                1e-4, advantages.std()
            )

            # Stateful modules: re-apply zero-padding to match the (B, T)
            # layout of the rest of the batch.
            if module.is_stateful():
                max_seq_len = module.model_config["max_seq_len"]
                advantages = np.stack(
                    split_and_zero_pad_n_episodes(
                        advantages,
                        episode_lens=episode_lens,
                        max_seq_len=max_seq_len,
                    ),
                    axis=0,
                )
                value_targets = np.stack(
                    split_and_zero_pad_n_episodes(
                        value_targets,
                        episode_lens=episode_lens,
                        max_seq_len=max_seq_len,
                    ),
                    axis=0,
                )

            batch[module_id][Postprocessing.ADVANTAGES] = advantages
            batch[module_id][Postprocessing.VALUE_TARGETS] = value_targets

        # Translate all GAE results into framework tensors (the connector is
        # created lazily, once the correct device is known).
        if self._numpy_to_tensor_connector is None:
            self._numpy_to_tensor_connector = NumpyToTensor(
                as_learner_connector=True, device=device
            )
        tensor_results = self._numpy_to_tensor_connector(
            rl_module=rl_module,
            batch={
                mid: {
                    Postprocessing.ADVANTAGES: mod_batch[Postprocessing.ADVANTAGES],
                    Postprocessing.VALUE_TARGETS: (
                        mod_batch[Postprocessing.VALUE_TARGETS]
                    ),
                }
                for mid, mod_batch in batch.items()
                if vf_preds[mid] is not None
            },
            episodes=episodes,
        )
        # Merge the converted tensors back into `batch`.
        for mid, mod_batch in tensor_results.items():
            batch[mid].update(mod_batch)

        return batch
@PublicAPI(stability="alpha")
class LearnerConnectorPipeline(ConnectorPipelineV2):
    """Connector pipeline run on the Learner side.

    Wraps the base pipeline call with metrics logging of the summed lengths of
    all incoming and outgoing episodes (pipeline pieces may change episode
    lengths in between).
    """

    @override(ConnectorPipelineV2)
    def __call__(
        self,
        *,
        rl_module: RLModule,
        batch: Optional[Dict[str, Any]] = None,
        episodes: List[EpisodeType],
        explore: bool = False,
        shared_data: Optional[dict] = None,
        metrics: Optional[MetricsLogger] = None,
        **kwargs,
    ):
        # Users are not required to send an initial input into this pipeline;
        # default `batch`/`shared_data` to empty dicts to be populated from
        # `episodes` by the pipeline pieces.
        if batch is None:
            batch = {}
        if shared_data is None:
            shared_data = {}

        # Log the sum of lengths of all episodes incoming.
        if metrics:
            metrics.log_value(
                (ALL_MODULES, LEARNER_CONNECTOR_SUM_EPISODES_LENGTH_IN),
                sum(len(eps) for eps in episodes),
            )

        result = super().__call__(
            rl_module=rl_module,
            batch=batch,
            episodes=episodes,
            shared_data=shared_data,
            explore=explore,
            metrics=metrics,
            metrics_prefix_key=(ALL_MODULES,),
            **kwargs,
        )

        # Log the sum of lengths of all episodes outgoing.
        if metrics:
            metrics.log_value(
                (ALL_MODULES, LEARNER_CONNECTOR_SUM_EPISODES_LENGTH_OUT),
                sum(len(eps) for eps in episodes),
            )

        return result
@OldAPIStack
class ActionDistribution:
    """The policy action distribution of an agent.

    Attributes:
        inputs: input vector to compute samples from.
        model (ModelV2): reference to model producing the inputs.
    """

    def __init__(self, inputs: List[TensorType], model: ModelV2):
        """Initializes an ActionDistribution instance.

        Args:
            inputs: input vector to compute samples from.
            model (ModelV2): reference to model producing the inputs. This
                is mainly useful if you want to use model variables to compute
                action outputs (i.e., for autoregressive action distributions,
                see examples/autoregressive_action_dist.py).
        """
        self.inputs = inputs
        self.model = model

    def sample(self) -> TensorType:
        """Draws a (stochastic) sample from this action distribution."""
        raise NotImplementedError

    def deterministic_sample(self) -> TensorType:
        """Returns the deterministic "sampling" output of this distribution.

        This is usually the max-likelihood output, i.e. the mean for a Normal
        distribution, the argmax for a Categorical one, etc..
        """
        raise NotImplementedError

    def sampled_action_logp(self) -> TensorType:
        """Returns the log probability of the last sampled action."""
        raise NotImplementedError

    def logp(self, x: TensorType) -> TensorType:
        """Returns the log-likelihood of `x` under this distribution."""
        raise NotImplementedError

    def kl(self, other: "ActionDistribution") -> TensorType:
        """Returns the KL-divergence between this distribution and `other`."""
        raise NotImplementedError

    def entropy(self) -> TensorType:
        """Returns the entropy of this action distribution."""
        raise NotImplementedError

    def multi_kl(self, other: "ActionDistribution") -> TensorType:
        """The KL-divergence between two action distributions.

        This differs from kl() in that it can return an array for
        MultiDiscrete. TODO(ekl) consider removing this.
        """
        # Default behavior: no per-component treatment -> same as kl().
        return self.kl(other)

    def multi_entropy(self) -> TensorType:
        """The entropy of the action distribution.

        This differs from entropy() in that it can return an array for
        MultiDiscrete. TODO(ekl) consider removing this.
        """
        # Default behavior: no per-component treatment -> same as entropy().
        return self.entropy()

    @staticmethod
    @OldAPIStack
    def required_model_output_shape(
        action_space: gym.Space, model_config: ModelConfigDict
    ) -> Union[int, np.ndarray]:
        """Returns the required shape of an input parameter tensor.

        Determined by the given action space and an optional dict of
        distribution-specific options.

        Args:
            action_space (gym.Space): The action space this distribution will
                be used for, whose shape attributes will be used to determine
                the required shape of the input parameter tensor.
            model_config: Model's config dict (as defined in catalog.py)

        Returns:
            model_output_shape (int or np.ndarray of ints): size of the
                required input vector (minus leading batch dimension).
        """
        raise NotImplementedError
ray.rllib.utils.annotations import DeveloperAPI, PublicAPI +from ray.rllib.utils.deprecation import ( + DEPRECATED_VALUE, + deprecation_warning, +) +from ray.rllib.utils.error import UnsupportedSpaceException +from ray.rllib.utils.framework import try_import_tf, try_import_torch +from ray.rllib.utils.from_config import from_config +from ray.rllib.utils.spaces.simplex import Simplex +from ray.rllib.utils.spaces.space_utils import flatten_space +from ray.rllib.utils.typing import ModelConfigDict, TensorType + +tf1, tf, tfv = try_import_tf() +torch, _ = try_import_torch() + +logger = logging.getLogger(__name__) + +# fmt: off +# __sphinx_doc_begin__ +MODEL_DEFAULTS: ModelConfigDict = { + "fcnet_hiddens": [256, 256], + "fcnet_activation": "tanh", + "fcnet_weights_initializer": None, + "fcnet_weights_initializer_config": None, + "fcnet_bias_initializer": None, + "fcnet_bias_initializer_config": None, + "conv_filters": None, + "conv_activation": "relu", + "conv_kernel_initializer": None, + "conv_kernel_initializer_config": None, + "conv_bias_initializer": None, + "conv_bias_initializer_config": None, + "conv_transpose_kernel_initializer": None, + "conv_transpose_kernel_initializer_config": None, + "conv_transpose_bias_initializer": None, + "conv_transpose_bias_initializer_config": None, + "post_fcnet_hiddens": [], + "post_fcnet_activation": "relu", + "post_fcnet_weights_initializer": None, + "post_fcnet_weights_initializer_config": None, + "post_fcnet_bias_initializer": None, + "post_fcnet_bias_initializer_config": None, + "free_log_std": False, + "log_std_clip_param": 20.0, + "no_final_linear": False, + "vf_share_layers": True, + "use_lstm": False, + "max_seq_len": 20, + "lstm_cell_size": 256, + "lstm_use_prev_action": False, + "lstm_use_prev_reward": False, + "lstm_weights_initializer": None, + "lstm_weights_initializer_config": None, + "lstm_bias_initializer": None, + "lstm_bias_initializer_config": None, + "_time_major": False, + "use_attention": False, + 
"attention_num_transformer_units": 1, + "attention_dim": 64, + "attention_num_heads": 1, + "attention_head_dim": 32, + "attention_memory_inference": 50, + "attention_memory_training": 50, + "attention_position_wise_mlp_dim": 32, + "attention_init_gru_gate_bias": 2.0, + "attention_use_n_prev_actions": 0, + "attention_use_n_prev_rewards": 0, + "framestack": True, + "dim": 84, + "grayscale": False, + "zero_mean": True, + "custom_model": None, + "custom_model_config": {}, + "custom_action_dist": None, + "custom_preprocessor": None, + "encoder_latent_dim": None, + "always_check_shapes": False, + + # Deprecated keys: + "lstm_use_prev_action_reward": DEPRECATED_VALUE, + "_use_default_native_models": DEPRECATED_VALUE, + "_disable_preprocessor_api": False, + "_disable_action_flattening": False, +} +# __sphinx_doc_end__ +# fmt: on + + +@DeveloperAPI +class ModelCatalog: + """Registry of models, preprocessors, and action distributions for envs. + + .. testcode:: + :skipif: True + + prep = ModelCatalog.get_preprocessor(env) + observation = prep.transform(raw_observation) + + dist_class, dist_dim = ModelCatalog.get_action_dist( + env.action_space, {}) + model = ModelCatalog.get_model_v2( + obs_space, action_space, num_outputs, options) + dist = dist_class(model.outputs, model) + action = dist.sample() + """ + + @staticmethod + @DeveloperAPI + def get_action_dist( + action_space: gym.Space, + config: ModelConfigDict, + dist_type: Optional[Union[str, Type[ActionDistribution]]] = None, + framework: str = "tf", + **kwargs + ) -> (type, int): + """Returns a distribution class and size for the given action space. + + Args: + action_space: Action space of the target gym env. + config (Optional[dict]): Optional model config. + dist_type (Optional[Union[str, Type[ActionDistribution]]]): + Identifier of the action distribution (str) interpreted as a + hint or the actual ActionDistribution class to use. + framework: One of "tf2", "tf", "torch", or "jax". 
+ kwargs: Optional kwargs to pass on to the Distribution's + constructor. + + Returns: + Tuple: + - dist_class (ActionDistribution): Python class of the + distribution. + - dist_dim (int): The size of the input vector to the + distribution. + """ + + dist_cls = None + config = config or MODEL_DEFAULTS + # Custom distribution given. + if config.get("custom_action_dist"): + custom_action_config = config.copy() + action_dist_name = custom_action_config.pop("custom_action_dist") + logger.debug("Using custom action distribution {}".format(action_dist_name)) + dist_cls = _global_registry.get(RLLIB_ACTION_DIST, action_dist_name) + return ModelCatalog._get_multi_action_distribution( + dist_cls, action_space, custom_action_config, framework + ) + + # Dist_type is given directly as a class. + elif ( + type(dist_type) is type + and issubclass(dist_type, ActionDistribution) + and dist_type not in (MultiActionDistribution, TorchMultiActionDistribution) + ): + dist_cls = dist_type + # Box space -> DiagGaussian OR Deterministic. + elif isinstance(action_space, Box): + if action_space.dtype.name.startswith("int"): + low_ = np.min(action_space.low) + high_ = np.max(action_space.high) + dist_cls = ( + TorchMultiCategorical if framework == "torch" else MultiCategorical + ) + num_cats = int(np.prod(action_space.shape)) + return ( + partial( + dist_cls, + input_lens=[high_ - low_ + 1 for _ in range(num_cats)], + action_space=action_space, + ), + num_cats * (high_ - low_ + 1), + ) + else: + if len(action_space.shape) > 1: + raise UnsupportedSpaceException( + "Action space has multiple dimensions " + "{}. ".format(action_space.shape) + + "Consider reshaping this into a single dimension, " + "using a custom action distribution, " + "using a Tuple action space, or the multi-agent API." + ) + # TODO(sven): Check for bounds and return SquashedNormal, etc.. 
+ if dist_type is None: + return ( + partial( + TorchDiagGaussian if framework == "torch" else DiagGaussian, + action_space=action_space, + ), + DiagGaussian.required_model_output_shape(action_space, config), + ) + elif dist_type == "deterministic": + dist_cls = ( + TorchDeterministic if framework == "torch" else Deterministic + ) + # Discrete Space -> Categorical. + elif isinstance(action_space, Discrete): + if framework == "torch": + dist_cls = TorchCategorical + elif framework == "jax": + from ray.rllib.models.jax.jax_action_dist import JAXCategorical + + dist_cls = JAXCategorical + else: + dist_cls = Categorical + # Tuple/Dict Spaces -> MultiAction. + elif dist_type in ( + MultiActionDistribution, + TorchMultiActionDistribution, + ) or isinstance(action_space, (Tuple, Dict)): + return ModelCatalog._get_multi_action_distribution( + ( + MultiActionDistribution + if framework == "tf" + else TorchMultiActionDistribution + ), + action_space, + config, + framework, + ) + # Simplex -> Dirichlet. + elif isinstance(action_space, Simplex): + dist_cls = TorchDirichlet if framework == "torch" else Dirichlet + # MultiDiscrete -> MultiCategorical. + elif isinstance(action_space, MultiDiscrete): + dist_cls = ( + TorchMultiCategorical if framework == "torch" else MultiCategorical + ) + return partial(dist_cls, input_lens=action_space.nvec), int( + sum(action_space.nvec) + ) + # Unknown type -> Error. + else: + raise NotImplementedError( + "Unsupported args: {} {}".format(action_space, dist_type) + ) + + return dist_cls, int(dist_cls.required_model_output_shape(action_space, config)) + + @staticmethod + @DeveloperAPI + def get_action_shape( + action_space: gym.Space, framework: str = "tf" + ) -> (np.dtype, List[int]): + """Returns action tensor dtype and shape for the action space. + + Args: + action_space: Action space of the target gym env. + framework: The framework identifier. One of "tf" or "torch". + + Returns: + (dtype, shape): Dtype and shape of the actions tensor. 
+ """ + dl_lib = torch if framework == "torch" else tf + if isinstance(action_space, Discrete): + return action_space.dtype, (None,) + elif isinstance(action_space, (Box, Simplex)): + if np.issubdtype(action_space.dtype, np.floating): + return dl_lib.float32, (None,) + action_space.shape + elif np.issubdtype(action_space.dtype, np.integer): + return dl_lib.int32, (None,) + action_space.shape + else: + raise ValueError("RLlib doesn't support non int or float box spaces") + elif isinstance(action_space, MultiDiscrete): + return action_space.dtype, (None,) + action_space.shape + elif isinstance(action_space, (Tuple, Dict)): + flat_action_space = flatten_space(action_space) + size = 0 + all_discrete = True + for i in range(len(flat_action_space)): + if isinstance(flat_action_space[i], Discrete): + size += 1 + else: + all_discrete = False + size += np.prod(flat_action_space[i].shape) + size = int(size) + return dl_lib.int32 if all_discrete else dl_lib.float32, (None, size) + else: + raise NotImplementedError( + "Action space {} not supported".format(action_space) + ) + + @staticmethod + @DeveloperAPI + def get_action_placeholder( + action_space: gym.Space, name: str = "action" + ) -> TensorType: + """Returns an action placeholder consistent with the action space + + Args: + action_space: Action space of the target gym env. + name: An optional string to name the placeholder by. + Default: "action". 
+ + Returns: + action_placeholder: A placeholder for the actions + """ + dtype, shape = ModelCatalog.get_action_shape(action_space, framework="tf") + + return tf1.placeholder(dtype, shape=shape, name=name) + + @staticmethod + @DeveloperAPI + def get_model_v2( + obs_space: gym.Space, + action_space: gym.Space, + num_outputs: int, + model_config: ModelConfigDict, + framework: str = "tf", + name: str = "default_model", + model_interface: type = None, + default_model: type = None, + **model_kwargs + ) -> ModelV2: + """Returns a suitable model compatible with given spaces and output. + + Args: + obs_space: Observation space of the target gym env. This + may have an `original_space` attribute that specifies how to + unflatten the tensor into a ragged tensor. + action_space: Action space of the target gym env. + num_outputs: The size of the output vector of the model. + model_config: The "model" sub-config dict + within the Algorithm's config dict. + framework: One of "tf2", "tf", "torch", or "jax". + name: Name (scope) for the model. + model_interface: Interface required for the model + default_model: Override the default class for the model. This + only has an effect when not using a custom model + model_kwargs: Args to pass to the ModelV2 constructor + + Returns: + model (ModelV2): Model to use for the policy. + """ + + # Validate the given config dict. + ModelCatalog._validate_config( + config=model_config, action_space=action_space, framework=framework + ) + + if model_config.get("custom_model"): + # Allow model kwargs to be overridden / augmented by + # custom_model_config. + customized_model_kwargs = dict( + model_kwargs, **model_config.get("custom_model_config", {}) + ) + + if isinstance(model_config["custom_model"], type): + model_cls = model_config["custom_model"] + elif ( + isinstance(model_config["custom_model"], str) + and "." 
in model_config["custom_model"] + ): + return from_config( + cls=model_config["custom_model"], + obs_space=obs_space, + action_space=action_space, + num_outputs=num_outputs, + model_config=customized_model_kwargs, + name=name, + ) + else: + model_cls = _global_registry.get( + RLLIB_MODEL, model_config["custom_model"] + ) + + # Only allow ModelV2 or native keras Models. + if not issubclass(model_cls, ModelV2): + if framework not in ["tf", "tf2"] or not issubclass( + model_cls, tf.keras.Model + ): + raise ValueError( + "`model_cls` must be a ModelV2 sub-class, but is" + " {}!".format(model_cls) + ) + + logger.info("Wrapping {} as {}".format(model_cls, model_interface)) + model_cls = ModelCatalog._wrap_if_needed(model_cls, model_interface) + + if framework in ["tf2", "tf"]: + # Try wrapping custom model with LSTM/attention, if required. + if model_config.get("use_lstm") or model_config.get("use_attention"): + from ray.rllib.models.tf.attention_net import ( + AttentionWrapper, + ) + from ray.rllib.models.tf.recurrent_net import ( + LSTMWrapper, + ) + + wrapped_cls = model_cls + forward = wrapped_cls.forward + model_cls = ModelCatalog._wrap_if_needed( + wrapped_cls, + LSTMWrapper + if model_config.get("use_lstm") + else AttentionWrapper, + ) + model_cls._wrapped_forward = forward + + # Obsolete: Track and warn if vars were created but not + # registered. Only still do this, if users do register their + # variables. If not (which they shouldn't), don't check here. 
                # Obsolete variable-registration check: record every tf
                # variable created while instantiating the model so it can be
                # compared against what the user explicitly registered.
                created = set()

                def track_var_creation(next_creator, **kw):
                    v = next_creator(**kw)
                    # `.ref()` yields a hashable reference to the tf.Variable,
                    # suitable for set membership.
                    created.add(v.ref())
                    return v

                with tf.variable_creator_scope(track_var_creation):
                    if issubclass(model_cls, tf.keras.Model):
                        instance = model_cls(
                            input_space=obs_space,
                            action_space=action_space,
                            num_outputs=num_outputs,
                            name=name,
                            **customized_model_kwargs,
                        )
                    else:
                        # Try calling with kwargs first (custom ModelV2 should
                        # accept these as kwargs, not get them from
                        # config["custom_model_config"] anymore).
                        try:
                            instance = model_cls(
                                obs_space,
                                action_space,
                                num_outputs,
                                model_config,
                                name,
                                **customized_model_kwargs,
                            )
                        except TypeError as e:
                            # Keyword error: Try old way w/o kwargs.
                            # NOTE(review): this sniffs CPython's TypeError
                            # message text — brittle if the interpreter's
                            # wording changes; confirm against all supported
                            # Python versions.
                            if "__init__() got an unexpected " in e.args[0]:
                                instance = model_cls(
                                    obs_space,
                                    action_space,
                                    num_outputs,
                                    model_config,
                                    name,
                                    **model_kwargs,
                                )
                                logger.warning(
                                    "Custom ModelV2 should accept all custom "
                                    "options as **kwargs, instead of expecting"
                                    " them in config['custom_model_config']!"
                                )
                            # Other error -> re-raise.
                            else:
                                raise e

                # User still registered TFModelV2's variables: Check, whether
                # ok.
                registered = []
                if not isinstance(instance, tf.keras.Model):
                    # `var_list` presumably holds explicitly registered
                    # variables (TFModelV2.register_variables) — confirm.
                    registered = set(instance.var_list)
                if len(registered) > 0:
                    # Registration is all-or-nothing: if the user registered
                    # anything, every created variable must be registered.
                    not_registered = set()
                    for var in created:
                        if var not in registered:
                            not_registered.add(var)
                    if not_registered:
                        raise ValueError(
                            "It looks like you are still using "
                            "`{}.register_variables()` to register your "
                            "model's weights. This is no longer required, but "
                            "if you are still calling this method at least "
                            "once, you must make sure to register all created "
                            "variables properly. The missing variables are {},"
                            " and you only registered {}. "
                            "Did you forget to call `register_variables()` on "
                            "some of the variables in question?".format(
                                instance, not_registered, registered
                            )
                        )
            elif framework == "torch":
                # Try wrapping custom model with LSTM/attention, if required.
                if model_config.get("use_lstm") or model_config.get("use_attention"):
                    from ray.rllib.models.torch.attention_net import AttentionWrapper
                    from ray.rllib.models.torch.recurrent_net import LSTMWrapper

                    wrapped_cls = model_cls
                    forward = wrapped_cls.forward
                    model_cls = ModelCatalog._wrap_if_needed(
                        wrapped_cls,
                        LSTMWrapper
                        if model_config.get("use_lstm")
                        else AttentionWrapper,
                    )
                    # Keep a handle to the original forward so the wrapper can
                    # delegate to it.
                    model_cls._wrapped_forward = forward

                # PyTorch automatically tracks nn.Modules inside the parent
                # nn.Module's constructor.
                # Try calling with kwargs first (custom ModelV2 should
                # accept these as kwargs, not get them from
                # config["custom_model_config"] anymore).
                try:
                    instance = model_cls(
                        obs_space,
                        action_space,
                        num_outputs,
                        model_config,
                        name,
                        **customized_model_kwargs,
                    )
                except TypeError as e:
                    # Keyword error: Try old way w/o kwargs (same brittle
                    # message check as the tf branch above).
                    if "__init__() got an unexpected " in e.args[0]:
                        instance = model_cls(
                            obs_space,
                            action_space,
                            num_outputs,
                            model_config,
                            name,
                            **model_kwargs,
                        )
                        logger.warning(
                            "Custom ModelV2 should accept all custom "
                            "options as **kwargs, instead of expecting"
                            " them in config['custom_model_config']!"
                        )
                    # Other error -> re-raise.
                    else:
                        raise e
            else:
                raise NotImplementedError(
                    "`framework` must be 'tf2|tf|torch', but is "
                    "{}!".format(framework)
                )

            # Custom-model path ends here: hand back the built instance.
            return instance

        # Find a default TFModelV2 and wrap with model_interface.
        if framework in ["tf", "tf2"]:
            v2_class = None
            # Try to get a default v2 model.
+ if not model_config.get("custom_model"): + v2_class = default_model or ModelCatalog._get_v2_model_class( + obs_space, model_config, framework=framework + ) + + if not v2_class: + raise ValueError("ModelV2 class could not be determined!") + + if model_config.get("use_lstm") or model_config.get("use_attention"): + from ray.rllib.models.tf.attention_net import ( + AttentionWrapper, + ) + from ray.rllib.models.tf.recurrent_net import ( + LSTMWrapper, + ) + + wrapped_cls = v2_class + if model_config.get("use_lstm"): + v2_class = ModelCatalog._wrap_if_needed(wrapped_cls, LSTMWrapper) + v2_class._wrapped_forward = wrapped_cls.forward + else: + v2_class = ModelCatalog._wrap_if_needed( + wrapped_cls, AttentionWrapper + ) + v2_class._wrapped_forward = wrapped_cls.forward + + # Wrap in the requested interface. + wrapper = ModelCatalog._wrap_if_needed(v2_class, model_interface) + + if issubclass(wrapper, tf.keras.Model): + model = wrapper( + input_space=obs_space, + action_space=action_space, + num_outputs=num_outputs, + name=name, + **dict(model_kwargs, **model_config), + ) + return model + + return wrapper( + obs_space, action_space, num_outputs, model_config, name, **model_kwargs + ) + + # Find a default TorchModelV2 and wrap with model_interface. + elif framework == "torch": + # Try to get a default v2 model. 
+ if not model_config.get("custom_model"): + v2_class = default_model or ModelCatalog._get_v2_model_class( + obs_space, model_config, framework=framework + ) + + if not v2_class: + raise ValueError("ModelV2 class could not be determined!") + + if model_config.get("use_lstm") or model_config.get("use_attention"): + from ray.rllib.models.torch.attention_net import AttentionWrapper + from ray.rllib.models.torch.recurrent_net import LSTMWrapper + + wrapped_cls = v2_class + forward = wrapped_cls.forward + if model_config.get("use_lstm"): + v2_class = ModelCatalog._wrap_if_needed(wrapped_cls, LSTMWrapper) + else: + v2_class = ModelCatalog._wrap_if_needed( + wrapped_cls, AttentionWrapper + ) + + v2_class._wrapped_forward = forward + + # Wrap in the requested interface. + wrapper = ModelCatalog._wrap_if_needed(v2_class, model_interface) + return wrapper( + obs_space, action_space, num_outputs, model_config, name, **model_kwargs + ) + + # Find a default JAXModelV2 and wrap with model_interface. + elif framework == "jax": + v2_class = default_model or ModelCatalog._get_v2_model_class( + obs_space, model_config, framework=framework + ) + # Wrap in the requested interface. + wrapper = ModelCatalog._wrap_if_needed(v2_class, model_interface) + return wrapper( + obs_space, action_space, num_outputs, model_config, name, **model_kwargs + ) + else: + raise NotImplementedError( + "`framework` must be 'tf2|tf|torch', but is " "{}!".format(framework) + ) + + @staticmethod + @DeveloperAPI + def get_preprocessor( + env: gym.Env, options: Optional[dict] = None, include_multi_binary: bool = False + ) -> Preprocessor: + """Returns a suitable preprocessor for the given env. + + This is a wrapper for get_preprocessor_for_space(). 
+ """ + + return ModelCatalog.get_preprocessor_for_space( + env.observation_space, options, include_multi_binary + ) + + @staticmethod + @DeveloperAPI + def get_preprocessor_for_space( + observation_space: gym.Space, + options: dict = None, + include_multi_binary: bool = False, + ) -> Preprocessor: + """Returns a suitable preprocessor for the given observation space. + + Args: + observation_space: The input observation space. + options: Options to pass to the preprocessor. + include_multi_binary: Whether to include the MultiBinaryPreprocessor in + the possible preprocessors returned by this method. + + Returns: + preprocessor: Preprocessor for the observations. + """ + + options = options or MODEL_DEFAULTS + for k in options.keys(): + if k not in MODEL_DEFAULTS: + raise Exception( + "Unknown config key `{}`, all keys: {}".format( + k, list(MODEL_DEFAULTS) + ) + ) + + cls = get_preprocessor( + observation_space, include_multi_binary=include_multi_binary + ) + prep = cls(observation_space, options) + + if prep is not None: + logger.debug( + "Created preprocessor {}: {} -> {}".format( + prep, observation_space, prep.shape + ) + ) + return prep + + @staticmethod + @PublicAPI + def register_custom_model(model_name: str, model_class: type) -> None: + """Register a custom model class by name. + + The model can be later used by specifying {"custom_model": model_name} + in the model config. + + Args: + model_name: Name to register the model under. + model_class: Python class of the model. + """ + if tf is not None: + if issubclass(model_class, tf.keras.Model): + deprecation_warning(old="register_custom_model", error=False) + _global_registry.register(RLLIB_MODEL, model_name, model_class) + + @staticmethod + @PublicAPI + def register_custom_action_dist( + action_dist_name: str, action_dist_class: type + ) -> None: + """Register a custom action distribution class by name. 
+ + The model can be later used by specifying + {"custom_action_dist": action_dist_name} in the model config. + + Args: + model_name: Name to register the action distribution under. + model_class: Python class of the action distribution. + """ + _global_registry.register( + RLLIB_ACTION_DIST, action_dist_name, action_dist_class + ) + + @staticmethod + def _wrap_if_needed(model_cls: type, model_interface: type) -> type: + if not model_interface or issubclass(model_cls, model_interface): + return model_cls + + assert issubclass(model_cls, ModelV2), model_cls + + class wrapper(model_interface, model_cls): + pass + + name = "{}_as_{}".format(model_cls.__name__, model_interface.__name__) + wrapper.__name__ = name + wrapper.__qualname__ = name + + return wrapper + + @staticmethod + def _get_v2_model_class( + input_space: gym.Space, model_config: ModelConfigDict, framework: str = "tf" + ) -> Type[ModelV2]: + VisionNet = None + ComplexNet = None + + if framework in ["tf2", "tf"]: + from ray.rllib.models.tf.fcnet import ( + FullyConnectedNetwork as FCNet, + ) + from ray.rllib.models.tf.visionnet import ( + VisionNetwork as VisionNet, + ) + from ray.rllib.models.tf.complex_input_net import ( + ComplexInputNetwork as ComplexNet, + ) + elif framework == "torch": + from ray.rllib.models.torch.fcnet import FullyConnectedNetwork as FCNet + from ray.rllib.models.torch.visionnet import VisionNetwork as VisionNet + from ray.rllib.models.torch.complex_input_net import ( + ComplexInputNetwork as ComplexNet, + ) + elif framework == "jax": + from ray.rllib.models.jax.fcnet import FullyConnectedNetwork as FCNet + else: + raise ValueError( + "framework={} not supported in `ModelCatalog._get_v2_model_" + "class`!".format(framework) + ) + + orig_space = ( + input_space + if not hasattr(input_space, "original_space") + else input_space.original_space + ) + + # `input_space` is 3D Box -> VisionNet. 
+ if isinstance(input_space, Box) and len(input_space.shape) == 3: + if framework == "jax": + raise NotImplementedError("No non-FC default net for JAX yet!") + return VisionNet + # `input_space` is 1D Box -> FCNet. + elif ( + isinstance(input_space, Box) + and len(input_space.shape) == 1 + and ( + not isinstance(orig_space, (Dict, Tuple)) + or not any( + isinstance(s, Box) and len(s.shape) >= 2 + for s in flatten_space(orig_space) + ) + ) + ): + return FCNet + # Complex (Dict, Tuple, 2D Box (flatten), Discrete, MultiDiscrete). + else: + if framework == "jax": + raise NotImplementedError("No non-FC default net for JAX yet!") + return ComplexNet + + @staticmethod + def _get_multi_action_distribution(dist_class, action_space, config, framework): + # In case the custom distribution is a child of MultiActionDistr. + # If users want to completely ignore the suggested child + # distributions, they should simply do so in their custom class' + # constructor. + if issubclass( + dist_class, (MultiActionDistribution, TorchMultiActionDistribution) + ): + flat_action_space = flatten_space(action_space) + child_dists_and_in_lens = tree.map_structure( + lambda s: ModelCatalog.get_action_dist(s, config, framework=framework), + flat_action_space, + ) + child_dists = [e[0] for e in child_dists_and_in_lens] + input_lens = [int(e[1]) for e in child_dists_and_in_lens] + return ( + partial( + dist_class, + action_space=action_space, + child_distributions=child_dists, + input_lens=input_lens, + ), + int(sum(input_lens)), + ) + return dist_class, dist_class.required_model_output_shape(action_space, config) + + @staticmethod + def _validate_config( + config: ModelConfigDict, action_space: gym.spaces.Space, framework: str + ) -> None: + """Validates a given model config dict. + + Args: + config: The "model" sub-config dict + within the Algorithm's config dict. + action_space: The action space of the model, whose config are + validated. + framework: One of "jax", "tf2", "tf", or "torch". 
+ + Raises: + ValueError: If something is wrong with the given config. + """ + # Soft-deprecate custom preprocessors. + if config.get("custom_preprocessor") is not None: + deprecation_warning( + old="model.custom_preprocessor", + new="gym.ObservationWrapper around your env or handle complex " + "inputs inside your Model", + error=True, + ) + + if config.get("use_attention") and config.get("use_lstm"): + raise ValueError( + "Only one of `use_lstm` or `use_attention` may be set to True!" + ) + + # For complex action spaces, only allow prev action inputs to + # LSTMs and attention nets iff `_disable_action_flattening=True`. + # TODO: `_disable_action_flattening=True` will be the default in + # the future. + if ( + ( + config.get("lstm_use_prev_action") + or config.get("attention_use_n_prev_actions", 0) > 0 + ) + and not config.get("_disable_action_flattening") + and isinstance(action_space, (Tuple, Dict)) + ): + raise ValueError( + "For your complex action space (Tuple|Dict) and your model's " + "`prev-actions` setup of your model, you must set " + "`_disable_action_flattening=True` in your main config dict!" + ) + + if framework == "jax": + if config.get("use_attention"): + raise ValueError( + "`use_attention` not available for framework=jax so far!" 
+ ) + elif config.get("use_lstm"): + raise ValueError("`use_lstm` not available for framework=jax so far!") diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/distributions.py b/.venv/lib/python3.11/site-packages/ray/rllib/models/distributions.py new file mode 100644 index 0000000000000000000000000000000000000000..bda55acd27702f940fac46ad7835f4b91c112f53 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/models/distributions.py @@ -0,0 +1,248 @@ +"""This is the next version of action distribution base class.""" +from typing import Tuple +import gymnasium as gym +import abc + +from ray.rllib.utils.annotations import ExperimentalAPI +from ray.rllib.utils.typing import TensorType, Union +from ray.rllib.utils.annotations import override + + +@ExperimentalAPI +class Distribution(abc.ABC): + """The base class for distribution over a random variable. + + Examples: + + .. testcode:: + + import torch + from ray.rllib.core.models.configs import MLPHeadConfig + from ray.rllib.models.torch.torch_distributions import TorchCategorical + + model = MLPHeadConfig(input_dims=[1]).build(framework="torch") + + # Create an action distribution from model logits + action_logits = model(torch.Tensor([[1]])) + action_dist = TorchCategorical.from_logits(action_logits) + action = action_dist.sample() + + # Create another distribution from a dummy Tensor + action_dist2 = TorchCategorical.from_logits(torch.Tensor([0])) + + # Compute some common metrics + logp = action_dist.logp(action) + kl = action_dist.kl(action_dist2) + entropy = action_dist.entropy() + """ + + @abc.abstractmethod + def sample( + self, + *, + sample_shape: Tuple[int, ...] = None, + return_logp: bool = False, + **kwargs, + ) -> Union[TensorType, Tuple[TensorType, TensorType]]: + """Draw a sample from the distribution. + + Args: + sample_shape: The shape of the sample to draw. + return_logp: Whether to return the logp of the sampled values. + **kwargs: Forward compatibility placeholder. 
+ + Returns: + The sampled values. If return_logp is True, returns a tuple of the + sampled values and its logp. + """ + + @abc.abstractmethod + def rsample( + self, + *, + sample_shape: Tuple[int, ...] = None, + return_logp: bool = False, + **kwargs, + ) -> Union[TensorType, Tuple[TensorType, TensorType]]: + """Draw a re-parameterized sample from the action distribution. + + If this method is implemented, we can take gradients of samples w.r.t. the + distribution parameters. + + Args: + sample_shape: The shape of the sample to draw. + return_logp: Whether to return the logp of the sampled values. + **kwargs: Forward compatibility placeholder. + + Returns: + The sampled values. If return_logp is True, returns a tuple of the + sampled values and its logp. + """ + + @abc.abstractmethod + def logp(self, value: TensorType, **kwargs) -> TensorType: + """The log-likelihood of the distribution computed at `value` + + Args: + value: The value to compute the log-likelihood at. + **kwargs: Forward compatibility placeholder. + + Returns: + The log-likelihood of the value. + """ + + @abc.abstractmethod + def kl(self, other: "Distribution", **kwargs) -> TensorType: + """The KL-divergence between two distributions. + + Args: + other: The other distribution. + **kwargs: Forward compatibility placeholder. + + Returns: + The KL-divergence between the two distributions. + """ + + @abc.abstractmethod + def entropy(self, **kwargs) -> TensorType: + """The entropy of the distribution. + + Args: + **kwargs: Forward compatibility placeholder. + + Returns: + The entropy of the distribution. + """ + + @staticmethod + @abc.abstractmethod + def required_input_dim(space: gym.Space, **kwargs) -> int: + """Returns the required length of an input parameter tensor. + + Args: + space: The space this distribution will be used for, + whose shape attributes will be used to determine the required shape of + the input parameter tensor. + **kwargs: Forward compatibility placeholder. 

        Returns:
            size of the required input vector (minus leading batch dimension).
        """

    @classmethod
    def from_logits(cls, logits: TensorType, **kwargs) -> "Distribution":
        """Creates a Distribution from logits.

        The caller does not need to have knowledge of the distribution class in order
        to create it and sample from it. The passed batched logits vectors might be
        split up and are passed to the distribution class' constructor as kwargs.

        Args:
            logits: The logits to create the distribution from.
            **kwargs: Forward compatibility placeholder.

        Returns:
            The created distribution.

        .. testcode::

            import numpy as np
            from ray.rllib.models.distributions import Distribution

            class Uniform(Distribution):
                def __init__(self, lower, upper):
                    self.lower = lower
                    self.upper = upper

                def sample(self):
                    return self.lower + (self.upper - self.lower) * np.random.rand()

                def logp(self, x):
                    ...

                def kl(self, other):
                    ...

                def entropy(self):
                    ...

                @staticmethod
                def required_input_dim(space):
                    ...

                def rsample(self):
                    ...

                @classmethod
                def from_logits(cls, logits, **kwargs):
                    return Uniform(logits[:, 0], logits[:, 1])

            logits = np.array([[0.0, 1.0], [2.0, 3.0]])
            my_dist = Uniform.from_logits(logits)
            sample = my_dist.sample()
        """
        raise NotImplementedError

    @classmethod
    def get_partial_dist_cls(
        parent_cls: "Distribution", **partial_kwargs
    ) -> "Distribution":
        """Returns a partial child of TorchMultiActionDistribution.

        This is useful if inputs needed to instantiate the Distribution from logits
        are available, but the logits are not.
        """
        # NOTE(review): since this is a @classmethod, `parent_cls` is the class
        # this is invoked on (conventionally named `cls`); upstream naming kept.

        # Subclass that captures `partial_kwargs` via closure and re-injects
        # them once logits become available.
        class DistributionPartial(parent_cls):
            def __init__(self, *args, **kwargs):
                super().__init__(*args, **kwargs)

            @staticmethod
            def _merge_kwargs(**kwargs):
                """Checks if keys in kwargs don't clash with partial_kwargs."""
                overlap = set(kwargs) & set(partial_kwargs)
                if overlap:
                    raise ValueError(
                        f"Cannot override the following kwargs: {overlap}.\n"
                        f"This is because they were already set at the time this "
                        f"partial class was defined."
                    )
                # Call-time kwargs are merged after the captured ones; the
                # overlap check above guarantees no silent override.
                merged_kwargs = {**partial_kwargs, **kwargs}
                return merged_kwargs

            @classmethod
            @override(parent_cls)
            def required_input_dim(cls, space: gym.Space, **kwargs) -> int:
                merged_kwargs = cls._merge_kwargs(**kwargs)
                # Sanity check: the captured kwargs must contain the same
                # `space` the caller asks about (assumes a "space" key was
                # provided at partial-creation time — confirm with callers).
                assert space == merged_kwargs["space"]
                return parent_cls.required_input_dim(**merged_kwargs)

            @classmethod
            @override(parent_cls)
            def from_logits(
                cls,
                logits: TensorType,
                **kwargs,
            ) -> "DistributionPartial":
                merged_kwargs = cls._merge_kwargs(**kwargs)
                distribution = parent_cls.from_logits(logits, **merged_kwargs)
                # Replace the class of the returned distribution with this partial
                # This makes it so that we can use type() on this distribution and
                # get back the partial class.
                distribution.__class__ = cls
                return distribution

        # Substitute name of this partial class to match the original class.
        # NOTE(review): the f-string formats the class object itself, so
        # __name__ becomes e.g. "<class 'pkg.Foo'>Partial" rather than
        # "FooPartial"; `parent_cls.__name__` was presumably intended —
        # upstream behavior preserved here.
        DistributionPartial.__name__ = f"{parent_cls}Partial"

        return DistributionPartial

    def to_deterministic(self) -> "Distribution":
        """Returns a deterministic equivalent for this distribution.

        Specifically, the deterministic equivalent for a Categorical distribution is a
        Deterministic distribution that selects the action with maximum logit value.
        Generally, the choice of the deterministic replacement is informed by
        established conventions.
+ """ + return self diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/modelv2.py b/.venv/lib/python3.11/site-packages/ray/rllib/models/modelv2.py new file mode 100644 index 0000000000000000000000000000000000000000..df07150e57bac2ead3c235f0abfe23f696921b5f --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/models/modelv2.py @@ -0,0 +1,471 @@ +from collections import OrderedDict +import contextlib +import gymnasium as gym +from gymnasium.spaces import Space +import numpy as np +from typing import Dict, List, Any, Union + +from ray.rllib.models.preprocessors import get_preprocessor, RepeatedValuesPreprocessor +from ray.rllib.models.repeated_values import RepeatedValues +from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.policy.view_requirement import ViewRequirement +from ray.rllib.utils import NullContextManager +from ray.rllib.utils.annotations import OldAPIStack +from ray.rllib.utils.deprecation import Deprecated +from ray.rllib.utils.framework import try_import_tf, try_import_torch, TensorType +from ray.rllib.utils.spaces.repeated import Repeated +from ray.rllib.utils.typing import ModelConfigDict, ModelInputDict, TensorStructType + +tf1, tf, tfv = try_import_tf() +torch, _ = try_import_torch() + + +@OldAPIStack +class ModelV2: + r"""Defines an abstract neural network model for use with RLlib. + + Custom models should extend either TFModelV2 or TorchModelV2 instead of + this class directly. + + Data flow: + obs -> forward() -> model_out + \-> value_function() -> V(s) + """ + + def __init__( + self, + obs_space: Space, + action_space: Space, + num_outputs: int, + model_config: ModelConfigDict, + name: str, + framework: str, + ): + """Initializes a ModelV2 instance. + + This method should create any variables used by the model. + + Args: + obs_space: Observation space of the target gym + env. This may have an `original_space` attribute that + specifies how to unflatten the tensor into a ragged tensor. 
+ action_space: Action space of the target gym + env. + num_outputs: Number of output units of the model. + model_config: Config for the model, documented + in ModelCatalog. + name: Name (scope) for the model. + framework: Either "tf" or "torch". + """ + + self.obs_space: Space = obs_space + self.action_space: Space = action_space + self.num_outputs: int = num_outputs + self.model_config: ModelConfigDict = model_config + self.name: str = name or "default_model" + self.framework: str = framework + self._last_output = None + self.time_major = self.model_config.get("_time_major") + # Basic view requirement for all models: Use the observation as input. + self.view_requirements = { + SampleBatch.OBS: ViewRequirement(shift=0, space=self.obs_space), + } + + def get_initial_state(self) -> List[TensorType]: + """Get the initial recurrent state values for the model. + + Returns: + List of np.array (for tf) or Tensor (for torch) objects containing the + initial hidden state of an RNN, if applicable. + + .. testcode:: + :skipif: True + + import numpy as np + from ray.rllib.models.modelv2 import ModelV2 + class MyModel(ModelV2): + # ... + def get_initial_state(self): + return [ + np.zeros(self.cell_size, np.float32), + np.zeros(self.cell_size, np.float32), + ] + """ + return [] + + def forward( + self, + input_dict: Dict[str, TensorType], + state: List[TensorType], + seq_lens: TensorType, + ) -> (TensorType, List[TensorType]): + """Call the model with the given input tensors and state. + + Any complex observations (dicts, tuples, etc.) will be unpacked by + __call__ before being passed to forward(). To access the flattened + observation tensor, refer to input_dict["obs_flat"]. + + This method can be called any number of times. In eager execution, + each call to forward() will eagerly evaluate the model. In symbolic + execution, each call to forward creates a computation graph that + operates over the variables of this model (i.e., shares weights). 
+ + Custom models should override this instead of __call__. + + Args: + input_dict: dictionary of input tensors, including "obs", + "obs_flat", "prev_action", "prev_reward", "is_training", + "eps_id", "agent_id", "infos", and "t". + state: list of state tensors with sizes matching those + returned by get_initial_state + the batch dimension + seq_lens: 1d tensor holding input sequence lengths + + Returns: + A tuple consisting of the model output tensor of size + [BATCH, num_outputs] and the list of new RNN state(s) if any. + + .. testcode:: + :skipif: True + + import numpy as np + from ray.rllib.models.modelv2 import ModelV2 + class MyModel(ModelV2): + # ... + def forward(self, input_dict, state, seq_lens): + model_out, self._value_out = self.base_model( + input_dict["obs"]) + return model_out, state + """ + raise NotImplementedError + + def value_function(self) -> TensorType: + """Returns the value function output for the most recent forward pass. + + Note that a `forward` call has to be performed first, before this + methods can return anything and thus that calling this method does not + cause an extra forward pass through the network. + + Returns: + Value estimate tensor of shape [BATCH]. + """ + raise NotImplementedError + + def custom_loss( + self, policy_loss: TensorType, loss_inputs: Dict[str, TensorType] + ) -> Union[List[TensorType], TensorType]: + """Override to customize the loss function used to optimize this model. + + This can be used to incorporate self-supervised losses (by defining + a loss over existing input and output tensors of this model), and + supervised losses (by defining losses over a variable-sharing copy of + this model's layers). + + You can find an runnable example in examples/custom_loss.py. + + Args: + policy_loss: List of or single policy loss(es) from the policy. + loss_inputs: map of input placeholders for rollout data. + + Returns: + List of or scalar tensor for the customized loss(es) for this + model. 
+ """ + return policy_loss + + def metrics(self) -> Dict[str, TensorType]: + """Override to return custom metrics from your model. + + The stats will be reported as part of the learner stats, i.e., + info.learner.[policy_id, e.g. "default_policy"].model.key1=metric1 + + Returns: + The custom metrics for this model. + """ + return {} + + def __call__( + self, + input_dict: Union[SampleBatch, ModelInputDict], + state: List[Any] = None, + seq_lens: TensorType = None, + ) -> (TensorType, List[TensorType]): + """Call the model with the given input tensors and state. + + This is the method used by RLlib to execute the forward pass. It calls + forward() internally after unpacking nested observation tensors. + + Custom models should override forward() instead of __call__. + + Args: + input_dict: Dictionary of input tensors. + state: list of state tensors with sizes matching those + returned by get_initial_state + the batch dimension + seq_lens: 1D tensor holding input sequence lengths. + + Returns: + A tuple consisting of the model output tensor of size + [BATCH, output_spec.size] or a list of tensors corresponding to + output_spec.shape_list, and a list of state tensors of + [BATCH, state_size_i] if any. + """ + + # Original observations will be stored in "obs". + # Flattened (preprocessed) obs will be stored in "obs_flat". + + # SampleBatch case: Models can now be called directly with a + # SampleBatch (which also includes tracking-dict case (deprecated now), + # where tensors get automatically converted). + if isinstance(input_dict, SampleBatch): + restored = input_dict.copy(shallow=True) + else: + restored = input_dict.copy() + + # Backward compatibility. + if not state: + state = [] + i = 0 + while "state_in_{}".format(i) in input_dict: + state.append(input_dict["state_in_{}".format(i)]) + i += 1 + if seq_lens is None: + seq_lens = input_dict.get(SampleBatch.SEQ_LENS) + + # No Preprocessor used: `config._disable_preprocessor_api`=True. 
+ # TODO: This is unnecessary for when no preprocessor is used. + # Obs are not flat then anymore. However, we'll keep this + # here for backward-compatibility until Preprocessors have + # been fully deprecated. + if self.model_config.get("_disable_preprocessor_api"): + restored["obs_flat"] = input_dict["obs"] + # Input to this Model went through a Preprocessor. + # Generate extra keys: "obs_flat" (vs "obs", which will hold the + # original obs). + else: + restored["obs"] = restore_original_dimensions( + input_dict["obs"], self.obs_space, self.framework + ) + try: + if len(input_dict["obs"].shape) > 2: + restored["obs_flat"] = flatten(input_dict["obs"], self.framework) + else: + restored["obs_flat"] = input_dict["obs"] + except AttributeError: + restored["obs_flat"] = input_dict["obs"] + + with self.context(): + res = self.forward(restored, state or [], seq_lens) + + if isinstance(input_dict, SampleBatch): + input_dict.accessed_keys = restored.accessed_keys - {"obs_flat"} + input_dict.deleted_keys = restored.deleted_keys + input_dict.added_keys = restored.added_keys - {"obs_flat"} + + if (not isinstance(res, list) and not isinstance(res, tuple)) or len(res) != 2: + raise ValueError( + "forward() must return a tuple of (output, state) tensors, " + "got {}".format(res) + ) + outputs, state_out = res + + if not isinstance(state_out, list): + raise ValueError("State output is not a list: {}".format(state_out)) + + self._last_output = outputs + return outputs, state_out if len(state_out) > 0 else (state or []) + + def last_output(self) -> TensorType: + """Returns the last output returned from calling the model.""" + return self._last_output + + def context(self) -> contextlib.AbstractContextManager: + """Returns a contextmanager for the current forward pass.""" + return NullContextManager() + + def variables( + self, as_dict: bool = False + ) -> Union[List[TensorType], Dict[str, TensorType]]: + """Returns the list (or a dict) of variables for this model. 
+ + Args: + as_dict: Whether variables should be returned as dict-values + (using descriptive str keys). + + Returns: + The list (or dict if `as_dict` is True) of all variables of this + ModelV2. + """ + raise NotImplementedError + + def trainable_variables( + self, as_dict: bool = False + ) -> Union[List[TensorType], Dict[str, TensorType]]: + """Returns the list of trainable variables for this model. + + Args: + as_dict: Whether variables should be returned as dict-values + (using descriptive keys). + + Returns: + The list (or dict if `as_dict` is True) of all trainable + (tf)/requires_grad (torch) variables of this ModelV2. + """ + raise NotImplementedError + + def is_time_major(self) -> bool: + """If True, data for calling this ModelV2 must be in time-major format. + + Returns + Whether this ModelV2 requires a time-major (TxBx...) data + format. + """ + return self.time_major is True + + @Deprecated(error=True) + def import_from_h5(self, *args, **kwargs): + pass + + +@OldAPIStack +def flatten(obs: TensorType, framework: str) -> TensorType: + """Flatten the given tensor.""" + if framework in ["tf2", "tf"]: + return tf1.keras.layers.Flatten()(obs) + elif framework == "torch": + assert torch is not None + return torch.flatten(obs, start_dim=1) + else: + raise NotImplementedError("flatten", framework) + + +@OldAPIStack +def restore_original_dimensions( + obs: TensorType, obs_space: Space, tensorlib: Any = tf +) -> TensorStructType: + """Unpacks Dict and Tuple space observations into their original form. + + This is needed since we flatten Dict and Tuple observations in transit + within a SampleBatch. Before sending them to the model though, we should + unflatten them into Dicts or Tuples of tensors. + + Args: + obs: The flattened observation tensor. + obs_space: The flattened obs space. If this has the + `original_space` attribute, we will unflatten the tensor to that + shape. + tensorlib: The library used to unflatten (reshape) the array/tensor. 
+ + Returns: + single tensor or dict / tuple of tensors matching the original + observation space. + """ + + if tensorlib in ["tf", "tf2"]: + assert tf is not None + tensorlib = tf + elif tensorlib == "torch": + assert torch is not None + tensorlib = torch + elif tensorlib == "numpy": + assert np is not None + tensorlib = np + original_space = getattr(obs_space, "original_space", obs_space) + return _unpack_obs(obs, original_space, tensorlib=tensorlib) + + +# Cache of preprocessors, for if the user is calling unpack obs often. +_cache = {} + + +@OldAPIStack +def _unpack_obs(obs: TensorType, space: Space, tensorlib: Any = tf) -> TensorStructType: + """Unpack a flattened Dict or Tuple observation array/tensor. + + Args: + obs: The flattened observation tensor, with last dimension equal to + the flat size and any number of batch dimensions. For example, for + Box(4,), the obs may have shape [B, 4], or [B, N, M, 4] in case + the Box was nested under two Repeated spaces. + space: The original space prior to flattening + tensorlib: The library used to unflatten (reshape) the array/tensor + """ + + if isinstance(space, (gym.spaces.Dict, gym.spaces.Tuple, Repeated)): + # Already unpacked? + if (isinstance(space, gym.spaces.Tuple) and isinstance(obs, (list, tuple))) or ( + isinstance(space, gym.spaces.Dict) and isinstance(obs, dict) + ): + return obs + # Unpack using preprocessor + if id(space) in _cache: + prep = _cache[id(space)] + else: + prep = get_preprocessor(space)(space) + # Make an attempt to cache the result, if enough space left. 
+ if len(_cache) < 999: + _cache[id(space)] = prep + if len(obs.shape) < 2 or obs.shape[-1] != prep.shape[0]: + raise ValueError( + "Expected flattened obs shape of [..., {}], got {}".format( + prep.shape[0], obs.shape + ) + ) + offset = 0 + if tensorlib == tf: + + def get_value(v): + if v is None: + return -1 + elif isinstance(v, int): + return v + elif v.value is None: + return -1 + else: + return v.value + + batch_dims = [get_value(v) for v in obs.shape[:-1]] + else: + batch_dims = list(obs.shape[:-1]) + if isinstance(space, gym.spaces.Tuple): + assert len(prep.preprocessors) == len(space.spaces), len( + prep.preprocessors + ) == len(space.spaces) + u = [] + for p, v in zip(prep.preprocessors, space.spaces): + obs_slice = obs[..., offset : offset + p.size] + offset += p.size + u.append( + _unpack_obs( + tensorlib.reshape(obs_slice, batch_dims + list(p.shape)), + v, + tensorlib=tensorlib, + ) + ) + elif isinstance(space, gym.spaces.Dict): + assert len(prep.preprocessors) == len(space.spaces), len( + prep.preprocessors + ) == len(space.spaces) + u = OrderedDict() + for p, (k, v) in zip(prep.preprocessors, space.spaces.items()): + obs_slice = obs[..., offset : offset + p.size] + offset += p.size + u[k] = _unpack_obs( + tensorlib.reshape(obs_slice, batch_dims + list(p.shape)), + v, + tensorlib=tensorlib, + ) + # Repeated space. + else: + assert isinstance(prep, RepeatedValuesPreprocessor), prep + child_size = prep.child_preprocessor.size + # The list lengths are stored in the first slot of the flat obs. + lengths = obs[..., 0] + # [B, ..., 1 + max_len * child_sz] -> [B, ..., max_len, child_sz] + with_repeat_dim = tensorlib.reshape( + obs[..., 1:], batch_dims + [space.max_len, child_size] + ) + # Retry the unpack, dropping the List container space. 
+ u = _unpack_obs(with_repeat_dim, space.child_space, tensorlib=tensorlib) + return RepeatedValues(u, lengths=lengths, max_len=prep._obs_space.max_len) + return u + else: + return obs diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/preprocessors.py b/.venv/lib/python3.11/site-packages/ray/rllib/models/preprocessors.py new file mode 100644 index 0000000000000000000000000000000000000000..ad15d0c155124e38ba7fb8c84b162b7a564d4c9a --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/models/preprocessors.py @@ -0,0 +1,447 @@ +from collections import OrderedDict +import logging +import numpy as np +import gymnasium as gym +from typing import Any, List + +from ray.rllib.utils.annotations import OldAPIStack, override +from ray.rllib.utils.spaces.repeated import Repeated +from ray.rllib.utils.typing import TensorType +from ray.rllib.utils.images import resize +from ray.rllib.utils.spaces.space_utils import convert_element_to_space_type + +ATARI_OBS_SHAPE = (210, 160, 3) +ATARI_RAM_OBS_SHAPE = (128,) + +# Only validate env observations vs the observation space every n times in a +# Preprocessor. +OBS_VALIDATION_INTERVAL = 100 + +logger = logging.getLogger(__name__) + + +@OldAPIStack +class Preprocessor: + """Defines an abstract observation preprocessor function. + + Attributes: + shape (List[int]): Shape of the preprocessed output. 
+ """ + + def __init__(self, obs_space: gym.Space, options: dict = None): + _legacy_patch_shapes(obs_space) + self._obs_space = obs_space + if not options: + from ray.rllib.models.catalog import MODEL_DEFAULTS + + self._options = MODEL_DEFAULTS.copy() + else: + self._options = options + self.shape = self._init_shape(obs_space, self._options) + self._size = int(np.prod(self.shape)) + self._i = 0 + self._obs_for_type_matching = self._obs_space.sample() + + def _init_shape(self, obs_space: gym.Space, options: dict) -> List[int]: + """Returns the shape after preprocessing.""" + raise NotImplementedError + + def transform(self, observation: TensorType) -> np.ndarray: + """Returns the preprocessed observation.""" + raise NotImplementedError + + def write(self, observation: TensorType, array: np.ndarray, offset: int) -> None: + """Alternative to transform for more efficient flattening.""" + array[offset : offset + self._size] = self.transform(observation) + + def check_shape(self, observation: Any) -> None: + """Checks the shape of the given observation.""" + if self._i % OBS_VALIDATION_INTERVAL == 0: + # Convert lists to np.ndarrays. 
+ if type(observation) is list and isinstance( + self._obs_space, gym.spaces.Box + ): + observation = np.array(observation).astype(np.float32) + if not self._obs_space.contains(observation): + observation = convert_element_to_space_type( + observation, self._obs_for_type_matching + ) + try: + if not self._obs_space.contains(observation): + raise ValueError( + "Observation ({} dtype={}) outside given space ({})!".format( + observation, + observation.dtype + if isinstance(self._obs_space, gym.spaces.Box) + else None, + self._obs_space, + ) + ) + except AttributeError as e: + raise ValueError( + "Observation for a Box/MultiBinary/MultiDiscrete space " + "should be an np.array, not a Python list.", + observation, + ) from e + self._i += 1 + + @property + def size(self) -> int: + return self._size + + @property + def observation_space(self) -> gym.Space: + obs_space = gym.spaces.Box(-1.0, 1.0, self.shape, dtype=np.float32) + # Stash the unwrapped space so that we can unwrap dict and tuple spaces + # automatically in modelv2.py + classes = ( + DictFlatteningPreprocessor, + OneHotPreprocessor, + RepeatedValuesPreprocessor, + TupleFlatteningPreprocessor, + AtariRamPreprocessor, + GenericPixelPreprocessor, + ) + if isinstance(self, classes): + obs_space.original_space = self._obs_space + return obs_space + + +@OldAPIStack +class GenericPixelPreprocessor(Preprocessor): + """Generic image preprocessor. + + Note: for Atari games, use config {"preprocessor_pref": "deepmind"} + instead for deepmind-style Atari preprocessing. 
+ """ + + @override(Preprocessor) + def _init_shape(self, obs_space: gym.Space, options: dict) -> List[int]: + self._grayscale = options.get("grayscale") + self._zero_mean = options.get("zero_mean") + self._dim = options.get("dim") + if self._grayscale: + shape = (self._dim, self._dim, 1) + else: + shape = (self._dim, self._dim, 3) + + return shape + + @override(Preprocessor) + def transform(self, observation: TensorType) -> np.ndarray: + """Downsamples images from (210, 160, 3) by the configured factor.""" + self.check_shape(observation) + scaled = observation[25:-25, :, :] + if self._dim < 84: + scaled = resize(scaled, height=84, width=84) + # OpenAI: Resize by half, then down to 42x42 (essentially mipmapping). + # If we resize directly we lose pixels that, when mapped to 42x42, + # aren't close enough to the pixel boundary. + scaled = resize(scaled, height=self._dim, width=self._dim) + if self._grayscale: + scaled = scaled.mean(2) + scaled = scaled.astype(np.float32) + # Rescale needed for maintaining 1 channel + scaled = np.reshape(scaled, [self._dim, self._dim, 1]) + if self._zero_mean: + scaled = (scaled - 128) / 128 + else: + scaled *= 1.0 / 255.0 + return scaled + + +@OldAPIStack +class AtariRamPreprocessor(Preprocessor): + @override(Preprocessor) + def _init_shape(self, obs_space: gym.Space, options: dict) -> List[int]: + return (128,) + + @override(Preprocessor) + def transform(self, observation: TensorType) -> np.ndarray: + self.check_shape(observation) + return (observation.astype("float32") - 128) / 128 + + +@OldAPIStack +class OneHotPreprocessor(Preprocessor): + """One-hot preprocessor for Discrete and MultiDiscrete spaces. + + .. testcode:: + :skipif: True + + self.transform(Discrete(3).sample()) + + .. testoutput:: + + np.array([0.0, 1.0, 0.0]) + + .. testcode:: + :skipif: True + + self.transform(MultiDiscrete([2, 3]).sample()) + + .. 
testoutput:: + + np.array([0.0, 1.0, 0.0, 0.0, 1.0]) + """ + + @override(Preprocessor) + def _init_shape(self, obs_space: gym.Space, options: dict) -> List[int]: + if isinstance(obs_space, gym.spaces.Discrete): + return (self._obs_space.n,) + else: + return (np.sum(self._obs_space.nvec),) + + @override(Preprocessor) + def transform(self, observation: TensorType) -> np.ndarray: + self.check_shape(observation) + return gym.spaces.utils.flatten(self._obs_space, observation).astype(np.float32) + + @override(Preprocessor) + def write(self, observation: TensorType, array: np.ndarray, offset: int) -> None: + array[offset : offset + self.size] = self.transform(observation) + + +@OldAPIStack +class NoPreprocessor(Preprocessor): + @override(Preprocessor) + def _init_shape(self, obs_space: gym.Space, options: dict) -> List[int]: + return self._obs_space.shape + + @override(Preprocessor) + def transform(self, observation: TensorType) -> np.ndarray: + self.check_shape(observation) + return observation + + @override(Preprocessor) + def write(self, observation: TensorType, array: np.ndarray, offset: int) -> None: + array[offset : offset + self._size] = np.array(observation, copy=False).ravel() + + @property + @override(Preprocessor) + def observation_space(self) -> gym.Space: + return self._obs_space + + +@OldAPIStack +class MultiBinaryPreprocessor(Preprocessor): + """Preprocessor that turns a MultiBinary space into a Box. + + Note: Before RLModules were introduced, RLlib's ModelCatalogV2 would produce + ComplexInputNetworks that treat MultiBinary spaces as Boxes. This preprocessor is + needed to get rid of the ComplexInputNetworks and use RLModules instead because + RLModules lack the logic to handle MultiBinary or other non-Box spaces. 
+ """ + + @override(Preprocessor) + def _init_shape(self, obs_space: gym.Space, options: dict) -> List[int]: + return self._obs_space.shape + + @override(Preprocessor) + def transform(self, observation: TensorType) -> np.ndarray: + # The shape stays the same, but the dtype changes. + self.check_shape(observation) + return observation.astype(np.float32) + + @override(Preprocessor) + def write(self, observation: TensorType, array: np.ndarray, offset: int) -> None: + array[offset : offset + self._size] = np.array(observation, copy=False).ravel() + + @property + @override(Preprocessor) + def observation_space(self) -> gym.Space: + obs_space = gym.spaces.Box(0.0, 1.0, self.shape, dtype=np.float32) + obs_space.original_space = self._obs_space + return obs_space + + +@OldAPIStack +class TupleFlatteningPreprocessor(Preprocessor): + """Preprocesses each tuple element, then flattens it all into a vector. + + RLlib models will unpack the flattened output before _build_layers_v2(). + """ + + @override(Preprocessor) + def _init_shape(self, obs_space: gym.Space, options: dict) -> List[int]: + assert isinstance(self._obs_space, gym.spaces.Tuple) + size = 0 + self.preprocessors = [] + for i in range(len(self._obs_space.spaces)): + space = self._obs_space.spaces[i] + logger.debug("Creating sub-preprocessor for {}".format(space)) + preprocessor_class = get_preprocessor(space) + if preprocessor_class is not None: + preprocessor = preprocessor_class(space, self._options) + size += preprocessor.size + else: + preprocessor = None + size += int(np.prod(space.shape)) + self.preprocessors.append(preprocessor) + return (size,) + + @override(Preprocessor) + def transform(self, observation: TensorType) -> np.ndarray: + self.check_shape(observation) + array = np.zeros(self.shape, dtype=np.float32) + self.write(observation, array, 0) + return array + + @override(Preprocessor) + def write(self, observation: TensorType, array: np.ndarray, offset: int) -> None: + assert len(observation) == 
len(self.preprocessors), observation + for o, p in zip(observation, self.preprocessors): + p.write(o, array, offset) + offset += p.size + + +@OldAPIStack +class DictFlatteningPreprocessor(Preprocessor): + """Preprocesses each dict value, then flattens it all into a vector. + + RLlib models will unpack the flattened output before _build_layers_v2(). + """ + + @override(Preprocessor) + def _init_shape(self, obs_space: gym.Space, options: dict) -> List[int]: + assert isinstance(self._obs_space, gym.spaces.Dict) + size = 0 + self.preprocessors = [] + for space in self._obs_space.spaces.values(): + logger.debug("Creating sub-preprocessor for {}".format(space)) + preprocessor_class = get_preprocessor(space) + if preprocessor_class is not None: + preprocessor = preprocessor_class(space, self._options) + size += preprocessor.size + else: + preprocessor = None + size += int(np.prod(space.shape)) + self.preprocessors.append(preprocessor) + return (size,) + + @override(Preprocessor) + def transform(self, observation: TensorType) -> np.ndarray: + self.check_shape(observation) + array = np.zeros(self.shape, dtype=np.float32) + self.write(observation, array, 0) + return array + + @override(Preprocessor) + def write(self, observation: TensorType, array: np.ndarray, offset: int) -> None: + if not isinstance(observation, OrderedDict): + observation = OrderedDict(sorted(observation.items())) + assert len(observation) == len(self.preprocessors), ( + len(observation), + len(self.preprocessors), + ) + for o, p in zip(observation.values(), self.preprocessors): + p.write(o, array, offset) + offset += p.size + + +@OldAPIStack +class RepeatedValuesPreprocessor(Preprocessor): + """Pads and batches the variable-length list value.""" + + @override(Preprocessor) + def _init_shape(self, obs_space: gym.Space, options: dict) -> List[int]: + assert isinstance(self._obs_space, Repeated) + child_space = obs_space.child_space + self.child_preprocessor = get_preprocessor(child_space)( + child_space, 
self._options + ) + # The first slot encodes the list length. + size = 1 + self.child_preprocessor.size * obs_space.max_len + return (size,) + + @override(Preprocessor) + def transform(self, observation: TensorType) -> np.ndarray: + array = np.zeros(self.shape) + if isinstance(observation, list): + for elem in observation: + self.child_preprocessor.check_shape(elem) + else: + pass # ValueError will be raised in write() below. + self.write(observation, array, 0) + return array + + @override(Preprocessor) + def write(self, observation: TensorType, array: np.ndarray, offset: int) -> None: + if not isinstance(observation, (list, np.ndarray)): + raise ValueError( + "Input for {} must be list type, got {}".format(self, observation) + ) + elif len(observation) > self._obs_space.max_len: + raise ValueError( + "Input {} exceeds max len of space {}".format( + observation, self._obs_space.max_len + ) + ) + # The first slot encodes the list length. + array[offset] = len(observation) + for i, elem in enumerate(observation): + offset_i = offset + 1 + i * self.child_preprocessor.size + self.child_preprocessor.write(elem, array, offset_i) + + +@OldAPIStack +def get_preprocessor(space: gym.Space, include_multi_binary=False) -> type: + """Returns an appropriate preprocessor class for the given space.""" + + _legacy_patch_shapes(space) + obs_shape = space.shape + + if isinstance(space, (gym.spaces.Discrete, gym.spaces.MultiDiscrete)): + preprocessor = OneHotPreprocessor + elif obs_shape == ATARI_OBS_SHAPE: + logger.debug( + "Defaulting to RLlib's GenericPixelPreprocessor because input " + "space has the atari-typical shape {}. 
Turn this behaviour off by setting " + "`preprocessor_pref=None` or " + "`preprocessor_pref='deepmind'` or disabling the preprocessing API " + "altogether with `_disable_preprocessor_api=True`.".format(ATARI_OBS_SHAPE) + ) + preprocessor = GenericPixelPreprocessor + elif obs_shape == ATARI_RAM_OBS_SHAPE: + logger.debug( + "Defaulting to RLlib's AtariRamPreprocessor because input " + "space has the atari-typical shape {}. Turn this behaviour off by setting " + "`preprocessor_pref=None` or " + "`preprocessor_pref='deepmind' or disabling the preprocessing API " + "altogether with `_disable_preprocessor_api=True`." + "`.".format(ATARI_OBS_SHAPE) + ) + preprocessor = AtariRamPreprocessor + elif isinstance(space, gym.spaces.Tuple): + preprocessor = TupleFlatteningPreprocessor + elif isinstance(space, gym.spaces.Dict): + preprocessor = DictFlatteningPreprocessor + elif isinstance(space, Repeated): + preprocessor = RepeatedValuesPreprocessor + # We usually only want to include this when using RLModules + elif isinstance(space, gym.spaces.MultiBinary) and include_multi_binary: + preprocessor = MultiBinaryPreprocessor + else: + preprocessor = NoPreprocessor + + return preprocessor + + +def _legacy_patch_shapes(space: gym.Space) -> List[int]: + """Assigns shapes to spaces that don't have shapes. + + This is only needed for older gym versions that don't set shapes properly + for Tuple and Discrete spaces. 
+ """ + + if not hasattr(space, "shape"): + if isinstance(space, gym.spaces.Discrete): + space.shape = () + elif isinstance(space, gym.spaces.Tuple): + shapes = [] + for s in space.spaces: + shape = _legacy_patch_shapes(s) + shapes.append(shape) + space.shape = tuple(shapes) + + return space.shape diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/repeated_values.py b/.venv/lib/python3.11/site-packages/ray/rllib/models/repeated_values.py new file mode 100644 index 0000000000000000000000000000000000000000..7ecef777f667bffec0008bb0b1fdf1aa7c88d52e --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/models/repeated_values.py @@ -0,0 +1,204 @@ +from typing import List + +from ray.rllib.utils.annotations import OldAPIStack +from ray.rllib.utils.typing import TensorType, TensorStructType + + +@OldAPIStack +class RepeatedValues: + """Represents a variable-length list of items from spaces.Repeated. + + RepeatedValues are created when you use spaces.Repeated, and are + accessible as part of input_dict["obs"] in ModelV2 forward functions. + + Example: + Suppose the gym space definition was: + Repeated(Repeated(Box(K), N), M) + + Then in the model forward function, input_dict["obs"] is of type: + RepeatedValues(RepeatedValues()) + + The tensor is accessible via: + input_dict["obs"].values.values + + And the actual data lengths via: + # outer repetition, shape [B], range [0, M] + input_dict["obs"].lengths + -and- + # inner repetition, shape [B, M], range [0, N] + input_dict["obs"].values.lengths + + Attributes: + values: The padded data tensor of shape [B, max_len, ..., sz], + where B is the batch dimension, max_len is the max length of this + list, followed by any number of sub list max lens, followed by the + actual data size. + lengths (List[int]): Tensor of shape [B, ...] that represents the + number of valid items in each list. When the list is nested within + other lists, there will be extra dimensions for the parent list + max lens. 
+ max_len: The max number of items allowed in each list. + + TODO(ekl): support conversion to tf.RaggedTensor. + """ + + def __init__(self, values: TensorType, lengths: List[int], max_len: int): + self.values = values + self.lengths = lengths + self.max_len = max_len + self._unbatched_repr = None + + def unbatch_all(self) -> List[List[TensorType]]: + """Unbatch both the repeat and batch dimensions into Python lists. + + This is only supported in PyTorch / TF eager mode. + + This lets you view the data unbatched in its original form, but is + not efficient for processing. + + .. testcode:: + :skipif: True + + batch = RepeatedValues() + items = batch.unbatch_all() + print(len(items) == B) + + .. testoutput:: + + True + + .. testcode:: + :skipif: True + + print(max(len(x) for x in items) <= N) + + .. testoutput:: + + True + + .. testcode:: + :skipif: True + + print(items) + + .. testoutput:: + + [[, ..., ], + ... + [, ], + ... + [], + ... + [, ..., ]] + """ + + if self._unbatched_repr is None: + B = _get_batch_dim_helper(self.values) + if B is None: + raise ValueError( + "Cannot call unbatch_all() when batch_dim is unknown. " + "This is probably because you are using TF graph mode." + ) + else: + B = int(B) + slices = self.unbatch_repeat_dim() + result = [] + for i in range(B): + if hasattr(self.lengths[i], "item"): + dynamic_len = int(self.lengths[i].item()) + else: + dynamic_len = int(self.lengths[i].numpy()) + dynamic_slice = [] + for j in range(dynamic_len): + dynamic_slice.append(_batch_index_helper(slices, i, j)) + result.append(dynamic_slice) + self._unbatched_repr = result + + return self._unbatched_repr + + def unbatch_repeat_dim(self) -> List[TensorType]: + """Unbatches the repeat dimension (the one `max_len` in size). + + This removes the repeat dimension. The result will be a Python list of + with length `self.max_len`. Note that the data is still padded. + + .. 
testcode:: + :skipif: True + + batch = RepeatedValues() + items = batch.unbatch() + len(items) == batch.max_len + + .. testoutput:: + + True + + .. testcode:: + :skipif: True + + print(items) + + .. testoutput:: + + [, ..., ] + """ + return _unbatch_helper(self.values, self.max_len) + + def __repr__(self): + return "RepeatedValues(value={}, lengths={}, max_len={})".format( + repr(self.values), repr(self.lengths), self.max_len + ) + + def __str__(self): + return repr(self) + + +def _get_batch_dim_helper(v: TensorStructType) -> int: + """Tries to find the batch dimension size of v, or None.""" + if isinstance(v, dict): + for u in v.values(): + return _get_batch_dim_helper(u) + elif isinstance(v, tuple): + return _get_batch_dim_helper(v[0]) + elif isinstance(v, RepeatedValues): + return _get_batch_dim_helper(v.values) + else: + B = v.shape[0] + if hasattr(B, "value"): + B = B.value # TensorFlow + return B + + +def _unbatch_helper(v: TensorStructType, max_len: int) -> TensorStructType: + """Recursively unpacks the repeat dimension (max_len).""" + if isinstance(v, dict): + return {k: _unbatch_helper(u, max_len) for (k, u) in v.items()} + elif isinstance(v, tuple): + return tuple(_unbatch_helper(u, max_len) for u in v) + elif isinstance(v, RepeatedValues): + unbatched = _unbatch_helper(v.values, max_len) + return [ + RepeatedValues(u, v.lengths[:, i, ...], v.max_len) + for i, u in enumerate(unbatched) + ] + else: + return [v[:, i, ...] for i in range(max_len)] + + +def _batch_index_helper(v: TensorStructType, i: int, j: int) -> TensorStructType: + """Selects the item at the ith batch index and jth repetition.""" + if isinstance(v, dict): + return {k: _batch_index_helper(u, i, j) for (k, u) in v.items()} + elif isinstance(v, tuple): + return tuple(_batch_index_helper(u, i, j) for u in v) + elif isinstance(v, list): + # This is the output of unbatch_repeat_dim(). 
Unfortunately we have to + # process it here instead of in unbatch_all(), since it may be buried + # under a dict / tuple. + return _batch_index_helper(v[j], i, j) + elif isinstance(v, RepeatedValues): + unbatched = v.unbatch_all() + # Don't need to select j here; that's already done in unbatch_all. + return unbatched[i] + else: + return v[i, ...] diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..86d33b39d455bcd6f43da151586444df41711b1c --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/__init__.py @@ -0,0 +1,11 @@ +from ray.rllib.models.tf.tf_modelv2 import TFModelV2 +from ray.rllib.models.tf.fcnet import FullyConnectedNetwork +from ray.rllib.models.tf.recurrent_net import RecurrentNetwork +from ray.rllib.models.tf.visionnet import VisionNetwork + +__all__ = [ + "FullyConnectedNetwork", + "RecurrentNetwork", + "TFModelV2", + "VisionNetwork", +] diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/__pycache__/complex_input_net.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/__pycache__/complex_input_net.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1ba6ff259ea095560cb79cc7d965365700583849 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/__pycache__/complex_input_net.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/__pycache__/misc.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/__pycache__/misc.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..477cb69891b019020d1dbcb8c66ac4b3d751387f Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/__pycache__/misc.cpython-311.pyc differ diff --git 
a/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/attention_net.py b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/attention_net.py new file mode 100644 index 0000000000000000000000000000000000000000..886580fce177a0e075eb2d252ef869e181f5ae1b --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/attention_net.py @@ -0,0 +1,573 @@ +""" +[1] - Attention Is All You Need - Vaswani, Jones, Shazeer, Parmar, + Uszkoreit, Gomez, Kaiser - Google Brain/Research, U Toronto - 2017. + https://arxiv.org/pdf/1706.03762.pdf +[2] - Stabilizing Transformers for Reinforcement Learning - E. Parisotto + et al. - DeepMind - 2019. https://arxiv.org/pdf/1910.06764.pdf +[3] - Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context. + Z. Dai, Z. Yang, et al. - Carnegie Mellon U - 2019. + https://www.aclweb.org/anthology/P19-1285.pdf +""" +import gymnasium as gym +from gymnasium.spaces import Box, Discrete, MultiDiscrete +import numpy as np +import tree # pip install dm_tree +from typing import Any, Dict, Optional, Union + +from ray.rllib.models.modelv2 import ModelV2 +from ray.rllib.models.tf.layers import ( + GRUGate, + RelativeMultiHeadAttention, + SkipConnection, +) +from ray.rllib.models.tf.tf_modelv2 import TFModelV2 +from ray.rllib.models.tf.recurrent_net import RecurrentNetwork +from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.policy.view_requirement import ViewRequirement +from ray.rllib.utils.annotations import OldAPIStack, override +from ray.rllib.utils.framework import try_import_tf +from ray.rllib.utils.spaces.space_utils import get_base_struct_from_space +from ray.rllib.utils.tf_utils import flatten_inputs_to_1d_tensor, one_hot +from ray.rllib.utils.typing import ModelConfigDict, TensorType, List +from ray.rllib.utils.deprecation import deprecation_warning +from ray.util import log_once + +tf1, tf, tfv = try_import_tf() + + +@OldAPIStack +class PositionwiseFeedforward(tf.keras.layers.Layer if tf else 
object): + """A 2x linear layer with ReLU activation in between described in [1]. + + Each timestep coming from the attention head will be passed through this + layer separately. + """ + + def __init__( + self, + out_dim: int, + hidden_dim: int, + output_activation: Optional[Any] = None, + **kwargs, + ): + super().__init__(**kwargs) + + self._hidden_layer = tf.keras.layers.Dense( + hidden_dim, + activation=tf.nn.relu, + ) + + self._output_layer = tf.keras.layers.Dense( + out_dim, activation=output_activation + ) + if log_once("positionwise_feedforward_tf"): + deprecation_warning( + old="rllib.models.tf.attention_net.PositionwiseFeedforward", + ) + + def call(self, inputs: TensorType, **kwargs) -> TensorType: + del kwargs + output = self._hidden_layer(inputs) + return self._output_layer(output) + + +@OldAPIStack +class TrXLNet(RecurrentNetwork): + """A TrXL net Model described in [1].""" + + def __init__( + self, + observation_space: gym.spaces.Space, + action_space: gym.spaces.Space, + num_outputs: int, + model_config: ModelConfigDict, + name: str, + num_transformer_units: int, + attention_dim: int, + num_heads: int, + head_dim: int, + position_wise_mlp_dim: int, + ): + """Initializes a TrXLNet object. + + Args: + num_transformer_units: The number of Transformer repeats to + use (denoted L in [2]). + attention_dim: The input and output dimensions of one + Transformer unit. + num_heads: The number of attention heads to use in parallel. + Denoted as `H` in [3]. + head_dim: The dimension of a single(!) attention head within + a multi-head attention unit. Denoted as `d` in [3]. + position_wise_mlp_dim: The dimension of the hidden layer + within the position-wise MLP (after the multi-head attention + block within one Transformer unit). This is the size of the + first of the two layers within the PositionwiseFeedforward. The + second layer always has size=`attention_dim`. 
+ """ + if log_once("trxl_net_tf"): + deprecation_warning( + old="rllib.models.tf.attention_net.TrXLNet", + ) + super().__init__( + observation_space, action_space, num_outputs, model_config, name + ) + + self.num_transformer_units = num_transformer_units + self.attention_dim = attention_dim + self.num_heads = num_heads + self.head_dim = head_dim + self.max_seq_len = model_config["max_seq_len"] + self.obs_dim = observation_space.shape[0] + + inputs = tf.keras.layers.Input( + shape=(self.max_seq_len, self.obs_dim), name="inputs" + ) + E_out = tf.keras.layers.Dense(attention_dim)(inputs) + + for _ in range(self.num_transformer_units): + MHA_out = SkipConnection( + RelativeMultiHeadAttention( + out_dim=attention_dim, + num_heads=num_heads, + head_dim=head_dim, + input_layernorm=False, + output_activation=None, + ), + fan_in_layer=None, + )(E_out) + E_out = SkipConnection( + PositionwiseFeedforward(attention_dim, position_wise_mlp_dim) + )(MHA_out) + E_out = tf.keras.layers.LayerNormalization(axis=-1)(E_out) + + # Postprocess TrXL output with another hidden layer and compute values. + logits = tf.keras.layers.Dense( + self.num_outputs, activation=tf.keras.activations.linear, name="logits" + )(E_out) + + self.base_model = tf.keras.models.Model([inputs], [logits]) + + @override(RecurrentNetwork) + def forward_rnn( + self, inputs: TensorType, state: List[TensorType], seq_lens: TensorType + ) -> (TensorType, List[TensorType]): + # To make Attention work with current RLlib's ModelV2 API: + # We assume `state` is the history of L recent observations (all + # concatenated into one tensor) and append the current inputs to the + # end and only keep the most recent (up to `max_seq_len`). This allows + # us to deal with timestep-wise inference and full sequence training + # within the same logic. 
+ observations = state[0] + observations = tf.concat((observations, inputs), axis=1)[:, -self.max_seq_len :] + logits = self.base_model([observations]) + T = tf.shape(inputs)[1] # Length of input segment (time). + logits = logits[:, -T:] + + return logits, [observations] + + @override(RecurrentNetwork) + def get_initial_state(self) -> List[np.ndarray]: + # State is the T last observations concat'd together into one Tensor. + # Plus all Transformer blocks' E(l) outputs concat'd together (up to + # tau timesteps). + return [np.zeros((self.max_seq_len, self.obs_dim), np.float32)] + + +class GTrXLNet(RecurrentNetwork): + """A GTrXL net Model described in [2]. + + This is still in an experimental phase. + Can be used as a drop-in replacement for LSTMs in PPO and IMPALA. + + To use this network as a replacement for an RNN, configure your Algorithm + as follows: + + Examples: + >> config["model"]["custom_model"] = GTrXLNet + >> config["model"]["max_seq_len"] = 10 + >> config["model"]["custom_model_config"] = { + >> num_transformer_units=1, + >> attention_dim=32, + >> num_heads=2, + >> memory_inference=100, + >> memory_training=50, + >> etc.. + >> } + """ + + def __init__( + self, + observation_space: gym.spaces.Space, + action_space: gym.spaces.Space, + num_outputs: Optional[int], + model_config: ModelConfigDict, + name: str, + *, + num_transformer_units: int = 1, + attention_dim: int = 64, + num_heads: int = 2, + memory_inference: int = 50, + memory_training: int = 50, + head_dim: int = 32, + position_wise_mlp_dim: int = 32, + init_gru_gate_bias: float = 2.0, + ): + """Initializes a GTrXLNet instance. + + Args: + num_transformer_units: The number of Transformer repeats to + use (denoted L in [2]). + attention_dim: The input and output dimensions of one + Transformer unit. + num_heads: The number of attention heads to use in parallel. + Denoted as `H` in [3]. 
+ memory_inference: The number of timesteps to concat (time + axis) and feed into the next transformer unit as inference + input. The first transformer unit will receive this number of + past observations (plus the current one), instead. + memory_training: The number of timesteps to concat (time + axis) and feed into the next transformer unit as training + input (plus the actual input sequence of len=max_seq_len). + The first transformer unit will receive this number of + past observations (plus the input sequence), instead. + head_dim: The dimension of a single(!) attention head within + a multi-head attention unit. Denoted as `d` in [3]. + position_wise_mlp_dim: The dimension of the hidden layer + within the position-wise MLP (after the multi-head attention + block within one Transformer unit). This is the size of the + first of the two layers within the PositionwiseFeedforward. The + second layer always has size=`attention_dim`. + init_gru_gate_bias: Initial bias values for the GRU gates + (two GRUs per Transformer unit, one after the MHA, one after + the position-wise MLP). + """ + super().__init__( + observation_space, action_space, num_outputs, model_config, name + ) + + self.num_transformer_units = num_transformer_units + self.attention_dim = attention_dim + self.num_heads = num_heads + self.memory_inference = memory_inference + self.memory_training = memory_training + self.head_dim = head_dim + self.max_seq_len = model_config["max_seq_len"] + self.obs_dim = observation_space.shape[0] + + # Raw observation input (plus (None) time axis). + input_layer = tf.keras.layers.Input(shape=(None, self.obs_dim), name="inputs") + memory_ins = [ + tf.keras.layers.Input( + shape=(None, self.attention_dim), + dtype=tf.float32, + name="memory_in_{}".format(i), + ) + for i in range(self.num_transformer_units) + ] + + # Map observation dim to input/output transformer (attention) dim. 
+ E_out = tf.keras.layers.Dense(self.attention_dim)(input_layer) + # Output, collected and concat'd to build the internal, tau-len + # Memory units used for additional contextual information. + memory_outs = [E_out] + + # 2) Create L Transformer blocks according to [2]. + for i in range(self.num_transformer_units): + # RelativeMultiHeadAttention part. + MHA_out = SkipConnection( + RelativeMultiHeadAttention( + out_dim=self.attention_dim, + num_heads=num_heads, + head_dim=head_dim, + input_layernorm=True, + output_activation=tf.nn.relu, + ), + fan_in_layer=GRUGate(init_gru_gate_bias), + name="mha_{}".format(i + 1), + )(E_out, memory=memory_ins[i]) + # Position-wise MLP part. + E_out = SkipConnection( + tf.keras.Sequential( + ( + tf.keras.layers.LayerNormalization(axis=-1), + PositionwiseFeedforward( + out_dim=self.attention_dim, + hidden_dim=position_wise_mlp_dim, + output_activation=tf.nn.relu, + ), + ) + ), + fan_in_layer=GRUGate(init_gru_gate_bias), + name="pos_wise_mlp_{}".format(i + 1), + )(MHA_out) + # Output of position-wise MLP == E(l-1), which is concat'd + # to the current Mem block (M(l-1)) to yield E~(l-1), which is then + # used by the next transformer block. + memory_outs.append(E_out) + + self._logits = None + self._value_out = None + + # Postprocess TrXL output with another hidden layer and compute values. + if num_outputs is not None: + self._logits = tf.keras.layers.Dense( + self.num_outputs, activation=None, name="logits" + )(E_out) + values_out = tf.keras.layers.Dense(1, activation=None, name="values")(E_out) + outs = [self._logits, values_out] + else: + outs = [E_out] + self.num_outputs = self.attention_dim + + self.trxl_model = tf.keras.Model( + inputs=[input_layer] + memory_ins, outputs=outs + memory_outs[:-1] + ) + + self.trxl_model.summary() + + # __sphinx_doc_begin__ + # Setup trajectory views (`memory-inference` x past memory outs). 
+ for i in range(self.num_transformer_units): + space = Box(-1.0, 1.0, shape=(self.attention_dim,)) + self.view_requirements["state_in_{}".format(i)] = ViewRequirement( + "state_out_{}".format(i), + shift="-{}:-1".format(self.memory_inference), + # Repeat the incoming state every max-seq-len times. + batch_repeat_value=self.max_seq_len, + space=space, + ) + self.view_requirements["state_out_{}".format(i)] = ViewRequirement( + space=space, used_for_training=False + ) + # __sphinx_doc_end__ + + @override(ModelV2) + def forward( + self, input_dict, state: List[TensorType], seq_lens: TensorType + ) -> (TensorType, List[TensorType]): + assert seq_lens is not None + + # Add the time dim to observations. + B = tf.shape(seq_lens)[0] + observations = input_dict[SampleBatch.OBS] + + shape = tf.shape(observations) + T = shape[0] // B + observations = tf.reshape(observations, tf.concat([[-1, T], shape[1:]], axis=0)) + + all_out = self.trxl_model([observations] + state) + + if self._logits is not None: + out = tf.reshape(all_out[0], [-1, self.num_outputs]) + self._value_out = all_out[1] + memory_outs = all_out[2:] + else: + out = tf.reshape(all_out[0], [-1, self.attention_dim]) + memory_outs = all_out[1:] + + return out, [tf.reshape(m, [-1, self.attention_dim]) for m in memory_outs] + + @override(RecurrentNetwork) + def get_initial_state(self) -> List[np.ndarray]: + return [ + tf.zeros(self.view_requirements["state_in_{}".format(i)].space.shape) + for i in range(self.num_transformer_units) + ] + + @override(ModelV2) + def value_function(self) -> TensorType: + return tf.reshape(self._value_out, [-1]) + + +class AttentionWrapper(TFModelV2): + """GTrXL wrapper serving as interface for ModelV2s that set use_attention.""" + + def __init__( + self, + obs_space: gym.spaces.Space, + action_space: gym.spaces.Space, + num_outputs: int, + model_config: ModelConfigDict, + name: str, + ): + if log_once("attention_wrapper_tf_deprecation"): + deprecation_warning( + 
old="ray.rllib.models.tf.attention_net.AttentionWrapper" + ) + super().__init__(obs_space, action_space, None, model_config, name) + + self.use_n_prev_actions = model_config["attention_use_n_prev_actions"] + self.use_n_prev_rewards = model_config["attention_use_n_prev_rewards"] + + self.action_space_struct = get_base_struct_from_space(self.action_space) + self.action_dim = 0 + + for space in tree.flatten(self.action_space_struct): + if isinstance(space, Discrete): + self.action_dim += space.n + elif isinstance(space, MultiDiscrete): + self.action_dim += np.sum(space.nvec) + elif space.shape is not None: + self.action_dim += int(np.prod(space.shape)) + else: + self.action_dim += int(len(space)) + + # Add prev-action/reward nodes to input to LSTM. + if self.use_n_prev_actions: + self.num_outputs += self.use_n_prev_actions * self.action_dim + if self.use_n_prev_rewards: + self.num_outputs += self.use_n_prev_rewards + + cfg = model_config + + self.attention_dim = cfg["attention_dim"] + + if self.num_outputs is not None: + in_space = gym.spaces.Box( + float("-inf"), float("inf"), shape=(self.num_outputs,), dtype=np.float32 + ) + else: + in_space = obs_space + + # Construct GTrXL sub-module w/ num_outputs=None (so it does not + # create a logits/value output; we'll do this ourselves in this wrapper + # here). + self.gtrxl = GTrXLNet( + in_space, + action_space, + None, + model_config, + "gtrxl", + num_transformer_units=cfg["attention_num_transformer_units"], + attention_dim=self.attention_dim, + num_heads=cfg["attention_num_heads"], + head_dim=cfg["attention_head_dim"], + memory_inference=cfg["attention_memory_inference"], + memory_training=cfg["attention_memory_training"], + position_wise_mlp_dim=cfg["attention_position_wise_mlp_dim"], + init_gru_gate_bias=cfg["attention_init_gru_gate_bias"], + ) + + # `self.num_outputs` right now is the number of nodes coming from the + # attention net. 
+ input_ = tf.keras.layers.Input(shape=(self.gtrxl.num_outputs,)) + + # Set final num_outputs to correct value (depending on action space). + self.num_outputs = num_outputs + + # Postprocess GTrXL output with another hidden layer and compute + # values. + out = tf.keras.layers.Dense(self.num_outputs, activation=None)(input_) + self._logits_branch = tf.keras.models.Model([input_], [out]) + + out = tf.keras.layers.Dense(1, activation=None)(input_) + self._value_branch = tf.keras.models.Model([input_], [out]) + + self.view_requirements = self.gtrxl.view_requirements + self.view_requirements["obs"].space = self.obs_space + + # Add prev-a/r to this model's view, if required. + if self.use_n_prev_actions: + self.view_requirements[SampleBatch.PREV_ACTIONS] = ViewRequirement( + SampleBatch.ACTIONS, + space=self.action_space, + shift="-{}:-1".format(self.use_n_prev_actions), + ) + if self.use_n_prev_rewards: + self.view_requirements[SampleBatch.PREV_REWARDS] = ViewRequirement( + SampleBatch.REWARDS, shift="-{}:-1".format(self.use_n_prev_rewards) + ) + + @override(RecurrentNetwork) + def forward( + self, + input_dict: Dict[str, TensorType], + state: List[TensorType], + seq_lens: TensorType, + ) -> (TensorType, List[TensorType]): + assert seq_lens is not None + # Push obs through "unwrapped" net's `forward()` first. + wrapped_out, _ = self._wrapped_forward(input_dict, [], None) + + # Concat. prev-action/reward if required. + prev_a_r = [] + + # Prev actions. + if self.use_n_prev_actions: + prev_n_actions = input_dict[SampleBatch.PREV_ACTIONS] + # If actions are not processed yet (in their original form as + # have been sent to environment): + # Flatten/one-hot into 1D array. + if self.model_config["_disable_action_flattening"]: + # Merge prev n actions into flat tensor. + flat = flatten_inputs_to_1d_tensor( + prev_n_actions, + spaces_struct=self.action_space_struct, + time_axis=True, + ) + # Fold time-axis into flattened data. 
+ flat = tf.reshape(flat, [tf.shape(flat)[0], -1]) + prev_a_r.append(flat) + # If actions are already flattened (but not one-hot'd yet!), + # one-hot discrete/multi-discrete actions here and concatenate the + # n most recent actions together. + else: + if isinstance(self.action_space, Discrete): + for i in range(self.use_n_prev_actions): + prev_a_r.append( + one_hot(prev_n_actions[:, i], self.action_space) + ) + elif isinstance(self.action_space, MultiDiscrete): + for i in range( + 0, self.use_n_prev_actions, self.action_space.shape[0] + ): + prev_a_r.append( + one_hot( + tf.cast( + prev_n_actions[ + :, i : i + self.action_space.shape[0] + ], + tf.float32, + ), + space=self.action_space, + ) + ) + else: + prev_a_r.append( + tf.reshape( + tf.cast(prev_n_actions, tf.float32), + [-1, self.use_n_prev_actions * self.action_dim], + ) + ) + # Prev rewards. + if self.use_n_prev_rewards: + prev_a_r.append( + tf.reshape( + tf.cast(input_dict[SampleBatch.PREV_REWARDS], tf.float32), + [-1, self.use_n_prev_rewards], + ) + ) + + # Concat prev. actions + rewards to the "main" input. + if prev_a_r: + wrapped_out = tf.concat([wrapped_out] + prev_a_r, axis=1) + + # Then through our GTrXL. + input_dict["obs_flat"] = input_dict["obs"] = wrapped_out + + self._features, memory_outs = self.gtrxl(input_dict, state, seq_lens) + model_out = self._logits_branch(self._features) + return model_out, memory_outs + + @override(ModelV2) + def value_function(self) -> TensorType: + assert self._features is not None, "Must call forward() first!" 
+ return tf.reshape(self._value_branch(self._features), [-1]) + + @override(ModelV2) + def get_initial_state(self) -> Union[List[np.ndarray], List[TensorType]]: + return [ + np.zeros(self.gtrxl.view_requirements["state_in_{}".format(i)].space.shape) + for i in range(self.gtrxl.num_transformer_units) + ] diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/complex_input_net.py b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/complex_input_net.py new file mode 100644 index 0000000000000000000000000000000000000000..d8c41be4067a1473107a39a1f4f7ec94d8a99f27 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/complex_input_net.py @@ -0,0 +1,214 @@ +from gymnasium.spaces import Box, Discrete, MultiDiscrete +import numpy as np +import tree # pip install dm_tree + +from ray.rllib.models.catalog import ModelCatalog +from ray.rllib.models.modelv2 import ModelV2, restore_original_dimensions +from ray.rllib.models.tf.misc import normc_initializer +from ray.rllib.models.tf.tf_modelv2 import TFModelV2 +from ray.rllib.models.utils import get_filter_config +from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.utils.annotations import OldAPIStack, override +from ray.rllib.utils.framework import try_import_tf +from ray.rllib.utils.spaces.space_utils import flatten_space +from ray.rllib.utils.tf_utils import one_hot + +tf1, tf, tfv = try_import_tf() + + +# __sphinx_doc_begin__ +@OldAPIStack +class ComplexInputNetwork(TFModelV2): + """TFModelV2 concat'ing CNN outputs to flat input(s), followed by FC(s). + + Note: This model should be used for complex (Dict or Tuple) observation + spaces that have one or more image components. + + The data flow is as follows: + + `obs` (e.g. Tuple[img0, img1, discrete0]) -> `CNN0 + CNN1 + ONE-HOT` + `CNN0 + CNN1 + ONE-HOT` -> concat all flat outputs -> `out` + `out` -> (optional) FC-stack -> `out2` + `out2` -> action (logits) and vaulue heads. 
+ """ + + def __init__(self, obs_space, action_space, num_outputs, model_config, name): + + self.original_space = ( + obs_space.original_space + if hasattr(obs_space, "original_space") + else obs_space + ) + + self.processed_obs_space = ( + self.original_space + if model_config.get("_disable_preprocessor_api") + else obs_space + ) + super().__init__( + self.original_space, action_space, num_outputs, model_config, name + ) + + self.flattened_input_space = flatten_space(self.original_space) + + # Build the CNN(s) given obs_space's image components. + self.cnns = {} + self.one_hot = {} + self.flatten_dims = {} + self.flatten = {} + concat_size = 0 + for i, component in enumerate(self.flattened_input_space): + # Image space. + if len(component.shape) == 3 and isinstance(component, Box): + config = { + "conv_filters": model_config["conv_filters"] + if "conv_filters" in model_config + else get_filter_config(component.shape), + "conv_activation": model_config.get("conv_activation"), + "post_fcnet_hiddens": [], + } + self.cnns[i] = ModelCatalog.get_model_v2( + component, + action_space, + num_outputs=None, + model_config=config, + framework="tf", + name="cnn_{}".format(i), + ) + concat_size += int(self.cnns[i].num_outputs) + # Discrete|MultiDiscrete inputs -> One-hot encode. + elif isinstance(component, (Discrete, MultiDiscrete)): + if isinstance(component, Discrete): + size = component.n + else: + size = np.sum(component.nvec) + config = { + "fcnet_hiddens": model_config["fcnet_hiddens"], + "fcnet_activation": model_config.get("fcnet_activation"), + "post_fcnet_hiddens": [], + } + self.one_hot[i] = ModelCatalog.get_model_v2( + Box(-1.0, 1.0, (size,), np.float32), + action_space, + num_outputs=None, + model_config=config, + framework="tf", + name="one_hot_{}".format(i), + ) + concat_size += int(self.one_hot[i].num_outputs) + # Everything else (1D Box). 
+ else: + size = int(np.prod(component.shape)) + config = { + "fcnet_hiddens": model_config["fcnet_hiddens"], + "fcnet_activation": model_config.get("fcnet_activation"), + "post_fcnet_hiddens": [], + } + self.flatten[i] = ModelCatalog.get_model_v2( + Box(-1.0, 1.0, (size,), np.float32), + action_space, + num_outputs=None, + model_config=config, + framework="tf", + name="flatten_{}".format(i), + ) + self.flatten_dims[i] = size + concat_size += int(self.flatten[i].num_outputs) + + # Optional post-concat FC-stack. + post_fc_stack_config = { + "fcnet_hiddens": model_config.get("post_fcnet_hiddens", []), + "fcnet_activation": model_config.get("post_fcnet_activation", "relu"), + } + self.post_fc_stack = ModelCatalog.get_model_v2( + Box(float("-inf"), float("inf"), shape=(concat_size,), dtype=np.float32), + self.action_space, + None, + post_fc_stack_config, + framework="tf", + name="post_fc_stack", + ) + + # Actions and value heads. + self.logits_and_value_model = None + self._value_out = None + if num_outputs: + # Action-distribution head. + concat_layer = tf.keras.layers.Input((self.post_fc_stack.num_outputs,)) + logits_layer = tf.keras.layers.Dense( + num_outputs, + activation=None, + kernel_initializer=normc_initializer(0.01), + name="logits", + )(concat_layer) + + # Create the value branch model. + value_layer = tf.keras.layers.Dense( + 1, + activation=None, + kernel_initializer=normc_initializer(0.01), + name="value_out", + )(concat_layer) + self.logits_and_value_model = tf.keras.models.Model( + concat_layer, [logits_layer, value_layer] + ) + else: + self.num_outputs = self.post_fc_stack.num_outputs + + @override(ModelV2) + def forward(self, input_dict, state, seq_lens): + if SampleBatch.OBS in input_dict and "obs_flat" in input_dict: + orig_obs = input_dict[SampleBatch.OBS] + else: + orig_obs = restore_original_dimensions( + input_dict[SampleBatch.OBS], self.processed_obs_space, tensorlib="tf" + ) + # Push image observations through our CNNs. 
+ outs = [] + for i, component in enumerate(tree.flatten(orig_obs)): + if i in self.cnns: + cnn_out, _ = self.cnns[i](SampleBatch({SampleBatch.OBS: component})) + outs.append(cnn_out) + elif i in self.one_hot: + if "int" in component.dtype.name: + one_hot_in = { + SampleBatch.OBS: one_hot( + component, self.flattened_input_space[i] + ) + } + else: + one_hot_in = {SampleBatch.OBS: component} + one_hot_out, _ = self.one_hot[i](SampleBatch(one_hot_in)) + outs.append(one_hot_out) + else: + nn_out, _ = self.flatten[i]( + SampleBatch( + { + SampleBatch.OBS: tf.cast( + tf.reshape(component, [-1, self.flatten_dims[i]]), + tf.float32, + ) + } + ) + ) + outs.append(nn_out) + # Concat all outputs and the non-image inputs. + out = tf.concat(outs, axis=1) + # Push through (optional) FC-stack (this may be an empty stack). + out, _ = self.post_fc_stack(SampleBatch({SampleBatch.OBS: out})) + + # No logits/value branches. + if not self.logits_and_value_model: + return out, [] + + # Logits- and value branches. 
+ logits, values = self.logits_and_value_model(out) + self._value_out = tf.reshape(values, [-1]) + return logits, [] + + @override(ModelV2) + def value_function(self): + return self._value_out + + +# __sphinx_doc_end__ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/fcnet.py b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/fcnet.py new file mode 100644 index 0000000000000000000000000000000000000000..56a09de0361acf371f5d6b8ab60fc7d2790565c7 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/fcnet.py @@ -0,0 +1,148 @@ +import numpy as np +import gymnasium as gym +from typing import Dict + +from ray.rllib.models.tf.misc import normc_initializer +from ray.rllib.models.tf.tf_modelv2 import TFModelV2 +from ray.rllib.models.utils import get_activation_fn +from ray.rllib.utils.annotations import OldAPIStack +from ray.rllib.utils.framework import try_import_tf +from ray.rllib.utils.typing import TensorType, List, ModelConfigDict + +tf1, tf, tfv = try_import_tf() + + +@OldAPIStack +class FullyConnectedNetwork(TFModelV2): + """Generic fully connected network implemented in ModelV2 API.""" + + def __init__( + self, + obs_space: gym.spaces.Space, + action_space: gym.spaces.Space, + num_outputs: int, + model_config: ModelConfigDict, + name: str, + ): + super(FullyConnectedNetwork, self).__init__( + obs_space, action_space, num_outputs, model_config, name + ) + + hiddens = list(model_config.get("fcnet_hiddens", [])) + list( + model_config.get("post_fcnet_hiddens", []) + ) + activation = model_config.get("fcnet_activation") + if not model_config.get("fcnet_hiddens", []): + activation = model_config.get("post_fcnet_activation") + activation = get_activation_fn(activation) + no_final_linear = model_config.get("no_final_linear") + vf_share_layers = model_config.get("vf_share_layers") + free_log_std = model_config.get("free_log_std") + + # Generate free-floating bias variables for the second half of + # the outputs. 
+ if free_log_std: + assert num_outputs % 2 == 0, ( + "num_outputs must be divisible by two", + num_outputs, + ) + num_outputs = num_outputs // 2 + self.log_std_var = tf.Variable( + [0.0] * num_outputs, dtype=tf.float32, name="log_std" + ) + + # We are using obs_flat, so take the flattened shape as input. + inputs = tf.keras.layers.Input( + shape=(int(np.prod(obs_space.shape)),), name="observations" + ) + # Last hidden layer output (before logits outputs). + last_layer = inputs + # The action distribution outputs. + logits_out = None + i = 1 + + # Create layers 0 to second-last. + for size in hiddens[:-1]: + last_layer = tf.keras.layers.Dense( + size, + name="fc_{}".format(i), + activation=activation, + kernel_initializer=normc_initializer(1.0), + )(last_layer) + i += 1 + + # The last layer is adjusted to be of size num_outputs, but it's a + # layer with activation. + if no_final_linear and num_outputs: + logits_out = tf.keras.layers.Dense( + num_outputs, + name="fc_out", + activation=activation, + kernel_initializer=normc_initializer(1.0), + )(last_layer) + # Finish the layers with the provided sizes (`hiddens`), plus - + # iff num_outputs > 0 - a last linear layer of size num_outputs. + else: + if len(hiddens) > 0: + last_layer = tf.keras.layers.Dense( + hiddens[-1], + name="fc_{}".format(i), + activation=activation, + kernel_initializer=normc_initializer(1.0), + )(last_layer) + if num_outputs: + logits_out = tf.keras.layers.Dense( + num_outputs, + name="fc_out", + activation=None, + kernel_initializer=normc_initializer(0.01), + )(last_layer) + # Adjust num_outputs to be the number of nodes in the last layer. + else: + self.num_outputs = ([int(np.prod(obs_space.shape))] + hiddens[-1:])[-1] + + # Concat the log std vars to the end of the state-dependent means. 
+ if free_log_std and logits_out is not None: + + def tiled_log_std(x): + return tf.tile(tf.expand_dims(self.log_std_var, 0), [tf.shape(x)[0], 1]) + + log_std_out = tf.keras.layers.Lambda(tiled_log_std)(inputs) + logits_out = tf.keras.layers.Concatenate(axis=1)([logits_out, log_std_out]) + + last_vf_layer = None + if not vf_share_layers: + # Build a parallel set of hidden layers for the value net. + last_vf_layer = inputs + i = 1 + for size in hiddens: + last_vf_layer = tf.keras.layers.Dense( + size, + name="fc_value_{}".format(i), + activation=activation, + kernel_initializer=normc_initializer(1.0), + )(last_vf_layer) + i += 1 + + value_out = tf.keras.layers.Dense( + 1, + name="value_out", + activation=None, + kernel_initializer=normc_initializer(0.01), + )(last_vf_layer if last_vf_layer is not None else last_layer) + + self.base_model = tf.keras.Model( + inputs, [(logits_out if logits_out is not None else last_layer), value_out] + ) + + def forward( + self, + input_dict: Dict[str, TensorType], + state: List[TensorType], + seq_lens: TensorType, + ) -> (TensorType, List[TensorType]): + model_out, self._value_out = self.base_model(input_dict["obs_flat"]) + return model_out, state + + def value_function(self) -> TensorType: + return tf.reshape(self._value_out, [-1]) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/misc.py b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/misc.py new file mode 100644 index 0000000000000000000000000000000000000000..7ea75e423c2d66756b9e899faa2c6487dd57cab4 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/misc.py @@ -0,0 +1,90 @@ +import numpy as np +from typing import Tuple, Any, Optional + +from ray.rllib.utils.annotations import DeveloperAPI +from ray.rllib.utils.framework import try_import_tf +from ray.rllib.utils.typing import TensorType + +tf1, tf, tfv = try_import_tf() + + +# TODO: (sven) obsolete this class. 
+@DeveloperAPI +def normc_initializer(std: float = 1.0) -> Any: + def _initializer(shape, dtype=None, partition_info=None): + out = np.random.randn(*shape).astype( + dtype.name if hasattr(dtype, "name") else dtype or np.float32 + ) + out *= std / np.sqrt(np.square(out).sum(axis=0, keepdims=True)) + return tf.constant(out) + + return _initializer + + +@DeveloperAPI +def conv2d( + x: TensorType, + num_filters: int, + name: str, + filter_size: Tuple[int, int] = (3, 3), + stride: Tuple[int, int] = (1, 1), + pad: str = "SAME", + dtype: Optional[Any] = None, + collections: Optional[Any] = None, +) -> TensorType: + + if dtype is None: + dtype = tf.float32 + + with tf1.variable_scope(name): + stride_shape = [1, stride[0], stride[1], 1] + filter_shape = [ + filter_size[0], + filter_size[1], + int(x.get_shape()[3]), + num_filters, + ] + + # There are "num input feature maps * filter height * filter width" + # inputs to each hidden unit. + fan_in = np.prod(filter_shape[:3]) + # Each unit in the lower layer receives a gradient from: "num output + # feature maps * filter height * filter width" / pooling size. + fan_out = np.prod(filter_shape[:2]) * num_filters + # Initialize weights with random weights. 
+ w_bound = np.sqrt(6 / (fan_in + fan_out)) + + w = tf1.get_variable( + "W", + filter_shape, + dtype, + tf1.random_uniform_initializer(-w_bound, w_bound), + collections=collections, + ) + b = tf1.get_variable( + "b", + [1, 1, 1, num_filters], + initializer=tf1.constant_initializer(0.0), + collections=collections, + ) + return tf1.nn.conv2d(x, w, stride_shape, pad) + b + + +@DeveloperAPI +def linear( + x: TensorType, + size: int, + name: str, + initializer: Optional[Any] = None, + bias_init: float = 0.0, +) -> TensorType: + w = tf1.get_variable(name + "/w", [x.get_shape()[1], size], initializer=initializer) + b = tf1.get_variable( + name + "/b", [size], initializer=tf1.constant_initializer(bias_init) + ) + return tf.matmul(x, w) + b + + +@DeveloperAPI +def flatten(x: TensorType) -> TensorType: + return tf.reshape(x, [-1, np.prod(x.get_shape().as_list()[1:])]) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/recurrent_net.py b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/recurrent_net.py new file mode 100644 index 0000000000000000000000000000000000000000..2010d4a901188a53cec5d2766e865392c1d7f9d0 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/recurrent_net.py @@ -0,0 +1,292 @@ +import numpy as np +import gymnasium as gym +from gymnasium.spaces import Discrete, MultiDiscrete +import logging +import tree # pip install dm_tree +from typing import Dict, List, Tuple + +from ray.rllib.models.modelv2 import ModelV2 +from ray.rllib.models.tf.tf_modelv2 import TFModelV2 +from ray.rllib.policy.rnn_sequencing import add_time_dimension +from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.policy.view_requirement import ViewRequirement +from ray.rllib.utils.annotations import OldAPIStack, override +from ray.rllib.utils.framework import try_import_tf +from ray.rllib.utils.spaces.space_utils import get_base_struct_from_space +from ray.rllib.utils.tf_utils import flatten_inputs_to_1d_tensor, one_hot +from 
ray.rllib.utils.typing import ModelConfigDict, TensorType +from ray.rllib.utils.deprecation import deprecation_warning +from ray.util.debug import log_once + +tf1, tf, tfv = try_import_tf() +logger = logging.getLogger(__name__) + + +@OldAPIStack +class RecurrentNetwork(TFModelV2): + """Helper class to simplify implementing RNN models with TFModelV2. + + Instead of implementing forward(), you can implement forward_rnn() which + takes batches with the time dimension added already. + + Here is an example implementation for a subclass + ``MyRNNClass(RecurrentNetwork)``:: + + def __init__(self, *args, **kwargs): + super(MyModelClass, self).__init__(*args, **kwargs) + cell_size = 256 + + # Define input layers + input_layer = tf.keras.layers.Input( + shape=(None, obs_space.shape[0])) + state_in_h = tf.keras.layers.Input(shape=(256, )) + state_in_c = tf.keras.layers.Input(shape=(256, )) + seq_in = tf.keras.layers.Input(shape=(), dtype=tf.int32) + + # Send to LSTM cell + lstm_out, state_h, state_c = tf.keras.layers.LSTM( + cell_size, return_sequences=True, return_state=True, + name="lstm")( + inputs=input_layer, + mask=tf.sequence_mask(seq_in), + initial_state=[state_in_h, state_in_c]) + output_layer = tf.keras.layers.Dense(...)(lstm_out) + + # Create the RNN model + self.rnn_model = tf.keras.Model( + inputs=[input_layer, seq_in, state_in_h, state_in_c], + outputs=[output_layer, state_h, state_c]) + self.rnn_model.summary() + """ + + @override(ModelV2) + def forward( + self, + input_dict: Dict[str, TensorType], + state: List[TensorType], + seq_lens: TensorType, + ) -> Tuple[TensorType, List[TensorType]]: + """Adds time dimension to batch before sending inputs to forward_rnn(). + + You should implement forward_rnn() in your subclass.""" + # Creating a __init__ function that acts as a passthrough and adding the warning + # there led to errors probably due to the multiple inheritance. We encountered + # the same error if we add the Deprecated decorator. 
We therefore add the + # deprecation warning here. + if log_once("recurrent_network_tf"): + deprecation_warning( + old="ray.rllib.models.tf.recurrent_net.RecurrentNetwork" + ) + assert seq_lens is not None + flat_inputs = input_dict["obs_flat"] + inputs = add_time_dimension( + padded_inputs=flat_inputs, seq_lens=seq_lens, framework="tf" + ) + output, new_state = self.forward_rnn( + inputs, + state, + seq_lens, + ) + return tf.reshape(output, [-1, self.num_outputs]), new_state + + def forward_rnn( + self, inputs: TensorType, state: List[TensorType], seq_lens: TensorType + ) -> Tuple[TensorType, List[TensorType]]: + """Call the model with the given input tensors and state. + + Args: + inputs: observation tensor with shape [B, T, obs_size]. + state: list of state tensors, each with shape [B, T, size]. + seq_lens: 1d tensor holding input sequence lengths. + + Returns: + (outputs, new_state): The model output tensor of shape + [B, T, num_outputs] and the list of new state tensors each with + shape [B, size]. + + Sample implementation for the ``MyRNNClass`` example:: + + def forward_rnn(self, inputs, state, seq_lens): + model_out, h, c = self.rnn_model([inputs, seq_lens] + state) + return model_out, [h, c] + """ + raise NotImplementedError("You must implement this for a RNN model") + + def get_initial_state(self) -> List[TensorType]: + """Get the initial recurrent state values for the model. 
+ + Returns: + list of np.array objects, if any + + Sample implementation for the ``MyRNNClass`` example:: + + def get_initial_state(self): + return [ + np.zeros(self.cell_size, np.float32), + np.zeros(self.cell_size, np.float32), + ] + """ + raise NotImplementedError("You must implement this for a RNN model") + + +@OldAPIStack +class LSTMWrapper(RecurrentNetwork): + """An LSTM wrapper serving as an interface for ModelV2s that set use_lstm.""" + + def __init__( + self, + obs_space: gym.spaces.Space, + action_space: gym.spaces.Space, + num_outputs: int, + model_config: ModelConfigDict, + name: str, + ): + super(LSTMWrapper, self).__init__( + obs_space, action_space, None, model_config, name + ) + # At this point, self.num_outputs is the number of nodes coming + # from the wrapped (underlying) model. In other words, self.num_outputs + # is the input size for the LSTM layer. + # If None, set it to the observation space. + if self.num_outputs is None: + self.num_outputs = int(np.prod(self.obs_space.shape)) + + self.cell_size = model_config["lstm_cell_size"] + self.use_prev_action = model_config["lstm_use_prev_action"] + self.use_prev_reward = model_config["lstm_use_prev_reward"] + + self.action_space_struct = get_base_struct_from_space(self.action_space) + self.action_dim = 0 + + for space in tree.flatten(self.action_space_struct): + if isinstance(space, Discrete): + self.action_dim += space.n + elif isinstance(space, MultiDiscrete): + self.action_dim += np.sum(space.nvec) + elif space.shape is not None: + self.action_dim += int(np.prod(space.shape)) + else: + self.action_dim += int(len(space)) + + # Add prev-action/reward nodes to input to LSTM. + if self.use_prev_action: + self.num_outputs += self.action_dim + if self.use_prev_reward: + self.num_outputs += 1 + + # Define input layers. 
+ input_layer = tf.keras.layers.Input( + shape=(None, self.num_outputs), name="inputs" + ) + + # Set self.num_outputs to the number of output nodes desired by the + # caller of this constructor. + self.num_outputs = num_outputs + + state_in_h = tf.keras.layers.Input(shape=(self.cell_size,), name="h") + state_in_c = tf.keras.layers.Input(shape=(self.cell_size,), name="c") + seq_in = tf.keras.layers.Input(shape=(), name="seq_in", dtype=tf.int32) + + # Preprocess observation with a hidden layer and send to LSTM cell + lstm_out, state_h, state_c = tf.keras.layers.LSTM( + self.cell_size, return_sequences=True, return_state=True, name="lstm" + )( + inputs=input_layer, + mask=tf.sequence_mask(seq_in), + initial_state=[state_in_h, state_in_c], + ) + + # Postprocess LSTM output with another hidden layer and compute values + logits = tf.keras.layers.Dense( + self.num_outputs, activation=tf.keras.activations.linear, name="logits" + )(lstm_out) + values = tf.keras.layers.Dense(1, activation=None, name="values")(lstm_out) + + # Create the RNN model + self._rnn_model = tf.keras.Model( + inputs=[input_layer, seq_in, state_in_h, state_in_c], + outputs=[logits, values, state_h, state_c], + ) + # Print out model summary in INFO logging mode. + if logger.isEnabledFor(logging.INFO): + self._rnn_model.summary() + + # Add prev-a/r to this model's view, if required. + if model_config["lstm_use_prev_action"]: + self.view_requirements[SampleBatch.PREV_ACTIONS] = ViewRequirement( + SampleBatch.ACTIONS, space=self.action_space, shift=-1 + ) + if model_config["lstm_use_prev_reward"]: + self.view_requirements[SampleBatch.PREV_REWARDS] = ViewRequirement( + SampleBatch.REWARDS, shift=-1 + ) + + @override(RecurrentNetwork) + def forward( + self, + input_dict: Dict[str, TensorType], + state: List[TensorType], + seq_lens: TensorType, + ) -> Tuple[TensorType, List[TensorType]]: + assert seq_lens is not None + # Push obs through "unwrapped" net's `forward()` first. 
+ wrapped_out, _ = self._wrapped_forward(input_dict, [], None) + + # Concat. prev-action/reward if required. + prev_a_r = [] + + # Prev actions. + if self.model_config["lstm_use_prev_action"]: + prev_a = input_dict[SampleBatch.PREV_ACTIONS] + # If actions are not processed yet (in their original form as + # have been sent to environment): + # Flatten/one-hot into 1D array. + if self.model_config["_disable_action_flattening"]: + prev_a_r.append( + flatten_inputs_to_1d_tensor( + prev_a, + spaces_struct=self.action_space_struct, + time_axis=False, + ) + ) + # If actions are already flattened (but not one-hot'd yet!), + # one-hot discrete/multi-discrete actions here. + else: + if isinstance(self.action_space, (Discrete, MultiDiscrete)): + prev_a = one_hot(prev_a, self.action_space) + prev_a_r.append( + tf.reshape(tf.cast(prev_a, tf.float32), [-1, self.action_dim]) + ) + # Prev rewards. + if self.model_config["lstm_use_prev_reward"]: + prev_a_r.append( + tf.reshape( + tf.cast(input_dict[SampleBatch.PREV_REWARDS], tf.float32), [-1, 1] + ) + ) + + # Concat prev. actions + rewards to the "main" input. + if prev_a_r: + wrapped_out = tf.concat([wrapped_out] + prev_a_r, axis=1) + + # Push everything through our LSTM. 
+ input_dict["obs_flat"] = wrapped_out + return super().forward(input_dict, state, seq_lens) + + @override(RecurrentNetwork) + def forward_rnn( + self, inputs: TensorType, state: List[TensorType], seq_lens: TensorType + ) -> Tuple[TensorType, List[TensorType]]: + model_out, self._value_out, h, c = self._rnn_model([inputs, seq_lens] + state) + return model_out, [h, c] + + @override(ModelV2) + def get_initial_state(self) -> List[np.ndarray]: + return [ + np.zeros(self.cell_size, np.float32), + np.zeros(self.cell_size, np.float32), + ] + + @override(ModelV2) + def value_function(self) -> TensorType: + return tf.reshape(self._value_out, [-1]) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/tf_action_dist.py b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/tf_action_dist.py new file mode 100644 index 0000000000000000000000000000000000000000..683d1939776d3f346ec5f52e97ca5b95e049cdb6 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/tf_action_dist.py @@ -0,0 +1,735 @@ +import functools +import gymnasium as gym +from math import log +import numpy as np +import tree # pip install dm_tree +from typing import Optional + +from ray.rllib.models.action_dist import ActionDistribution +from ray.rllib.models.modelv2 import ModelV2 +from ray.rllib.utils import MIN_LOG_NN_OUTPUT, MAX_LOG_NN_OUTPUT, SMALL_NUMBER +from ray.rllib.utils.annotations import OldAPIStack, override +from ray.rllib.utils.framework import try_import_tf, try_import_tfp +from ray.rllib.utils.spaces.space_utils import get_base_struct_from_space +from ray.rllib.utils.typing import TensorType, List, Union, Tuple, ModelConfigDict + +tf1, tf, tfv = try_import_tf() +tfp = try_import_tfp() + + +@OldAPIStack +class TFActionDistribution(ActionDistribution): + """TF-specific extensions for building action distributions.""" + + @override(ActionDistribution) + def __init__(self, inputs: List[TensorType], model: ModelV2): + super().__init__(inputs, model) + self.sample_op = 
self._build_sample_op() + self.sampled_action_logp_op = self.logp(self.sample_op) + + def _build_sample_op(self) -> TensorType: + """Implement this instead of sample(), to enable op reuse. + + This is needed since the sample op is non-deterministic and is shared + between sample() and sampled_action_logp(). + """ + raise NotImplementedError + + @override(ActionDistribution) + def sample(self) -> TensorType: + """Draw a sample from the action distribution.""" + return self.sample_op + + @override(ActionDistribution) + def sampled_action_logp(self) -> TensorType: + """Returns the log probability of the sampled action.""" + return self.sampled_action_logp_op + + +@OldAPIStack +class Categorical(TFActionDistribution): + """Categorical distribution for discrete action spaces.""" + + def __init__( + self, inputs: List[TensorType], model: ModelV2 = None, temperature: float = 1.0 + ): + assert temperature > 0.0, "Categorical `temperature` must be > 0.0!" + # Allow softmax formula w/ temperature != 1.0: + # Divide inputs by temperature. 
+ super().__init__(inputs / temperature, model) + + @override(ActionDistribution) + def deterministic_sample(self) -> TensorType: + return tf.math.argmax(self.inputs, axis=1) + + @override(ActionDistribution) + def logp(self, x: TensorType) -> TensorType: + return -tf.nn.sparse_softmax_cross_entropy_with_logits( + logits=self.inputs, labels=tf.cast(x, tf.int32) + ) + + @override(ActionDistribution) + def entropy(self) -> TensorType: + a0 = self.inputs - tf.reduce_max(self.inputs, axis=1, keepdims=True) + ea0 = tf.exp(a0) + z0 = tf.reduce_sum(ea0, axis=1, keepdims=True) + p0 = ea0 / z0 + return tf.reduce_sum(p0 * (tf.math.log(z0) - a0), axis=1) + + @override(ActionDistribution) + def kl(self, other: ActionDistribution) -> TensorType: + a0 = self.inputs - tf.reduce_max(self.inputs, axis=1, keepdims=True) + a1 = other.inputs - tf.reduce_max(other.inputs, axis=1, keepdims=True) + ea0 = tf.exp(a0) + ea1 = tf.exp(a1) + z0 = tf.reduce_sum(ea0, axis=1, keepdims=True) + z1 = tf.reduce_sum(ea1, axis=1, keepdims=True) + p0 = ea0 / z0 + return tf.reduce_sum(p0 * (a0 - tf.math.log(z0) - a1 + tf.math.log(z1)), axis=1) + + @override(TFActionDistribution) + def _build_sample_op(self) -> TensorType: + return tf.squeeze(tf.random.categorical(self.inputs, 1), axis=1) + + @staticmethod + @override(ActionDistribution) + def required_model_output_shape(action_space, model_config): + return action_space.n + + +@OldAPIStack +def get_categorical_class_with_temperature(t: float): + """Categorical distribution class that has customized default temperature.""" + + class CategoricalWithTemperature(Categorical): + def __init__(self, inputs, model=None, temperature=t): + super().__init__(inputs, model, temperature) + + return CategoricalWithTemperature + + +@OldAPIStack +class MultiCategorical(TFActionDistribution): + """MultiCategorical distribution for MultiDiscrete action spaces.""" + + def __init__( + self, + inputs: List[TensorType], + model: ModelV2, + input_lens: Union[List[int], 
np.ndarray, Tuple[int, ...]], + action_space=None, + ): + # skip TFActionDistribution init + ActionDistribution.__init__(self, inputs, model) + self.cats = [ + Categorical(input_, model) + for input_ in tf.split(inputs, input_lens, axis=1) + ] + self.action_space = action_space + if self.action_space is None: + self.action_space = gym.spaces.MultiDiscrete( + [c.inputs.shape[1] for c in self.cats] + ) + self.sample_op = self._build_sample_op() + self.sampled_action_logp_op = self.logp(self.sample_op) + + @override(ActionDistribution) + def deterministic_sample(self) -> TensorType: + sample_ = tf.stack([cat.deterministic_sample() for cat in self.cats], axis=1) + if isinstance(self.action_space, gym.spaces.Box): + return tf.cast( + tf.reshape(sample_, [-1] + list(self.action_space.shape)), + self.action_space.dtype, + ) + return sample_ + + @override(ActionDistribution) + def logp(self, actions: TensorType) -> TensorType: + # If tensor is provided, unstack it into list. + if isinstance(actions, tf.Tensor): + if isinstance(self.action_space, gym.spaces.Box): + actions = tf.reshape( + actions, [-1, int(np.prod(self.action_space.shape))] + ) + elif isinstance(self.action_space, gym.spaces.MultiDiscrete): + actions.set_shape((None, len(self.cats))) + actions = tf.unstack(tf.cast(actions, tf.int32), axis=1) + logps = tf.stack([cat.logp(act) for cat, act in zip(self.cats, actions)]) + return tf.reduce_sum(logps, axis=0) + + @override(ActionDistribution) + def multi_entropy(self) -> TensorType: + return tf.stack([cat.entropy() for cat in self.cats], axis=1) + + @override(ActionDistribution) + def entropy(self) -> TensorType: + return tf.reduce_sum(self.multi_entropy(), axis=1) + + @override(ActionDistribution) + def multi_kl(self, other: ActionDistribution) -> TensorType: + return tf.stack( + [cat.kl(oth_cat) for cat, oth_cat in zip(self.cats, other.cats)], axis=1 + ) + + @override(ActionDistribution) + def kl(self, other: ActionDistribution) -> TensorType: + return 
tf.reduce_sum(self.multi_kl(other), axis=1) + + @override(TFActionDistribution) + def _build_sample_op(self) -> TensorType: + sample_op = tf.stack([cat.sample() for cat in self.cats], axis=1) + if isinstance(self.action_space, gym.spaces.Box): + return tf.cast( + tf.reshape(sample_op, [-1] + list(self.action_space.shape)), + dtype=self.action_space.dtype, + ) + return sample_op + + @staticmethod + @override(ActionDistribution) + def required_model_output_shape( + action_space: gym.Space, model_config: ModelConfigDict + ) -> Union[int, np.ndarray]: + # Int Box. + if isinstance(action_space, gym.spaces.Box): + assert action_space.dtype.name.startswith("int") + low_ = np.min(action_space.low) + high_ = np.max(action_space.high) + assert np.all(action_space.low == low_) + assert np.all(action_space.high == high_) + return np.prod(action_space.shape, dtype=np.int32) * (high_ - low_ + 1) + # MultiDiscrete space. + else: + # nvec is already integer, so no casting needed. + return np.sum(action_space.nvec) + + +@OldAPIStack +class SlateMultiCategorical(Categorical): + """MultiCategorical distribution for MultiDiscrete action spaces. + + The action space must be uniform, meaning all nvec items have the same size, e.g. + MultiDiscrete([10, 10, 10]), where 10 is the number of candidates to pick from + and 3 is the slate size (pick 3 out of 10). When picking candidates, no candidate + must be picked more than once. + """ + + def __init__( + self, + inputs: List[TensorType], + model: ModelV2 = None, + temperature: float = 1.0, + action_space: Optional[gym.spaces.MultiDiscrete] = None, + all_slates=None, + ): + assert temperature > 0.0, "Categorical `temperature` must be > 0.0!" + # Allow softmax formula w/ temperature != 1.0: + # Divide inputs by temperature. + super().__init__(inputs / temperature, model) + self.action_space = action_space + # Assert uniformness of the action space (all discrete buckets have the same + # size). 
+ assert isinstance(self.action_space, gym.spaces.MultiDiscrete) and all( + n == self.action_space.nvec[0] for n in self.action_space.nvec + ) + self.all_slates = all_slates + + @override(ActionDistribution) + def deterministic_sample(self) -> TensorType: + # Get a sample from the underlying Categorical (batch of ints). + sample = super().deterministic_sample() + # Use the sampled ints to pick the actual slates. + return tf.gather(self.all_slates, sample) + + @override(ActionDistribution) + def logp(self, x: TensorType) -> TensorType: + # TODO: Implement. + return tf.ones_like(self.inputs[:, 0]) + + +@OldAPIStack +class GumbelSoftmax(TFActionDistribution): + """GumbelSoftmax distr. (for differentiable sampling in discr. actions + + The Gumbel Softmax distribution [1] (also known as the Concrete [2] + distribution) is a close cousin of the relaxed one-hot categorical + distribution, whose tfp implementation we will use here plus + adjusted `sample_...` and `log_prob` methods. See discussion at [0]. + + [0] https://stackoverflow.com/questions/56226133/ + soft-actor-critic-with-discrete-action-space + + [1] Categorical Reparametrization with Gumbel-Softmax (Jang et al, 2017): + https://arxiv.org/abs/1611.01144 + [2] The Concrete Distribution: A Continuous Relaxation of Discrete Random + Variables (Maddison et al, 2017) https://arxiv.org/abs/1611.00712 + """ + + def __init__( + self, inputs: List[TensorType], model: ModelV2 = None, temperature: float = 1.0 + ): + """Initializes a GumbelSoftmax distribution. + + Args: + temperature: Temperature parameter. For low temperatures, + the expected value approaches a categorical random variable. + For high temperatures, the expected value approaches a uniform + distribution. 
+ """ + assert temperature >= 0.0 + self.dist = tfp.distributions.RelaxedOneHotCategorical( + temperature=temperature, logits=inputs + ) + self.probs = tf.nn.softmax(self.dist._distribution.logits) + super().__init__(inputs, model) + + @override(ActionDistribution) + def deterministic_sample(self) -> TensorType: + # Return the dist object's prob values. + return self.probs + + @override(ActionDistribution) + def logp(self, x: TensorType) -> TensorType: + # Override since the implementation of tfp.RelaxedOneHotCategorical + # yields positive values. + if x.shape != self.dist.logits.shape: + values = tf.one_hot( + x, self.dist.logits.shape.as_list()[-1], dtype=tf.float32 + ) + assert values.shape == self.dist.logits.shape, ( + values.shape, + self.dist.logits.shape, + ) + + # [0]'s implementation (see line below) seems to be an approximation + # to the actual Gumbel Softmax density. + return -tf.reduce_sum( + -x * tf.nn.log_softmax(self.dist.logits, axis=-1), axis=-1 + ) + + @override(TFActionDistribution) + def _build_sample_op(self) -> TensorType: + return self.dist.sample() + + @staticmethod + @override(ActionDistribution) + def required_model_output_shape( + action_space: gym.Space, model_config: ModelConfigDict + ) -> Union[int, np.ndarray]: + return action_space.n + + +@OldAPIStack +class DiagGaussian(TFActionDistribution): + """Action distribution where each vector element is a gaussian. + + The first half of the input vector defines the gaussian means, and the + second half the gaussian standard deviations. 
+ """ + + def __init__( + self, + inputs: List[TensorType], + model: ModelV2, + *, + action_space: Optional[gym.spaces.Space] = None + ): + mean, log_std = tf.split(inputs, 2, axis=1) + self.mean = mean + self.log_std = log_std + self.std = tf.exp(log_std) + # Remember to squeeze action samples in case action space is Box(shape) + self.zero_action_dim = action_space and action_space.shape == () + super().__init__(inputs, model) + + @override(ActionDistribution) + def deterministic_sample(self) -> TensorType: + return self.mean + + @override(ActionDistribution) + def logp(self, x: TensorType) -> TensorType: + # Cover case where action space is Box(shape=()). + if int(tf.shape(x).shape[0]) == 1: + x = tf.expand_dims(x, axis=1) + return ( + -0.5 + * tf.reduce_sum( + tf.math.square((tf.cast(x, tf.float32) - self.mean) / self.std), axis=1 + ) + - 0.5 * np.log(2.0 * np.pi) * tf.cast(tf.shape(x)[1], tf.float32) + - tf.reduce_sum(self.log_std, axis=1) + ) + + @override(ActionDistribution) + def kl(self, other: ActionDistribution) -> TensorType: + assert isinstance(other, DiagGaussian) + return tf.reduce_sum( + other.log_std + - self.log_std + + (tf.math.square(self.std) + tf.math.square(self.mean - other.mean)) + / (2.0 * tf.math.square(other.std)) + - 0.5, + axis=1, + ) + + @override(ActionDistribution) + def entropy(self) -> TensorType: + return tf.reduce_sum(self.log_std + 0.5 * np.log(2.0 * np.pi * np.e), axis=1) + + @override(TFActionDistribution) + def _build_sample_op(self) -> TensorType: + sample = self.mean + self.std * tf.random.normal(tf.shape(self.mean)) + if self.zero_action_dim: + return tf.squeeze(sample, axis=-1) + return sample + + @staticmethod + @override(ActionDistribution) + def required_model_output_shape( + action_space: gym.Space, model_config: ModelConfigDict + ) -> Union[int, np.ndarray]: + return np.prod(action_space.shape, dtype=np.int32) * 2 + + +@OldAPIStack +class SquashedGaussian(TFActionDistribution): + """A tanh-squashed Gaussian 
distribution defined by: mean, std, low, high. + + The distribution will never return low or high exactly, but + `low`+SMALL_NUMBER or `high`-SMALL_NUMBER respectively. + """ + + def __init__( + self, + inputs: List[TensorType], + model: ModelV2, + low: float = -1.0, + high: float = 1.0, + ): + """Parameterizes the distribution via `inputs`. + + Args: + low: The lowest possible sampling value + (excluding this value). + high: The highest possible sampling value + (excluding this value). + """ + assert tfp is not None + mean, log_std = tf.split(inputs, 2, axis=-1) + # Clip `scale` values (coming from NN) to reasonable values. + log_std = tf.clip_by_value(log_std, MIN_LOG_NN_OUTPUT, MAX_LOG_NN_OUTPUT) + std = tf.exp(log_std) + self.distr = tfp.distributions.Normal(loc=mean, scale=std) + assert np.all(np.less(low, high)) + self.low = low + self.high = high + super().__init__(inputs, model) + + @override(ActionDistribution) + def deterministic_sample(self) -> TensorType: + mean = self.distr.mean() + return self._squash(mean) + + @override(TFActionDistribution) + def _build_sample_op(self) -> TensorType: + return self._squash(self.distr.sample()) + + @override(ActionDistribution) + def logp(self, x: TensorType) -> TensorType: + # Unsquash values (from [low,high] to ]-inf,inf[) + unsquashed_values = tf.cast(self._unsquash(x), self.inputs.dtype) + # Get log prob of unsquashed values from our Normal. + log_prob_gaussian = self.distr.log_prob(unsquashed_values) + # For safety reasons, clamp somehow, only then sum up. + log_prob_gaussian = tf.clip_by_value(log_prob_gaussian, -100, 100) + log_prob_gaussian = tf.reduce_sum(log_prob_gaussian, axis=-1) + # Get log-prob for squashed Gaussian. 
+ unsquashed_values_tanhd = tf.math.tanh(unsquashed_values) + log_prob = log_prob_gaussian - tf.reduce_sum( + tf.math.log(1 - unsquashed_values_tanhd**2 + SMALL_NUMBER), axis=-1 + ) + return log_prob + + def sample_logp(self): + z = self.distr.sample() + actions = self._squash(z) + return actions, tf.reduce_sum( + self.distr.log_prob(z) - tf.math.log(1 - actions * actions + SMALL_NUMBER), + axis=-1, + ) + + @override(ActionDistribution) + def entropy(self) -> TensorType: + raise ValueError("Entropy not defined for SquashedGaussian!") + + @override(ActionDistribution) + def kl(self, other: ActionDistribution) -> TensorType: + raise ValueError("KL not defined for SquashedGaussian!") + + def _squash(self, raw_values: TensorType) -> TensorType: + # Returned values are within [low, high] (including `low` and `high`). + squashed = ((tf.math.tanh(raw_values) + 1.0) / 2.0) * ( + self.high - self.low + ) + self.low + return tf.clip_by_value(squashed, self.low, self.high) + + def _unsquash(self, values: TensorType) -> TensorType: + normed_values = (values - self.low) / (self.high - self.low) * 2.0 - 1.0 + # Stabilize input to atanh. + save_normed_values = tf.clip_by_value( + normed_values, -1.0 + SMALL_NUMBER, 1.0 - SMALL_NUMBER + ) + unsquashed = tf.math.atanh(save_normed_values) + return unsquashed + + @staticmethod + @override(ActionDistribution) + def required_model_output_shape( + action_space: gym.Space, model_config: ModelConfigDict + ) -> Union[int, np.ndarray]: + return np.prod(action_space.shape, dtype=np.int32) * 2 + + +@OldAPIStack +class Beta(TFActionDistribution): + """ + A Beta distribution is defined on the interval [0, 1] and parameterized by + shape parameters alpha and beta (also called concentration parameters). + + PDF(x; alpha, beta) = x**(alpha - 1) (1 - x)**(beta - 1) / Z + with Z = Gamma(alpha) Gamma(beta) / Gamma(alpha + beta) + and Gamma(n) = (n - 1)! 
+ """ + + def __init__( + self, + inputs: List[TensorType], + model: ModelV2, + low: float = 0.0, + high: float = 1.0, + ): + # Stabilize input parameters (possibly coming from a linear layer). + inputs = tf.clip_by_value(inputs, log(SMALL_NUMBER), -log(SMALL_NUMBER)) + inputs = tf.math.log(tf.math.exp(inputs) + 1.0) + 1.0 + self.low = low + self.high = high + alpha, beta = tf.split(inputs, 2, axis=-1) + # Note: concentration0==beta, concentration1=alpha (!) + self.dist = tfp.distributions.Beta(concentration1=alpha, concentration0=beta) + super().__init__(inputs, model) + + @override(ActionDistribution) + def deterministic_sample(self) -> TensorType: + mean = self.dist.mean() + return self._squash(mean) + + @override(TFActionDistribution) + def _build_sample_op(self) -> TensorType: + return self._squash(self.dist.sample()) + + @override(ActionDistribution) + def logp(self, x: TensorType) -> TensorType: + unsquashed_values = self._unsquash(x) + return tf.math.reduce_sum(self.dist.log_prob(unsquashed_values), axis=-1) + + def _squash(self, raw_values: TensorType) -> TensorType: + return raw_values * (self.high - self.low) + self.low + + def _unsquash(self, values: TensorType) -> TensorType: + return (values - self.low) / (self.high - self.low) + + @staticmethod + @override(ActionDistribution) + def required_model_output_shape( + action_space: gym.Space, model_config: ModelConfigDict + ) -> Union[int, np.ndarray]: + return np.prod(action_space.shape, dtype=np.int32) * 2 + + +@OldAPIStack +class Deterministic(TFActionDistribution): + """Action distribution that returns the input values directly. + + This is similar to DiagGaussian with standard deviation zero (thus only + requiring the "mean" values as NN output). 
+ """ + + @override(ActionDistribution) + def deterministic_sample(self) -> TensorType: + return self.inputs + + @override(TFActionDistribution) + def logp(self, x: TensorType) -> TensorType: + return tf.zeros_like(self.inputs) + + @override(TFActionDistribution) + def _build_sample_op(self) -> TensorType: + return self.inputs + + @staticmethod + @override(ActionDistribution) + def required_model_output_shape( + action_space: gym.Space, model_config: ModelConfigDict + ) -> Union[int, np.ndarray]: + return np.prod(action_space.shape, dtype=np.int32) + + +@OldAPIStack +class MultiActionDistribution(TFActionDistribution): + """Action distribution that operates on a set of actions. + + Args: + inputs (Tensor list): A list of tensors from which to compute samples. + """ + + def __init__( + self, inputs, model, *, child_distributions, input_lens, action_space, **kwargs + ): + ActionDistribution.__init__(self, inputs, model) + + self.action_space_struct = get_base_struct_from_space(action_space) + + self.input_lens = np.array(input_lens, dtype=np.int32) + split_inputs = tf.split(inputs, self.input_lens, axis=1) + self.flat_child_distributions = tree.map_structure( + lambda dist, input_: dist(input_, model, **kwargs), + child_distributions, + split_inputs, + ) + + @override(ActionDistribution) + def logp(self, x): + # Single tensor input (all merged). + if isinstance(x, (tf.Tensor, np.ndarray)): + split_indices = [] + for dist in self.flat_child_distributions: + if isinstance(dist, Categorical): + split_indices.append(1) + elif ( + isinstance(dist, MultiCategorical) and dist.action_space is not None + ): + split_indices.append(np.prod(dist.action_space.shape)) + else: + sample = dist.sample() + # Cover Box(shape=()) case. + if len(sample.shape) == 1: + split_indices.append(1) + else: + split_indices.append(tf.shape(sample)[1]) + split_x = tf.split(x, split_indices, axis=1) + # Structured or flattened (by single action component) input. 
+ else: + split_x = tree.flatten(x) + + def map_(val, dist): + # Remove extra categorical dimension. + if isinstance(dist, Categorical): + val = tf.cast( + tf.squeeze(val, axis=-1) if len(val.shape) > 1 else val, tf.int32 + ) + return dist.logp(val) + + # Remove extra categorical dimension and take the logp of each + # component. + flat_logps = tree.map_structure(map_, split_x, self.flat_child_distributions) + + return functools.reduce(lambda a, b: a + b, flat_logps) + + @override(ActionDistribution) + def kl(self, other): + kl_list = [ + d.kl(o) + for d, o in zip( + self.flat_child_distributions, other.flat_child_distributions + ) + ] + return functools.reduce(lambda a, b: a + b, kl_list) + + @override(ActionDistribution) + def entropy(self): + entropy_list = [d.entropy() for d in self.flat_child_distributions] + return functools.reduce(lambda a, b: a + b, entropy_list) + + @override(ActionDistribution) + def sample(self): + child_distributions = tree.unflatten_as( + self.action_space_struct, self.flat_child_distributions + ) + return tree.map_structure(lambda s: s.sample(), child_distributions) + + @override(ActionDistribution) + def deterministic_sample(self): + child_distributions = tree.unflatten_as( + self.action_space_struct, self.flat_child_distributions + ) + return tree.map_structure( + lambda s: s.deterministic_sample(), child_distributions + ) + + @override(TFActionDistribution) + def sampled_action_logp(self): + p = self.flat_child_distributions[0].sampled_action_logp() + for c in self.flat_child_distributions[1:]: + p += c.sampled_action_logp() + return p + + @override(ActionDistribution) + def required_model_output_shape(self, action_space, model_config): + return np.sum(self.input_lens, dtype=np.int32) + + +@OldAPIStack +class Dirichlet(TFActionDistribution): + """Dirichlet distribution for continuous actions that are between + [0,1] and sum to 1. + + e.g. 
actions that represent resource allocation.""" + + def __init__(self, inputs: List[TensorType], model: ModelV2): + """Input is a tensor of logits. The exponential of logits is used to + parametrize the Dirichlet distribution as all parameters need to be + positive. An arbitrary small epsilon is added to the concentration + parameters to be zero due to numerical error. + + See issue #4440 for more details. + """ + self.epsilon = 1e-7 + concentration = tf.exp(inputs) + self.epsilon + self.dist = tf1.distributions.Dirichlet( + concentration=concentration, + validate_args=True, + allow_nan_stats=False, + ) + super().__init__(concentration, model) + + @override(ActionDistribution) + def deterministic_sample(self) -> TensorType: + return tf.nn.softmax(self.dist.concentration) + + @override(ActionDistribution) + def logp(self, x: TensorType) -> TensorType: + # Support of Dirichlet are positive real numbers. x is already + # an array of positive numbers, but we clip to avoid zeros due to + # numerical errors. 
+ x = tf.maximum(x, self.epsilon) + x = x / tf.reduce_sum(x, axis=-1, keepdims=True) + return self.dist.log_prob(x) + + @override(ActionDistribution) + def entropy(self) -> TensorType: + return self.dist.entropy() + + @override(ActionDistribution) + def kl(self, other: ActionDistribution) -> TensorType: + return self.dist.kl_divergence(other.dist) + + @override(TFActionDistribution) + def _build_sample_op(self) -> TensorType: + return self.dist.sample() + + @staticmethod + @override(ActionDistribution) + def required_model_output_shape( + action_space: gym.Space, model_config: ModelConfigDict + ) -> Union[int, np.ndarray]: + return np.prod(action_space.shape, dtype=np.int32) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/tf_distributions.py b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/tf_distributions.py new file mode 100644 index 0000000000000000000000000000000000000000..a99898f53e7f63b4c6a6d2e9ceb95dfc041f940c --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/tf_distributions.py @@ -0,0 +1,552 @@ +"""The main difference between this and the old ActionDistribution is that this one +has more explicit input args. So that the input format does not have to be guessed from +the code. This matches the design pattern of torch distribution which developers may +already be familiar with. +""" +import gymnasium as gym +import tree +import numpy as np +from typing import Dict, Iterable, List, Optional +import abc + + +from ray.rllib.models.distributions import Distribution +from ray.rllib.utils.annotations import override, DeveloperAPI +from ray.rllib.utils.framework import try_import_tf, try_import_tfp +from ray.rllib.utils.typing import TensorType, Union, Tuple + + +_, tf, _ = try_import_tf() +tfp = try_import_tfp() + +# TODO (Kourosh) Write unittest for this class similar to torch distributions. 
+ + +@DeveloperAPI +class TfDistribution(Distribution, abc.ABC): + """Wrapper class for tfp.distributions.""" + + def __init__(self, *args, **kwargs): + super().__init__() + self._dist = self._get_tf_distribution(*args, **kwargs) + + @abc.abstractmethod + def _get_tf_distribution(self, *args, **kwargs) -> "tfp.distributions.Distribution": + """Returns the tfp.distributions.Distribution object to use.""" + + @override(Distribution) + def logp(self, value: TensorType, **kwargs) -> TensorType: + return self._dist.log_prob(value, **kwargs) + + @override(Distribution) + def entropy(self) -> TensorType: + return self._dist.entropy() + + @override(Distribution) + def kl(self, other: "Distribution") -> TensorType: + return self._dist.kl_divergence(other._dist) + + @override(Distribution) + def sample( + self, *, sample_shape=() + ) -> Union[TensorType, Tuple[TensorType, TensorType]]: + sample = self._dist.sample(sample_shape) + return sample + + @override(Distribution) + def rsample( + self, *, sample_shape=() + ) -> Union[TensorType, Tuple[TensorType, TensorType]]: + raise NotImplementedError + + +@DeveloperAPI +class TfCategorical(TfDistribution): + """Wrapper class for Categorical distribution. + + Creates a categorical distribution parameterized by either :attr:`probs` or + :attr:`logits` (but not both). + + Samples are integers from :math:`\{0, \ldots, K-1\}` where `K` is + ``probs.size(-1)``. + + If `probs` is 1-dimensional with length-`K`, each element is the relative + probability of sampling the class at that index. + + If `probs` is N-dimensional, the first N-1 dimensions are treated as a batch of + relative probability vectors. + + .. testcode:: + :skipif: True + + m = TfCategorical([ 0.25, 0.25, 0.25, 0.25 ]) + m.sample(sample_shape=(2,)) # equal probability of 0, 1, 2, 3 + + .. testoutput:: + + tf.Tensor([2 3], shape=(2,), dtype=int32) + + Args: + probs: The probablities of each event. 
+ logits: Event log probabilities (unnormalized) + temperature: In case of using logits, this parameter can be used to determine + the sharpness of the distribution. i.e. + ``probs = softmax(logits / temperature)``. The temperature must be strictly + positive. A low value (e.g. 1e-10) will result in argmax sampling while a + larger value will result in uniform sampling. + """ + + @override(TfDistribution) + def __init__( + self, + probs: "tf.Tensor" = None, + logits: "tf.Tensor" = None, + ) -> None: + # We assert this here because to_deterministic makes this assumption. + assert (probs is None) != ( + logits is None + ), "Exactly one out of `probs` and `logits` must be set!" + + self.probs = probs + self.logits = logits + self.one_hot = tfp.distributions.OneHotCategorical(logits=logits, probs=probs) + super().__init__(logits=logits, probs=probs) + + @override(Distribution) + def logp(self, value: TensorType, **kwargs) -> TensorType: + # This prevents an error in which float values at the boundaries of the range + # of the distribution are passed to this function. 
+ return -tf.nn.sparse_softmax_cross_entropy_with_logits( + logits=self.logits if self.logits is not None else tf.log(self.probs), + labels=tf.cast(value, tf.int32), + ) + + @override(TfDistribution) + def _get_tf_distribution( + self, + probs: "tf.Tensor" = None, + logits: "tf.Tensor" = None, + ) -> "tfp.distributions.Distribution": + return tfp.distributions.Categorical(probs=probs, logits=logits) + + @staticmethod + @override(Distribution) + def required_input_dim(space: gym.Space, **kwargs) -> int: + assert isinstance(space, gym.spaces.Discrete) + return int(space.n) + + @override(Distribution) + def rsample(self, sample_shape=()): + one_hot_sample = self.one_hot.sample(sample_shape) + return tf.stop_gradients(one_hot_sample - self.probs) + self.probs + + @classmethod + @override(Distribution) + def from_logits(cls, logits: TensorType, **kwargs) -> "TfCategorical": + return TfCategorical(logits=logits, **kwargs) + + def to_deterministic(self) -> "TfDeterministic": + if self.probs is not None: + probs_or_logits = self.probs + else: + probs_or_logits = self.logits + + return TfDeterministic(loc=tf.math.argmax(probs_or_logits, axis=-1)) + + +@DeveloperAPI +class TfDiagGaussian(TfDistribution): + """Wrapper class for Normal distribution. + + Creates a normal distribution parameterized by :attr:`loc` and :attr:`scale`. In + case of multi-dimensional distribution, the variance is assumed to be diagonal. + + .. testcode:: + :skipif: True + + m = TfDiagGaussian(loc=[0.0, 0.0], scale=[1.0, 1.0]) + m.sample(sample_shape=(2,)) # 2d normal dist with loc=0 and scale=1 + + .. testoutput:: + + tensor([[ 0.1046, -0.6120], [ 0.234, 0.556]]) + + .. testcode:: + :skipif: True + + # scale is None + m = TfDiagGaussian(loc=[0.0, 1.0]) + m.sample(sample_shape=(2,)) # normally distributed with loc=0 and scale=1 + + .. testoutput:: + + tensor([0.1046, 0.6120]) + + + Args: + loc: mean of the distribution (often referred to as mu). 
If scale is None, the + second half of the `loc` will be used as the log of scale. + scale: standard deviation of the distribution (often referred to as sigma). + Has to be positive. + """ + + @override(TfDistribution) + def __init__( + self, + loc: Union[float, TensorType], + scale: Optional[Union[float, TensorType]] = None, + ): + self.loc = loc + super().__init__(loc=loc, scale=scale) + + @override(TfDistribution) + def _get_tf_distribution(self, loc, scale) -> "tfp.distributions.Distribution": + return tfp.distributions.Normal(loc=loc, scale=scale) + + @override(TfDistribution) + def logp(self, value: TensorType) -> TensorType: + return tf.math.reduce_sum(super().logp(value), axis=-1) + + @override(TfDistribution) + def entropy(self) -> TensorType: + return tf.math.reduce_sum(super().entropy(), axis=-1) + + @override(TfDistribution) + def kl(self, other: "TfDistribution") -> TensorType: + return tf.math.reduce_sum(super().kl(other), axis=-1) + + @staticmethod + @override(Distribution) + def required_input_dim(space: gym.Space, **kwargs) -> int: + assert isinstance(space, gym.spaces.Box) + return int(np.prod(space.shape, dtype=np.int32) * 2) + + @override(Distribution) + def rsample(self, sample_shape=()): + eps = tf.random.normal(sample_shape) + return self._dist.loc + eps * self._dist.scale + + @classmethod + @override(Distribution) + def from_logits(cls, logits: TensorType, **kwargs) -> "TfDiagGaussian": + loc, log_std = tf.split(logits, num_or_size_splits=2, axis=-1) + scale = tf.math.exp(log_std) + return TfDiagGaussian(loc=loc, scale=scale) + + def to_deterministic(self) -> "TfDeterministic": + return TfDeterministic(loc=self.loc) + + +@DeveloperAPI +class TfDeterministic(Distribution): + """The distribution that returns the input values directly. + + This is similar to DiagGaussian with standard deviation zero (thus only + requiring the "mean" values as NN output). + + Note: entropy is always zero, ang logp and kl are not implemented. + + .. 
testcode:: + :skipif: True + + m = TfDeterministic(loc=tf.constant([0.0, 0.0])) + m.sample(sample_shape=(2,)) + + .. testoutput:: + + Tensor([[ 0.0, 0.0], [ 0.0, 0.0]]) + + Args: + loc: the determinsitic value to return + """ + + @override(Distribution) + def __init__(self, loc: "tf.Tensor") -> None: + super().__init__() + self.loc = loc + + @override(Distribution) + def sample( + self, + *, + sample_shape: Tuple[int, ...] = (), + **kwargs, + ) -> Union[TensorType, Tuple[TensorType, TensorType]]: + shape = sample_shape + self.loc.shape + return tf.ones(shape, dtype=self.loc.dtype) * self.loc + + @override(Distribution) + def rsample( + self, + *, + sample_shape: Tuple[int, ...] = None, + **kwargs, + ) -> Union[TensorType, Tuple[TensorType, TensorType]]: + raise NotImplementedError + + @override(Distribution) + def logp(self, value: TensorType, **kwargs) -> TensorType: + return tf.zeros_like(self.loc) + + @override(Distribution) + def entropy(self, **kwargs) -> TensorType: + raise RuntimeError(f"`entropy()` not supported for {self.__class__.__name__}.") + + @override(Distribution) + def kl(self, other: "Distribution", **kwargs) -> TensorType: + raise RuntimeError(f"`kl()` not supported for {self.__class__.__name__}.") + + @staticmethod + @override(Distribution) + def required_input_dim(space: gym.Space, **kwargs) -> int: + assert isinstance(space, gym.spaces.Box) + return int(np.prod(space.shape, dtype=np.int32)) + + @classmethod + @override(Distribution) + def from_logits(cls, logits: TensorType, **kwargs) -> "TfDeterministic": + return TfDeterministic(loc=logits) + + def to_deterministic(self) -> "TfDeterministic": + return self + + +@DeveloperAPI +class TfMultiCategorical(Distribution): + """MultiCategorical distribution for MultiDiscrete action spaces.""" + + @override(Distribution) + def __init__( + self, + categoricals: List[TfCategorical], + ): + super().__init__() + self._cats = categoricals + + @override(Distribution) + def sample(self) -> TensorType: + arr 
= [cat.sample() for cat in self._cats] + sample_ = tf.stack(arr, axis=-1) + return sample_ + + @override(Distribution) + def rsample(self, sample_shape=()): + arr = [cat.rsample() for cat in self._cats] + sample_ = tf.stack(arr, axis=-1) + return sample_ + + @override(Distribution) + def logp(self, value: tf.Tensor) -> TensorType: + actions = tf.unstack(tf.cast(value, tf.int32), axis=-1) + logps = tf.stack([cat.logp(act) for cat, act in zip(self._cats, actions)]) + return tf.reduce_sum(logps, axis=0) + + @override(Distribution) + def entropy(self) -> TensorType: + return tf.reduce_sum( + tf.stack([cat.entropy() for cat in self._cats], axis=-1), axis=-1 + ) + + @override(Distribution) + def kl(self, other: Distribution) -> TensorType: + kls = tf.stack( + [cat.kl(oth_cat) for cat, oth_cat in zip(self._cats, other._cats)], axis=-1 + ) + return tf.reduce_sum(kls, axis=-1) + + @staticmethod + @override(Distribution) + def required_input_dim(space: gym.Space, **kwargs) -> int: + assert isinstance(space, gym.spaces.MultiDiscrete) + return int(np.sum(space.nvec)) + + @classmethod + @override(Distribution) + def from_logits( + cls, + logits: tf.Tensor, + input_lens: List[int], + **kwargs, + ) -> "TfMultiCategorical": + """Creates this Distribution from logits (and additional arguments). + + If you wish to create this distribution from logits only, please refer to + `Distribution.get_partial_dist_cls()`. + + Args: + logits: The tensor containing logits to be separated by logit_lens. + child_distribution_cls_struct: A struct of Distribution classes that can + be instantiated from the given logits. + input_lens: A list of integers that indicate the length of the logits + vectors to be passed into each child distribution. + **kwargs: Forward compatibility kwargs. 
+ """ + categoricals = [ + TfCategorical(logits=logits) + for logits in tf.split(logits, input_lens, axis=-1) + ] + + return TfMultiCategorical(categoricals=categoricals) + + def to_deterministic(self) -> "TfMultiDistribution": + return TfMultiDistribution([cat.to_deterministic() for cat in self._cats]) + + +@DeveloperAPI +class TfMultiDistribution(Distribution): + """Action distribution that operates on multiple, possibly nested actions.""" + + def __init__( + self, + child_distribution_struct: Union[Tuple, List, Dict], + ): + """Initializes a TfMultiDistribution object. + + Args: + child_distribution_struct: Any struct + that contains the child distribution classes to use to + instantiate the child distributions from `logits`. + """ + super().__init__() + self._original_struct = child_distribution_struct + self._flat_child_distributions = tree.flatten(child_distribution_struct) + + @override(Distribution) + def rsample( + self, + *, + sample_shape: Tuple[int, ...] = None, + **kwargs, + ) -> Union[TensorType, Tuple[TensorType, TensorType]]: + rsamples = [] + for dist in self._flat_child_distributions: + rsample = dist.rsample(sample_shape=sample_shape, **kwargs) + rsamples.append(rsample) + + rsamples = tree.unflatten_as(self._original_struct, rsamples) + return rsamples + + @override(Distribution) + def logp(self, value): + # Single tensor input (all merged). + if isinstance(value, (tf.Tensor, np.ndarray)): + split_indices = [] + for dist in self._flat_child_distributions: + if isinstance(dist, TfCategorical): + split_indices.append(1) + elif isinstance(dist, TfMultiCategorical): + split_indices.append(len(dist._cats)) + else: + sample = dist.sample() + # Cover Box(shape=()) case. + if len(sample.shape) == 1: + split_indices.append(1) + else: + split_indices.append(tf.shape(sample)[1]) + split_value = tf.split(value, split_indices, axis=1) + # Structured or flattened (by single action component) input. 
+ else: + split_value = tree.flatten(value) + + def map_(val, dist): + # Remove extra dimension if present. + if ( + isinstance(dist, TfCategorical) + and len(val.shape) > 1 + and val.shape[-1] == 1 + ): + val = tf.squeeze(val, axis=-1) + + return dist.logp(val) + + # Remove extra categorical dimension and take the logp of each + # component. + flat_logps = tree.map_structure( + map_, split_value, self._flat_child_distributions + ) + + return sum(flat_logps) + + @override(Distribution) + def kl(self, other): + kl_list = [ + d.kl(o) + for d, o in zip( + self._flat_child_distributions, other._flat_child_distributions + ) + ] + return sum(kl_list) + + @override(Distribution) + def entropy(self): + entropy_list = [d.entropy() for d in self._flat_child_distributions] + return sum(entropy_list) + + @override(Distribution) + def sample(self): + child_distributions_struct = tree.unflatten_as( + self._original_struct, self._flat_child_distributions + ) + return tree.map_structure(lambda s: s.sample(), child_distributions_struct) + + @staticmethod + @override(Distribution) + def required_input_dim(space: gym.Space, input_lens: List[int], **kwargs) -> int: + return sum(input_lens) + + @classmethod + @override(Distribution) + def from_logits( + cls, + logits: tf.Tensor, + child_distribution_cls_struct: Union[Dict, Iterable], + input_lens: Union[Dict, List[int]], + space: gym.Space, + **kwargs, + ) -> "TfMultiDistribution": + """Creates this Distribution from logits (and additional arguments). + + If you wish to create this distribution from logits only, please refer to + `Distribution.get_partial_dist_cls()`. + + Args: + logits: The tensor containing logits to be separated by `input_lens`. + child_distribution_cls_struct: A struct of Distribution classes that can + be instantiated from the given logits. + child_distribution_cls_struct: A struct of Distribution classes that can + be instantiated from the given logits. 
+ input_lens: A list or dict of integers that indicate the length of each + logit. If this is given as a dict, the structure should match the + structure of child_distribution_cls_struct. + space: The possibly nested output space. + **kwargs: Forward compatibility kwargs. + + Returns: + A TfMultiDistribution object. + """ + logit_lens = tree.flatten(input_lens) + child_distribution_cls_list = tree.flatten(child_distribution_cls_struct) + split_logits = tf.split(logits, logit_lens, axis=1) + + child_distribution_list = tree.map_structure( + lambda dist, input_: dist.from_logits(input_), + child_distribution_cls_list, + list(split_logits), + ) + + child_distribution_struct = tree.unflatten_as( + child_distribution_cls_struct, child_distribution_list + ) + + return TfMultiDistribution( + child_distribution_struct=child_distribution_struct, + ) + + def to_deterministic(self) -> "TfMultiDistribution": + flat_deterministic_dists = [ + dist.to_deterministic for dist in self._flat_child_distributions + ] + deterministic_dists = tree.unflatten_as( + self._original_struct, flat_deterministic_dists + ) + return TfMultiDistribution(deterministic_dists) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/tf_modelv2.py b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/tf_modelv2.py new file mode 100644 index 0000000000000000000000000000000000000000..7438796944248b443b0e3c91332275745a1ab467 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/tf_modelv2.py @@ -0,0 +1,142 @@ +import contextlib +import gymnasium as gym +import re +from typing import Dict, List, Union + +from ray.util import log_once +from ray.rllib.models.modelv2 import ModelV2 +from ray.rllib.utils.annotations import OldAPIStack, override +from ray.rllib.utils.deprecation import deprecation_warning +from ray.rllib.utils.framework import try_import_tf +from ray.rllib.utils.typing import ModelConfigDict, TensorType + +tf1, tf, tfv = try_import_tf() + + +@OldAPIStack 
+class TFModelV2(ModelV2): + """TF version of ModelV2, which should contain a tf keras Model. + + Note that this class by itself is not a valid model unless you + implement forward() in a subclass.""" + + def __init__( + self, + obs_space: gym.spaces.Space, + action_space: gym.spaces.Space, + num_outputs: int, + model_config: ModelConfigDict, + name: str, + ): + """Initializes a TFModelV2 instance. + + Here is an example implementation for a subclass + ``MyModelClass(TFModelV2)``:: + + def __init__(self, *args, **kwargs): + super(MyModelClass, self).__init__(*args, **kwargs) + input_layer = tf.keras.layers.Input(...) + hidden_layer = tf.keras.layers.Dense(...)(input_layer) + output_layer = tf.keras.layers.Dense(...)(hidden_layer) + value_layer = tf.keras.layers.Dense(...)(hidden_layer) + self.base_model = tf.keras.Model( + input_layer, [output_layer, value_layer]) + """ + super().__init__( + obs_space, action_space, num_outputs, model_config, name, framework="tf" + ) + + # Deprecated: TFModelV2 now automatically track their variables. + self.var_list = [] + + if tf1.executing_eagerly(): + self.graph = None + else: + self.graph = tf1.get_default_graph() + + def context(self) -> contextlib.AbstractContextManager: + """Returns a contextmanager for the current TF graph.""" + if self.graph: + return self.graph.as_default() + else: + return ModelV2.context(self) + + def update_ops(self) -> List[TensorType]: + """Return the list of update ops for this model. 
+ + For example, this should include any BatchNorm update ops.""" + return [] + + def register_variables(self, variables: List[TensorType]) -> None: + """Register the given list of variables with this model.""" + if log_once("deprecated_tfmodelv2_register_variables"): + deprecation_warning(old="TFModelV2.register_variables", error=False) + self.var_list.extend(variables) + + @override(ModelV2) + def variables( + self, as_dict: bool = False + ) -> Union[List[TensorType], Dict[str, TensorType]]: + if as_dict: + # Old way using `register_variables`. + if self.var_list: + return {v.name: v for v in self.var_list} + # New way: Automatically determine the var tree. + else: + return self._find_sub_modules("", self.__dict__) + + # Old way using `register_variables`. + if self.var_list: + return list(self.var_list) + # New way: Automatically determine the var tree. + else: + return list(self.variables(as_dict=True).values()) + + @override(ModelV2) + def trainable_variables( + self, as_dict: bool = False + ) -> Union[List[TensorType], Dict[str, TensorType]]: + if as_dict: + return { + k: v for k, v in self.variables(as_dict=True).items() if v.trainable + } + return [v for v in self.variables() if v.trainable] + + @staticmethod + def _find_sub_modules(current_key, struct): + # Keras Model: key=k + "." + var-name (replace '/' by '.'). + if isinstance(struct, tf.keras.models.Model) or isinstance(struct, tf.Module): + ret = {} + for var in struct.variables: + name = re.sub("/", ".", var.name) + key = current_key + "." + name + ret[key] = var + return ret + # Other TFModelV2: Include its vars into ours. + elif isinstance(struct, TFModelV2): + return { + current_key + "." + key: var + for key, var in struct.variables(as_dict=True).items() + } + # tf.Variable + elif isinstance(struct, tf.Variable): + return {current_key: struct} + # List/Tuple. 
+ elif isinstance(struct, (tuple, list)): + ret = {} + for i, value in enumerate(struct): + sub_vars = TFModelV2._find_sub_modules( + current_key + "_{}".format(i), value + ) + ret.update(sub_vars) + return ret + # Dict. + elif isinstance(struct, dict): + if current_key: + current_key += "_" + ret = {} + for key, value in struct.items(): + sub_vars = TFModelV2._find_sub_modules(current_key + str(key), value) + ret.update(sub_vars) + return ret + return {} diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/visionnet.py b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/visionnet.py new file mode 100644 index 0000000000000000000000000000000000000000..69124c9e2e61ef48272dfcbae99503db13a98b07 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/models/tf/visionnet.py @@ -0,0 +1,264 @@ +import gymnasium as gym +from typing import Dict, List + +from ray.rllib.models.tf.tf_modelv2 import TFModelV2 +from ray.rllib.models.tf.misc import normc_initializer +from ray.rllib.models.utils import get_activation_fn, get_filter_config +from ray.rllib.utils.annotations import OldAPIStack +from ray.rllib.utils.framework import try_import_tf +from ray.rllib.utils.typing import ModelConfigDict, TensorType + +tf1, tf, tfv = try_import_tf() + + +@OldAPIStack +class VisionNetwork(TFModelV2): + """Generic vision network implemented in ModelV2 API. + + An additional post-conv fully connected stack can be added and configured + via the config keys: + `post_fcnet_hiddens`: Dense layer sizes after the Conv2D stack. + `post_fcnet_activation`: Activation function to use for this FC stack. 
+ """ + + def __init__( + self, + obs_space: gym.spaces.Space, + action_space: gym.spaces.Space, + num_outputs: int, + model_config: ModelConfigDict, + name: str, + ): + if not model_config.get("conv_filters"): + model_config["conv_filters"] = get_filter_config(obs_space.shape) + + super(VisionNetwork, self).__init__( + obs_space, action_space, num_outputs, model_config, name + ) + + activation = get_activation_fn( + self.model_config.get("conv_activation"), framework="tf" + ) + filters = self.model_config["conv_filters"] + assert len(filters) > 0, "Must provide at least 1 entry in `conv_filters`!" + + # Post FC net config. + post_fcnet_hiddens = model_config.get("post_fcnet_hiddens", []) + post_fcnet_activation = get_activation_fn( + model_config.get("post_fcnet_activation"), framework="tf" + ) + + no_final_linear = self.model_config.get("no_final_linear") + vf_share_layers = self.model_config.get("vf_share_layers") + + input_shape = obs_space.shape + self.data_format = "channels_last" + + inputs = tf.keras.layers.Input(shape=input_shape, name="observations") + last_layer = inputs + # Whether the last layer is the output of a Flattened (rather than + # a n x (1,1) Conv2D). + self.last_layer_is_flattened = False + + # Build the action layers + for i, (out_size, kernel, stride) in enumerate(filters[:-1], 1): + last_layer = tf.keras.layers.Conv2D( + out_size, + kernel, + strides=stride + if isinstance(stride, (list, tuple)) + else (stride, stride), + activation=activation, + padding="same", + data_format="channels_last", + name="conv{}".format(i), + )(last_layer) + + out_size, kernel, stride = filters[-1] + + # No final linear: Last layer has activation function and exits with + # num_outputs nodes (this could be a 1x1 conv or a FC layer, depending + # on `post_fcnet_...` settings). 
+ if no_final_linear and num_outputs: + last_layer = tf.keras.layers.Conv2D( + out_size if post_fcnet_hiddens else num_outputs, + kernel, + strides=stride + if isinstance(stride, (list, tuple)) + else (stride, stride), + activation=activation, + padding="valid", + data_format="channels_last", + name="conv_out", + )(last_layer) + # Add (optional) post-fc-stack after last Conv2D layer. + layer_sizes = post_fcnet_hiddens[:-1] + ( + [num_outputs] if post_fcnet_hiddens else [] + ) + feature_out = last_layer + + for i, out_size in enumerate(layer_sizes): + feature_out = last_layer + last_layer = tf.keras.layers.Dense( + out_size, + name="post_fcnet_{}".format(i), + activation=post_fcnet_activation, + kernel_initializer=normc_initializer(1.0), + )(last_layer) + + # Finish network normally (w/o overriding last layer size with + # `num_outputs`), then add another linear one of size `num_outputs`. + else: + last_layer = tf.keras.layers.Conv2D( + out_size, + kernel, + strides=stride + if isinstance(stride, (list, tuple)) + else (stride, stride), + activation=activation, + padding="valid", + data_format="channels_last", + name="conv{}".format(len(filters)), + )(last_layer) + + # num_outputs defined. Use that to create an exact + # `num_output`-sized (1,1)-Conv2D. + if num_outputs: + if post_fcnet_hiddens: + last_cnn = last_layer = tf.keras.layers.Conv2D( + post_fcnet_hiddens[0], + [1, 1], + activation=post_fcnet_activation, + padding="same", + data_format="channels_last", + name="conv_out", + )(last_layer) + # Add (optional) post-fc-stack after last Conv2D layer. 
+ for i, out_size in enumerate( + post_fcnet_hiddens[1:] + [num_outputs] + ): + feature_out = last_layer + last_layer = tf.keras.layers.Dense( + out_size, + name="post_fcnet_{}".format(i + 1), + activation=post_fcnet_activation + if i < len(post_fcnet_hiddens) - 1 + else None, + kernel_initializer=normc_initializer(1.0), + )(last_layer) + else: + feature_out = last_layer + last_cnn = last_layer = tf.keras.layers.Conv2D( + num_outputs, + [1, 1], + activation=None, + padding="same", + data_format="channels_last", + name="conv_out", + )(last_layer) + + if last_cnn.shape[1] != 1 or last_cnn.shape[2] != 1: + raise ValueError( + "Given `conv_filters` ({}) do not result in a [B, 1, " + "1, {} (`num_outputs`)] shape (but in {})! Please " + "adjust your Conv2D stack such that the dims 1 and 2 " + "are both 1.".format( + self.model_config["conv_filters"], + self.num_outputs, + list(last_cnn.shape), + ) + ) + + # num_outputs not known -> Flatten, then set self.num_outputs + # to the resulting number of nodes. + else: + self.last_layer_is_flattened = True + last_layer = tf.keras.layers.Flatten(data_format="channels_last")( + last_layer + ) + + # Add (optional) post-fc-stack after last Conv2D layer. 
+ for i, out_size in enumerate(post_fcnet_hiddens): + last_layer = tf.keras.layers.Dense( + out_size, + name="post_fcnet_{}".format(i), + activation=post_fcnet_activation, + kernel_initializer=normc_initializer(1.0), + )(last_layer) + feature_out = last_layer + self.num_outputs = last_layer.shape[1] + logits_out = last_layer + + # Build the value layers + if vf_share_layers: + if not self.last_layer_is_flattened: + feature_out = tf.keras.layers.Lambda( + lambda x: tf.squeeze(x, axis=[1, 2]) + )(feature_out) + value_out = tf.keras.layers.Dense( + 1, + name="value_out", + activation=None, + kernel_initializer=normc_initializer(0.01), + )(feature_out) + else: + # build a parallel set of hidden layers for the value net + last_layer = inputs + for i, (out_size, kernel, stride) in enumerate(filters[:-1], 1): + last_layer = tf.keras.layers.Conv2D( + out_size, + kernel, + strides=stride + if isinstance(stride, (list, tuple)) + else (stride, stride), + activation=activation, + padding="same", + data_format="channels_last", + name="conv_value_{}".format(i), + )(last_layer) + out_size, kernel, stride = filters[-1] + last_layer = tf.keras.layers.Conv2D( + out_size, + kernel, + strides=stride + if isinstance(stride, (list, tuple)) + else (stride, stride), + activation=activation, + padding="valid", + data_format="channels_last", + name="conv_value_{}".format(len(filters)), + )(last_layer) + last_layer = tf.keras.layers.Conv2D( + 1, + [1, 1], + activation=None, + padding="same", + data_format="channels_last", + name="conv_value_out", + )(last_layer) + value_out = tf.keras.layers.Lambda(lambda x: tf.squeeze(x, axis=[1, 2]))( + last_layer + ) + + self.base_model = tf.keras.Model(inputs, [logits_out, value_out]) + + def forward( + self, + input_dict: Dict[str, TensorType], + state: List[TensorType], + seq_lens: TensorType, + ) -> (TensorType, List[TensorType]): + obs = input_dict["obs"] + if self.data_format == "channels_first": + obs = tf.transpose(obs, [0, 2, 3, 1]) + # Explicit 
cast to float32 needed in eager. + model_out, self._value_out = self.base_model(tf.cast(obs, tf.float32)) + # Our last layer is already flat. + if self.last_layer_is_flattened: + return model_out, state + # Last layer is a n x [1,1] Conv2D -> Flatten. + else: + return tf.squeeze(model_out, axis=[1, 2]), state + + def value_function(self) -> TensorType: + return tf.reshape(self._value_out, [-1]) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/models/utils.py b/.venv/lib/python3.11/site-packages/ray/rllib/models/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..c57b94bbfd1887dfa33cf19fdfe1e1ca48889d20 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/models/utils.py @@ -0,0 +1,280 @@ +from typing import Callable, Optional, Union + +from ray.rllib.utils.annotations import DeveloperAPI +from ray.rllib.utils.framework import try_import_jax, try_import_tf, try_import_torch + + +@DeveloperAPI +def get_activation_fn( + name: Optional[Union[Callable, str]] = None, + framework: str = "tf", +): + """Returns a framework specific activation function, given a name string. + + Args: + name: One of "relu" (default), "tanh", "elu", + "swish" (or "silu", which is the same), or "linear" (same as None). + framework: One of "jax", "tf|tf2" or "torch". + + Returns: + A framework-specific activtion function. e.g. tf.nn.tanh or + torch.nn.ReLU. None if name in ["linear", None]. + + Raises: + ValueError: If name is an unknown activation function. + """ + # Already a callable, return as-is. + if callable(name): + return name + + name_lower = name.lower() if isinstance(name, str) else name + + # Infer the correct activation function from the string specifier. + if framework == "torch": + if name_lower in ["linear", None]: + return None + + _, nn = try_import_torch() + # First try getting the correct activation function from nn directly. + # Note that torch activation functions are not all lower case. 
+ fn = getattr(nn, name, None) + if fn is not None: + return fn + + if name_lower in ["swish", "silu"]: + return nn.SiLU + elif name_lower == "relu": + return nn.ReLU + elif name_lower == "tanh": + return nn.Tanh + elif name_lower == "elu": + return nn.ELU + elif framework == "jax": + if name_lower in ["linear", None]: + return None + jax, _ = try_import_jax() + if name_lower in ["swish", "silu"]: + return jax.nn.swish + if name_lower == "relu": + return jax.nn.relu + elif name_lower == "tanh": + return jax.nn.hard_tanh + elif name_lower == "elu": + return jax.nn.elu + else: + assert framework in ["tf", "tf2"], "Unsupported framework `{}`!".format( + framework + ) + if name_lower in ["linear", None]: + return None + + tf1, tf, tfv = try_import_tf() + # Try getting the correct activation function from tf.nn directly. + # Note that tf activation functions are all lower case, so this should always + # work. + fn = getattr(tf.nn, name_lower, None) + + if fn is not None: + return fn + + raise ValueError( + "Unknown activation ({}) for framework={}!".format(name, framework) + ) + + +@DeveloperAPI +def get_initializer_fn(name: Optional[Union[str, Callable]], framework: str = "torch"): + """Returns the framework-specific initializer class or function. + + This function relies fully on the specified initializer classes and + functions in the frameworks `torch` and `tf2` (see for `torch` + https://pytorch.org/docs/stable/nn.init.html and for `tf2` see + https://www.tensorflow.org/api_docs/python/tf/keras/initializers). + + Note, for framework `torch` the in-place initializers are needed, i.e. names + should end with an underscore `_`, e.g. `glorot_uniform_`. + + Args: + name: Name of the initializer class or function in one of the two + supported frameworks, i.e. `torch` or `tf2`. + framework: The framework string, either `torch or `tf2`. 
+ + Returns: + A framework-specific function or class defining an initializer to be used + for network initialization, + + Raises: + `ValueError` if the `name` is neither class or function in the specified + `framework`. Raises also a `ValueError`, if `name` does not define an + in-place initializer for framework `torch`. + """ + # Already a callable or `None` return as is. If `None` we use the default + # initializer defined in the framework-specific layers themselves. + if callable(name) or name is None: + return name + + if framework == "torch": + name_lower = name.lower() if isinstance(name, str) else name + + _, nn = try_import_torch() + + # Check, if the name includes an underscore. We must use the + # in-place initialization from Torch. + if not name_lower.endswith("_"): + raise ValueError( + "Not an in-place initializer: Torch weight initializers " + "need to be provided as their in-place version, i.e. " + " + '_'. See " + "https://pytorch.org/docs/stable/nn.init.html. " + f"User provided {name}." + ) + + # First, try to get the initialization directly from `nn.init`. + # Note, that all initialization methods in `nn.init` are lower + # case and that `_` defines the "in-place" method. + fn = getattr(nn.init, name_lower, None) + if fn is not None: + # TODO (simon): Raise a warning if not "in-place" method. + return fn + # Unknown initializer. + else: + # Inform the user that this initializer does not exist. + raise ValueError( + f"Unknown initializer name: {name_lower} is not a method in " + "`torch.nn.init`!" + ) + elif framework == "tf2": + # Note, as initializer classes in TensorFlow can be either given by their + # name in camel toe typing or by their shortcut we use the `name` as it is. + # See https://www.tensorflow.org/api_docs/python/tf/keras/initializers. + + _, tf, _ = try_import_tf() + + # Try to get the initialization function directly from `tf.keras.initializers`. 
+ fn = getattr(tf.keras.initializers, name, None) + if fn is not None: + return fn + # Unknown initializer. + else: + # Inform the user that this initializer does not exist. + raise ValueError( + f"Unknown initializer: {name} is not a initializer in " + "`tf.keras.initializers`!" + ) + + +@DeveloperAPI +def get_filter_config(shape): + """Returns a default Conv2D filter config (list) for a given image shape. + + Args: + shape (Tuple[int]): The input (image) shape, e.g. (84,84,3). + + Returns: + List[list]: The Conv2D filter configuration usable as `conv_filters` + inside a model config dict. + """ + # 96x96x3 (e.g. CarRacing-v0). + filters_96x96 = [ + [16, [8, 8], 4], + [32, [4, 4], 2], + [256, [11, 11], 2], + ] + # Atari. + filters_84x84 = [ + [16, [8, 8], 4], + [32, [4, 4], 2], + [256, [11, 11], 1], + ] + # Dreamer-style (S-sized model) Atari or DM Control Suite. + filters_64x64 = [ + [32, [4, 4], 2], + [64, [4, 4], 2], + [128, [4, 4], 2], + [256, [4, 4], 2], + ] + # Small (1/2) Atari. + filters_42x42 = [ + [16, [4, 4], 2], + [32, [4, 4], 2], + [256, [11, 11], 1], + ] + # Test image (10x10). + filters_10x10 = [ + [16, [5, 5], 2], + [32, [5, 5], 2], + ] + + shape = list(shape) + if len(shape) in [2, 3] and (shape[:2] == [96, 96] or shape[1:] == [96, 96]): + return filters_96x96 + elif len(shape) in [2, 3] and (shape[:2] == [84, 84] or shape[1:] == [84, 84]): + return filters_84x84 + elif len(shape) in [2, 3] and (shape[:2] == [64, 64] or shape[1:] == [64, 64]): + return filters_64x64 + elif len(shape) in [2, 3] and (shape[:2] == [42, 42] or shape[1:] == [42, 42]): + return filters_42x42 + elif len(shape) in [2, 3] and (shape[:2] == [10, 10] or shape[1:] == [10, 10]): + return filters_10x10 + else: + raise ValueError( + "No default configuration for obs shape {}".format(shape) + + ", you must specify `conv_filters` manually as a model option. 
" + "Default configurations are only available for inputs of the following " + "shapes: [42, 42, K], [84, 84, K], [64, 64, K], [10, 10, K]. You may " + "alternatively want to use a custom model or preprocessor." + ) + + +@DeveloperAPI +def get_initializer(name, framework="tf"): + """Returns a framework specific initializer, given a name string. + + Args: + name: One of "xavier_uniform" (default), "xavier_normal". + framework: One of "jax", "tf|tf2" or "torch". + + Returns: + A framework-specific initializer function, e.g. + tf.keras.initializers.GlorotUniform or + torch.nn.init.xavier_uniform_. + + Raises: + ValueError: If name is an unknown initializer. + """ + # Already a callable, return as-is. + if callable(name): + return name + + if framework == "jax": + _, flax = try_import_jax() + assert flax is not None, "`flax` not installed. Try `pip install jax flax`." + import flax.linen as nn + + if name in [None, "default", "xavier_uniform"]: + return nn.initializers.xavier_uniform() + elif name == "xavier_normal": + return nn.initializers.xavier_normal() + if framework == "torch": + _, nn = try_import_torch() + assert nn is not None, "`torch` not installed. Try `pip install torch`." + if name in [None, "default", "xavier_uniform"]: + return nn.init.xavier_uniform_ + elif name == "xavier_normal": + return nn.init.xavier_normal_ + else: + assert framework in ["tf", "tf2"], "Unsupported framework `{}`!".format( + framework + ) + tf1, tf, tfv = try_import_tf() + assert ( + tf is not None + ), "`tensorflow` not installed. Try `pip install tensorflow`." + if name in [None, "default", "xavier_uniform"]: + return tf.keras.initializers.GlorotUniform + elif name == "xavier_normal": + return tf.keras.initializers.GlorotNormal + + raise ValueError( + "Unknown activation ({}) for framework={}!".format(name, framework) + )