diff --git a/.gitattributes b/.gitattributes index 2b04037090d13ae8fad315e9702817de175b1be2..7c997f5173693385cf8f34df08526d910db5986d 100644 --- a/.gitattributes +++ b/.gitattributes @@ -175,3 +175,4 @@ tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/_ .venv/lib/python3.11/site-packages/ray/jars/ray_dist.jar filter=lfs diff=lfs merge=lfs -text .venv/lib/python3.11/site-packages/ray/_private/__pycache__/worker.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text .venv/lib/python3.11/site-packages/ray/_private/__pycache__/test_utils.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text +.venv/lib/python3.11/site-packages/ray/_private/thirdparty/pynvml/__pycache__/pynvml.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text diff --git a/.venv/lib/python3.11/site-packages/ray/_private/thirdparty/pynvml/__pycache__/pynvml.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/_private/thirdparty/pynvml/__pycache__/pynvml.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..585aec6750dec82942d931457abb7d9446298822 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/_private/thirdparty/pynvml/__pycache__/pynvml.cpython-311.pyc @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ebb34d8a5e73fa6657fb50dde3c5afc10ca55bef89431f9fbe15555295f4da0e +size 168124 diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..fe4eb9b9a10fb5482dce8bbb4b89589207cbc84f --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/__init__.py @@ -0,0 +1,12 @@ +from ray.rllib.algorithms.appo.appo import APPO, APPOConfig +from ray.rllib.algorithms.appo.appo_tf_policy import APPOTF1Policy, APPOTF2Policy +from ray.rllib.algorithms.appo.appo_torch_policy import APPOTorchPolicy + +__all__ = [ + "APPO", + "APPOConfig", + # 
@OldAPIStack + "APPOTF1Policy", + "APPOTF2Policy", + "APPOTorchPolicy", +] diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..faf324e310c7da70cc66b3a38077b9bd6f131a15 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/__pycache__/appo.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/__pycache__/appo.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3373ddf001b94947662a300c4208872d70759ffd Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/__pycache__/appo.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/__pycache__/appo_learner.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/__pycache__/appo_learner.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..361a97b1a678da06572e9364d3f990254b478edc Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/__pycache__/appo_learner.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/__pycache__/appo_rl_module.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/__pycache__/appo_rl_module.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1dd9017244d2b9b8479958a20f19cb34841bcca7 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/__pycache__/appo_rl_module.cpython-311.pyc differ diff --git 
a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/__pycache__/appo_tf_policy.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/__pycache__/appo_tf_policy.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d97032d0c7ce61a2f7ebb822576f6cd40fbe4ff0 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/__pycache__/appo_tf_policy.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/__pycache__/appo_torch_policy.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/__pycache__/appo_torch_policy.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4cad22493c3280c8028cc6b4338d31a478d478ff Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/__pycache__/appo_torch_policy.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/__pycache__/default_appo_rl_module.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/__pycache__/default_appo_rl_module.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..22ec0c8b6fe65d136fb8d459ac3929905a8c4f20 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/__pycache__/default_appo_rl_module.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/__pycache__/utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/__pycache__/utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..387a216f2e081375a2096e5b1cbf1183cce7e04e Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/__pycache__/utils.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/appo.py 
b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/appo.py new file mode 100644 index 0000000000000000000000000000000000000000..68267b876637a09c4e116a726e9f85ea0a049e8c --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/appo.py @@ -0,0 +1,434 @@ +"""Asynchronous Proximal Policy Optimization (APPO) + +The algorithm is described in [1] (under the name of "IMPACT"): + +Detailed documentation: +https://docs.ray.io/en/master/rllib-algorithms.html#appo + +[1] IMPACT: Importance Weighted Asynchronous Architectures with Clipped Target Networks. +Luo et al. 2020 +https://arxiv.org/pdf/1912.00167 +""" + +from typing import Optional, Type +import logging + +from ray.rllib.algorithms.algorithm_config import AlgorithmConfig, NotProvided +from ray.rllib.algorithms.impala.impala import IMPALA, IMPALAConfig +from ray.rllib.core.rl_module.rl_module import RLModuleSpec +from ray.rllib.policy.policy import Policy +from ray.rllib.utils.annotations import override +from ray.rllib.utils.deprecation import DEPRECATED_VALUE, deprecation_warning +from ray.rllib.utils.metrics import ( + LAST_TARGET_UPDATE_TS, + NUM_AGENT_STEPS_SAMPLED, + NUM_ENV_STEPS_SAMPLED, + NUM_TARGET_UPDATES, +) +from ray.rllib.utils.metrics import LEARNER_STATS_KEY + +logger = logging.getLogger(__name__) + + +LEARNER_RESULTS_KL_KEY = "mean_kl_loss" +LEARNER_RESULTS_CURR_KL_COEFF_KEY = "curr_kl_coeff" +OLD_ACTION_DIST_KEY = "old_action_dist" + + +class APPOConfig(IMPALAConfig): + """Defines a configuration class from which an APPO Algorithm can be built. + + .. testcode:: + + from ray.rllib.algorithms.appo import APPOConfig + config = ( + APPOConfig() + .training(lr=0.01, grad_clip=30.0, train_batch_size_per_learner=50) + ) + config = config.learners(num_learners=1) + config = config.env_runners(num_env_runners=1) + config = config.environment("CartPole-v1") + + # Build an Algorithm object from the config and run 1 training iteration. 
+ algo = config.build() + algo.train() + del algo + + .. testcode:: + + from ray.rllib.algorithms.appo import APPOConfig + from ray import air + from ray import tune + + config = APPOConfig() + # Update the config object. + config = config.training(lr=tune.grid_search([0.001,])) + # Set the config object's env. + config = config.environment(env="CartPole-v1") + # Use to_dict() to get the old-style python config dict when running with tune. + tune.Tuner( + "APPO", + run_config=air.RunConfig( + stop={"training_iteration": 1}, + verbose=0, + ), + param_space=config.to_dict(), + + ).fit() + + .. testoutput:: + :hide: + + ... + """ + + def __init__(self, algo_class=None): + """Initializes a APPOConfig instance.""" + self.exploration_config = { + # The Exploration class to use. In the simplest case, this is the name + # (str) of any class present in the `rllib.utils.exploration` package. + # You can also provide the python class directly or the full location + # of your class (e.g. "ray.rllib.utils.exploration.epsilon_greedy. + # EpsilonGreedy"). + "type": "StochasticSampling", + # Add constructor kwargs here (if any). + } + + super().__init__(algo_class=algo_class or APPO) + + # fmt: off + # __sphinx_doc_begin__ + # APPO specific settings: + self.vtrace = True + self.use_gae = True + self.lambda_ = 1.0 + self.clip_param = 0.4 + self.use_kl_loss = False + self.kl_coeff = 1.0 + self.kl_target = 0.01 + self.target_worker_clipping = 2.0 + + # Circular replay buffer settings. + # Used in [1] for discrete action tasks: + # `circular_buffer_num_batches=4` and `circular_buffer_iterations_per_batch=2` + # For cont. action tasks: + # `circular_buffer_num_batches=16` and `circular_buffer_iterations_per_batch=20` + self.circular_buffer_num_batches = 4 + self.circular_buffer_iterations_per_batch = 2 + + # Override some of IMPALAConfig's default values with APPO-specific values. 
+ self.num_env_runners = 2 + self.target_network_update_freq = 2 + self.broadcast_interval = 1 + self.grad_clip = 40.0 + # Note: Only when using enable_rl_module_and_learner=True can the clipping mode + # be configured by the user. On the old API stack, RLlib will always clip by + # global_norm, no matter the value of `grad_clip_by`. + self.grad_clip_by = "global_norm" + + self.opt_type = "adam" + self.lr = 0.0005 + self.decay = 0.99 + self.momentum = 0.0 + self.epsilon = 0.1 + self.vf_loss_coeff = 0.5 + self.entropy_coeff = 0.01 + self.tau = 1.0 + # __sphinx_doc_end__ + # fmt: on + + self.lr_schedule = None # @OldAPIStack + self.entropy_coeff_schedule = None # @OldAPIStack + self.num_gpus = 0 # @OldAPIStack + self.num_multi_gpu_tower_stacks = 1 # @OldAPIStack + self.minibatch_buffer_size = 1 # @OldAPIStack + self.replay_proportion = 0.0 # @OldAPIStack + self.replay_buffer_num_slots = 100 # @OldAPIStack + self.learner_queue_size = 16 # @OldAPIStack + self.learner_queue_timeout = 300 # @OldAPIStack + + # Deprecated keys. + self.target_update_frequency = DEPRECATED_VALUE + self.use_critic = DEPRECATED_VALUE + + @override(IMPALAConfig) + def training( + self, + *, + vtrace: Optional[bool] = NotProvided, + use_gae: Optional[bool] = NotProvided, + lambda_: Optional[float] = NotProvided, + clip_param: Optional[float] = NotProvided, + use_kl_loss: Optional[bool] = NotProvided, + kl_coeff: Optional[float] = NotProvided, + kl_target: Optional[float] = NotProvided, + target_network_update_freq: Optional[int] = NotProvided, + tau: Optional[float] = NotProvided, + target_worker_clipping: Optional[float] = NotProvided, + circular_buffer_num_batches: Optional[int] = NotProvided, + circular_buffer_iterations_per_batch: Optional[int] = NotProvided, + # Deprecated keys. + target_update_frequency=DEPRECATED_VALUE, + use_critic=DEPRECATED_VALUE, + **kwargs, + ) -> "APPOConfig": + """Sets the training related configuration. 
+ + Args: + vtrace: Whether to use V-trace weighted advantages. If false, PPO GAE + advantages will be used instead. + use_gae: If true, use the Generalized Advantage Estimator (GAE) + with a value function, see https://arxiv.org/pdf/1506.02438.pdf. + Only applies if vtrace=False. + lambda_: GAE (lambda) parameter. + clip_param: PPO surrogate slipping parameter. + use_kl_loss: Whether to use the KL-term in the loss function. + kl_coeff: Coefficient for weighting the KL-loss term. + kl_target: Target term for the KL-term to reach (via adjusting the + `kl_coeff` automatically). + target_network_update_freq: NOTE: This parameter is only applicable on + the new API stack. The frequency with which to update the target + policy network from the main trained policy network. The metric + used is `NUM_ENV_STEPS_TRAINED_LIFETIME` and the unit is `n` (see [1] + 4.1.1), where: `n = [circular_buffer_num_batches (N)] * + [circular_buffer_iterations_per_batch (K)] * [train batch size]` + For example, if you set `target_network_update_freq=2`, and N=4, K=2, + and `train_batch_size_per_learner=500`, then the target net is updated + every 2*4*2*500=8000 trained env steps (every 16 batch updates on each + learner). + The authors in [1] suggests that this setting is robust to a range of + choices (try values between 0.125 and 4). + target_network_update_freq: The frequency to update the target policy and + tune the kl loss coefficients that are used during training. After + setting this parameter, the algorithm waits for at least + `target_network_update_freq` number of environment samples to be trained + on before updating the target networks and tune the kl loss + coefficients. NOTE: This parameter is only applicable when using the + Learner API (enable_rl_module_and_learner=True). + tau: The factor by which to update the target policy network towards + the current policy network. Can range between 0 and 1. + e.g. 
updated_param = tau * current_param + (1 - tau) * target_param + target_worker_clipping: The maximum value for the target-worker-clipping + used for computing the IS ratio, described in [1] + IS = min(π(i) / π(target), ρ) * (π / π(i)) + circular_buffer_num_batches: The number of train batches that fit + into the circular buffer. Each such train batch can be sampled for + training max. `circular_buffer_iterations_per_batch` times. + circular_buffer_iterations_per_batch: The number of times any train + batch in the circular buffer can be sampled for training. A batch gets + evicted from the buffer either if it's the oldest batch in the buffer + and a new batch is added OR if the batch reaches this max. number of + being sampled. + + Returns: + This updated AlgorithmConfig object. + """ + if target_update_frequency != DEPRECATED_VALUE: + deprecation_warning( + old="target_update_frequency", + new="target_network_update_freq", + error=True, + ) + if use_critic != DEPRECATED_VALUE: + deprecation_warning( + old="use_critic", + help="`use_critic` no longer supported! APPO always uses a value " + "function (critic).", + error=True, + ) + + # Pass kwargs onto super's `training()` method. 
+ super().training(**kwargs) + + if vtrace is not NotProvided: + self.vtrace = vtrace + if use_gae is not NotProvided: + self.use_gae = use_gae + if lambda_ is not NotProvided: + self.lambda_ = lambda_ + if clip_param is not NotProvided: + self.clip_param = clip_param + if use_kl_loss is not NotProvided: + self.use_kl_loss = use_kl_loss + if kl_coeff is not NotProvided: + self.kl_coeff = kl_coeff + if kl_target is not NotProvided: + self.kl_target = kl_target + if target_network_update_freq is not NotProvided: + self.target_network_update_freq = target_network_update_freq + if tau is not NotProvided: + self.tau = tau + if target_worker_clipping is not NotProvided: + self.target_worker_clipping = target_worker_clipping + if circular_buffer_num_batches is not NotProvided: + self.circular_buffer_num_batches = circular_buffer_num_batches + if circular_buffer_iterations_per_batch is not NotProvided: + self.circular_buffer_iterations_per_batch = ( + circular_buffer_iterations_per_batch + ) + + return self + + @override(IMPALAConfig) + def validate(self) -> None: + super().validate() + + # On new API stack, circular buffer should be used, not `minibatch_buffer_size`. + if self.enable_rl_module_and_learner: + if self.minibatch_buffer_size != 1 or self.replay_proportion != 0.0: + self._value_error( + "`minibatch_buffer_size/replay_proportion` not valid on new API " + "stack with APPO! " + "Use `circular_buffer_num_batches` for the number of train batches " + "in the circular buffer. To change the maximum number of times " + "any batch may be sampled, set " + "`circular_buffer_iterations_per_batch`." + ) + if self.num_multi_gpu_tower_stacks != 1: + self._value_error( + "`num_multi_gpu_tower_stacks` not supported on new API stack with " + "APPO! In order to train on multi-GPU, use " + "`config.learners(num_learners=[number of GPUs], " + "num_gpus_per_learner=1)`. 
To scale the throughput of batch-to-GPU-" + "pre-loading on each of your `Learners`, set " + "`num_gpu_loader_threads` to a higher number (recommended values: " + "1-8)." + ) + if self.learner_queue_size != 16: + self._value_error( + "`learner_queue_size` not supported on new API stack with " + "APPO! In order set the size of the circular buffer (which acts as " + "a 'learner queue'), use " + "`config.training(circular_buffer_num_batches=..)`. To change the " + "maximum number of times any batch may be sampled, set " + "`config.training(circular_buffer_iterations_per_batch=..)`." + ) + + @override(IMPALAConfig) + def get_default_learner_class(self): + if self.framework_str == "torch": + from ray.rllib.algorithms.appo.torch.appo_torch_learner import ( + APPOTorchLearner, + ) + + return APPOTorchLearner + elif self.framework_str in ["tf2", "tf"]: + raise ValueError( + "TensorFlow is no longer supported on the new API stack! " + "Use `framework='torch'`." + ) + else: + raise ValueError( + f"The framework {self.framework_str} is not supported. " + "Use `framework='torch'`." + ) + + @override(IMPALAConfig) + def get_default_rl_module_spec(self) -> RLModuleSpec: + if self.framework_str == "torch": + from ray.rllib.algorithms.appo.torch.appo_torch_rl_module import ( + APPOTorchRLModule as RLModule, + ) + else: + raise ValueError( + f"The framework {self.framework_str} is not supported. " + "Use either 'torch' or 'tf2'." + ) + + return RLModuleSpec(module_class=RLModule) + + @property + @override(AlgorithmConfig) + def _model_config_auto_includes(self): + return super()._model_config_auto_includes | {"vf_share_layers": False} + + +class APPO(IMPALA): + def __init__(self, config, *args, **kwargs): + """Initializes an APPO instance.""" + super().__init__(config, *args, **kwargs) + + # After init: Initialize target net. + + # TODO(avnishn): Does this need to happen in __init__? 
I think we can move it + # to setup() + if not self.config.enable_rl_module_and_learner: + self.env_runner.foreach_policy_to_train(lambda p, _: p.update_target()) + + @override(IMPALA) + def training_step(self) -> None: + if self.config.enable_rl_module_and_learner: + return super().training_step() + + train_results = super().training_step() + # Update the target network and the KL coefficient for the APPO-loss. + # The target network update frequency is calculated automatically by the product + # of `num_epochs` setting (usually 1 for APPO) and `minibatch_buffer_size`. + last_update = self._counters[LAST_TARGET_UPDATE_TS] + cur_ts = self._counters[ + ( + NUM_AGENT_STEPS_SAMPLED + if self.config.count_steps_by == "agent_steps" + else NUM_ENV_STEPS_SAMPLED + ) + ] + target_update_freq = self.config.num_epochs * self.config.minibatch_buffer_size + if cur_ts - last_update > target_update_freq: + self._counters[NUM_TARGET_UPDATES] += 1 + self._counters[LAST_TARGET_UPDATE_TS] = cur_ts + + # Update our target network. + self.env_runner.foreach_policy_to_train(lambda p, _: p.update_target()) + + # Also update the KL-coefficient for the APPO loss, if necessary. + if self.config.use_kl_loss: + + def update(pi, pi_id): + assert LEARNER_STATS_KEY not in train_results, ( + "{} should be nested under policy id key".format( + LEARNER_STATS_KEY + ), + train_results, + ) + if pi_id in train_results: + kl = train_results[pi_id][LEARNER_STATS_KEY].get("kl") + assert kl is not None, (train_results, pi_id) + # Make the actual `Policy.update_kl()` call. + pi.update_kl(kl) + else: + logger.warning("No data for {}, not updating kl".format(pi_id)) + + # Update KL on all trainable policies within the local (trainer) + # Worker. 
+ self.env_runner.foreach_policy_to_train(update) + + return train_results + + @classmethod + @override(IMPALA) + def get_default_config(cls) -> AlgorithmConfig: + return APPOConfig() + + @classmethod + @override(IMPALA) + def get_default_policy_class( + cls, config: AlgorithmConfig + ) -> Optional[Type[Policy]]: + if config["framework"] == "torch": + from ray.rllib.algorithms.appo.appo_torch_policy import APPOTorchPolicy + + return APPOTorchPolicy + elif config["framework"] == "tf": + if config.enable_rl_module_and_learner: + raise ValueError( + "RLlib's RLModule and Learner API is not supported for" + " tf1. Use " + "framework='tf2' instead." + ) + from ray.rllib.algorithms.appo.appo_tf_policy import APPOTF1Policy + + return APPOTF1Policy + else: + from ray.rllib.algorithms.appo.appo_tf_policy import APPOTF2Policy + + return APPOTF2Policy diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/appo_learner.py b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/appo_learner.py new file mode 100644 index 0000000000000000000000000000000000000000..235bc823209f8a47c9fe189c0c9cd5b6b4804170 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/appo_learner.py @@ -0,0 +1,147 @@ +import abc +from typing import Any, Dict, Optional + +from ray.rllib.algorithms.appo.appo import APPOConfig +from ray.rllib.algorithms.appo.utils import CircularBuffer +from ray.rllib.algorithms.impala.impala_learner import IMPALALearner +from ray.rllib.core.learner.learner import Learner +from ray.rllib.core.learner.utils import update_target_network +from ray.rllib.core.rl_module.apis import TargetNetworkAPI, ValueFunctionAPI +from ray.rllib.core.rl_module.multi_rl_module import MultiRLModuleSpec +from ray.rllib.core.rl_module.rl_module import RLModuleSpec +from ray.rllib.utils.annotations import override +from ray.rllib.utils.lambda_defaultdict import LambdaDefaultDict +from ray.rllib.utils.metrics import ( + LAST_TARGET_UPDATE_TS, + 
NUM_ENV_STEPS_TRAINED_LIFETIME, + NUM_MODULE_STEPS_TRAINED, + NUM_TARGET_UPDATES, +) +from ray.rllib.utils.schedules.scheduler import Scheduler +from ray.rllib.utils.typing import ModuleID, ShouldModuleBeUpdatedFn + + +class APPOLearner(IMPALALearner): + """Adds KL coeff updates via `after_gradient_based_update()` to IMPALA logic. + + Framework-specific subclasses must override `_update_module_kl_coeff()`. + """ + + @override(IMPALALearner) + def build(self): + self._learner_thread_in_queue = CircularBuffer( + num_batches=self.config.circular_buffer_num_batches, + iterations_per_batch=self.config.circular_buffer_iterations_per_batch, + ) + + super().build() + + # Make target networks. + self.module.foreach_module( + lambda mid, mod: ( + mod.make_target_networks() + if isinstance(mod, TargetNetworkAPI) + else None + ) + ) + + # The current kl coefficients per module as (framework specific) tensor + # variables. + self.curr_kl_coeffs_per_module: LambdaDefaultDict[ + ModuleID, Scheduler + ] = LambdaDefaultDict( + lambda module_id: self._get_tensor_variable( + self.config.get_config_for_module(module_id).kl_coeff + ) + ) + + @override(Learner) + def add_module( + self, + *, + module_id: ModuleID, + module_spec: RLModuleSpec, + config_overrides: Optional[Dict] = None, + new_should_module_be_updated: Optional[ShouldModuleBeUpdatedFn] = None, + ) -> MultiRLModuleSpec: + marl_spec = super().add_module( + module_id=module_id, + module_spec=module_spec, + config_overrides=config_overrides, + new_should_module_be_updated=new_should_module_be_updated, + ) + # Create target networks for added Module, if applicable. 
+ if isinstance(self.module[module_id].unwrapped(), TargetNetworkAPI): + self.module[module_id].unwrapped().make_target_networks() + return marl_spec + + @override(IMPALALearner) + def remove_module(self, module_id: str) -> MultiRLModuleSpec: + marl_spec = super().remove_module(module_id) + self.curr_kl_coeffs_per_module.pop(module_id) + return marl_spec + + @override(Learner) + def after_gradient_based_update(self, *, timesteps: Dict[str, Any]) -> None: + """Updates the target Q Networks.""" + super().after_gradient_based_update(timesteps=timesteps) + + # TODO (sven): Maybe we should have a `after_gradient_based_update` + # method per module? + curr_timestep = timesteps.get(NUM_ENV_STEPS_TRAINED_LIFETIME, 0) + for module_id, module in self.module._rl_modules.items(): + config = self.config.get_config_for_module(module_id) + + last_update_ts_key = (module_id, LAST_TARGET_UPDATE_TS) + if isinstance(module.unwrapped(), TargetNetworkAPI) and ( + curr_timestep - self.metrics.peek(last_update_ts_key, default=0) + >= ( + config.target_network_update_freq + * config.circular_buffer_num_batches + * config.circular_buffer_iterations_per_batch + * config.train_batch_size_per_learner + ) + ): + for ( + main_net, + target_net, + ) in module.unwrapped().get_target_network_pairs(): + update_target_network( + main_net=main_net, + target_net=target_net, + tau=config.tau, + ) + # Increase lifetime target network update counter by one. + self.metrics.log_value((module_id, NUM_TARGET_UPDATES), 1, reduce="sum") + # Update the (single-value -> window=1) last updated timestep metric. 
+ self.metrics.log_value(last_update_ts_key, curr_timestep, window=1) + + if ( + config.use_kl_loss + and self.metrics.peek((module_id, NUM_MODULE_STEPS_TRAINED), default=0) + > 0 + ): + self._update_module_kl_coeff(module_id=module_id, config=config) + + @classmethod + @override(Learner) + def rl_module_required_apis(cls) -> list[type]: + # In order for a PPOLearner to update an RLModule, it must implement the + # following APIs: + return [TargetNetworkAPI, ValueFunctionAPI] + + @abc.abstractmethod + def _update_module_kl_coeff(self, module_id: ModuleID, config: APPOConfig) -> None: + """Dynamically update the KL loss coefficients of each module. + + The update is completed using the mean KL divergence between the action + distributions current policy and old policy of each module. That action + distribution is computed during the most recent update/call to `compute_loss`. + + Args: + module_id: The module whose KL loss coefficient to update. + config: The AlgorithmConfig specific to the given `module_id`. + """ + + +AppoLearner = APPOLearner diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/appo_rl_module.py b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/appo_rl_module.py new file mode 100644 index 0000000000000000000000000000000000000000..5a2f59f9f201bb2ca46e6136ec7e3882a30bd0bd --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/appo_rl_module.py @@ -0,0 +1,11 @@ +# Backward compat import. 
+from ray.rllib.algorithms.appo.default_appo_rl_module import ( # noqa + DefaultAPPORLModule as APPORLModule, +) +from ray.rllib.utils.deprecation import deprecation_warning + +deprecation_warning( + old="ray.rllib.algorithms.appo.appo_rl_module.APPORLModule", + new="ray.rllib.algorithms.appo.default_appo_rl_module.DefaultAPPORLModule", + error=False, +) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/appo_tf_policy.py b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/appo_tf_policy.py new file mode 100644 index 0000000000000000000000000000000000000000..4af36f099df9216e42e7c9cae039e6a12bddef6f --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/appo_tf_policy.py @@ -0,0 +1,393 @@ +""" +TensorFlow policy class used for APPO. + +Adapted from VTraceTFPolicy to use the PPO surrogate loss. +Keep in sync with changes to VTraceTFPolicy. +""" + +import numpy as np +import logging +import gymnasium as gym +from typing import Dict, List, Optional, Type, Union + +from ray.rllib.algorithms.appo.utils import make_appo_models +from ray.rllib.algorithms.impala import vtrace_tf as vtrace +from ray.rllib.algorithms.impala.impala_tf_policy import ( + _make_time_major, + VTraceClipGradients, + VTraceOptimizer, +) +from ray.rllib.evaluation.postprocessing import ( + compute_bootstrap_value, + compute_gae_for_sample_batch, + Postprocessing, +) +from ray.rllib.models.tf.tf_action_dist import Categorical +from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.policy.dynamic_tf_policy_v2 import DynamicTFPolicyV2 +from ray.rllib.policy.eager_tf_policy_v2 import EagerTFPolicyV2 +from ray.rllib.policy.tf_mixins import ( + EntropyCoeffSchedule, + LearningRateSchedule, + KLCoeffMixin, + ValueNetworkMixin, + GradStatsMixin, + TargetNetworkMixin, +) +from ray.rllib.models.modelv2 import ModelV2 +from ray.rllib.models.tf.tf_action_dist import TFActionDistribution +from ray.rllib.utils.annotations import ( + 
override, +) +from ray.rllib.utils.framework import try_import_tf +from ray.rllib.utils.tf_utils import explained_variance +from ray.rllib.utils.typing import TensorType + +tf1, tf, tfv = try_import_tf() + +logger = logging.getLogger(__name__) + + +# TODO (sven): Deprecate once APPO and IMPALA fully on RLModules/Learner APIs. +def get_appo_tf_policy(name: str, base: type) -> type: + """Construct an APPOTFPolicy inheriting either dynamic or eager base policies. + + Args: + base: Base class for this policy. DynamicTFPolicyV2 or EagerTFPolicyV2. + + Returns: + A TF Policy to be used with Impala. + """ + + class APPOTFPolicy( + VTraceClipGradients, + VTraceOptimizer, + LearningRateSchedule, + KLCoeffMixin, + EntropyCoeffSchedule, + ValueNetworkMixin, + TargetNetworkMixin, + GradStatsMixin, + base, + ): + def __init__( + self, + observation_space, + action_space, + config, + existing_model=None, + existing_inputs=None, + ): + # First thing first, enable eager execution if necessary. + base.enable_eager_execution_if_necessary() + + # Although this is a no-op, we call __init__ here to make it clear + # that base.__init__ will use the make_model() call. + VTraceClipGradients.__init__(self) + VTraceOptimizer.__init__(self) + + # Initialize base class. + base.__init__( + self, + observation_space, + action_space, + config, + existing_inputs=existing_inputs, + existing_model=existing_model, + ) + + # TF LearningRateSchedule depends on self.framework, so initialize + # after base.__init__() is called. + LearningRateSchedule.__init__(self, config["lr"], config["lr_schedule"]) + EntropyCoeffSchedule.__init__( + self, config["entropy_coeff"], config["entropy_coeff_schedule"] + ) + ValueNetworkMixin.__init__(self, config) + KLCoeffMixin.__init__(self, config) + + GradStatsMixin.__init__(self) + + # Note: this is a bit ugly, but loss and optimizer initialization must + # happen after all the MixIns are initialized. 
+ self.maybe_initialize_optimizer_and_loss() + + # Initiate TargetNetwork ops after loss initialization. + TargetNetworkMixin.__init__(self) + + @override(base) + def make_model(self) -> ModelV2: + return make_appo_models(self) + + @override(base) + def loss( + self, + model: Union[ModelV2, "tf.keras.Model"], + dist_class: Type[TFActionDistribution], + train_batch: SampleBatch, + ) -> Union[TensorType, List[TensorType]]: + model_out, _ = model(train_batch) + action_dist = dist_class(model_out, model) + + if isinstance(self.action_space, gym.spaces.Discrete): + is_multidiscrete = False + output_hidden_shape = [self.action_space.n] + elif isinstance(self.action_space, gym.spaces.multi_discrete.MultiDiscrete): + is_multidiscrete = True + output_hidden_shape = self.action_space.nvec.astype(np.int32) + else: + is_multidiscrete = False + output_hidden_shape = 1 + + def make_time_major(*args, **kw): + return _make_time_major( + self, train_batch.get(SampleBatch.SEQ_LENS), *args, **kw + ) + + actions = train_batch[SampleBatch.ACTIONS] + dones = train_batch[SampleBatch.TERMINATEDS] + rewards = train_batch[SampleBatch.REWARDS] + behaviour_logits = train_batch[SampleBatch.ACTION_DIST_INPUTS] + + target_model_out, _ = self.target_model(train_batch) + prev_action_dist = dist_class(behaviour_logits, self.model) + values = self.model.value_function() + values_time_major = make_time_major(values) + bootstrap_values_time_major = make_time_major( + train_batch[SampleBatch.VALUES_BOOTSTRAPPED] + ) + bootstrap_value = bootstrap_values_time_major[-1] + + if self.is_recurrent(): + max_seq_len = tf.reduce_max(train_batch[SampleBatch.SEQ_LENS]) + mask = tf.sequence_mask(train_batch[SampleBatch.SEQ_LENS], max_seq_len) + mask = tf.reshape(mask, [-1]) + mask = make_time_major(mask) + + def reduce_mean_valid(t): + return tf.reduce_mean(tf.boolean_mask(t, mask)) + + else: + reduce_mean_valid = tf.reduce_mean + + if self.config["vtrace"]: + logger.debug("Using V-Trace surrogate loss 
(vtrace=True)") + + # Prepare actions for loss. + loss_actions = ( + actions if is_multidiscrete else tf.expand_dims(actions, axis=1) + ) + + old_policy_behaviour_logits = tf.stop_gradient(target_model_out) + old_policy_action_dist = dist_class(old_policy_behaviour_logits, model) + + # Prepare KL for Loss + mean_kl = make_time_major(old_policy_action_dist.multi_kl(action_dist)) + + unpacked_behaviour_logits = tf.split( + behaviour_logits, output_hidden_shape, axis=1 + ) + unpacked_old_policy_behaviour_logits = tf.split( + old_policy_behaviour_logits, output_hidden_shape, axis=1 + ) + + # Compute vtrace on the CPU for better perf. + with tf.device("/cpu:0"): + vtrace_returns = vtrace.multi_from_logits( + behaviour_policy_logits=make_time_major( + unpacked_behaviour_logits + ), + target_policy_logits=make_time_major( + unpacked_old_policy_behaviour_logits + ), + actions=tf.unstack(make_time_major(loss_actions), axis=2), + discounts=tf.cast( + ~make_time_major(tf.cast(dones, tf.bool)), + tf.float32, + ) + * self.config["gamma"], + rewards=make_time_major(rewards), + values=values_time_major, + bootstrap_value=bootstrap_value, + dist_class=Categorical if is_multidiscrete else dist_class, + model=model, + clip_rho_threshold=tf.cast( + self.config["vtrace_clip_rho_threshold"], tf.float32 + ), + clip_pg_rho_threshold=tf.cast( + self.config["vtrace_clip_pg_rho_threshold"], tf.float32 + ), + ) + + actions_logp = make_time_major(action_dist.logp(actions)) + prev_actions_logp = make_time_major(prev_action_dist.logp(actions)) + old_policy_actions_logp = make_time_major( + old_policy_action_dist.logp(actions) + ) + + is_ratio = tf.clip_by_value( + tf.math.exp(prev_actions_logp - old_policy_actions_logp), 0.0, 2.0 + ) + logp_ratio = is_ratio * tf.exp(actions_logp - prev_actions_logp) + self._is_ratio = is_ratio + + advantages = vtrace_returns.pg_advantages + surrogate_loss = tf.minimum( + advantages * logp_ratio, + advantages + * tf.clip_by_value( + logp_ratio, + 1 - 
self.config["clip_param"], + 1 + self.config["clip_param"], + ), + ) + + action_kl = ( + tf.reduce_mean(mean_kl, axis=0) if is_multidiscrete else mean_kl + ) + mean_kl_loss = reduce_mean_valid(action_kl) + mean_policy_loss = -reduce_mean_valid(surrogate_loss) + + # The value function loss. + value_targets = vtrace_returns.vs + delta = values_time_major - value_targets + mean_vf_loss = 0.5 * reduce_mean_valid(tf.math.square(delta)) + + # The entropy loss. + actions_entropy = make_time_major(action_dist.multi_entropy()) + mean_entropy = reduce_mean_valid(actions_entropy) + + else: + logger.debug("Using PPO surrogate loss (vtrace=False)") + + # Prepare KL for Loss + mean_kl = make_time_major(prev_action_dist.multi_kl(action_dist)) + + logp_ratio = tf.math.exp( + make_time_major(action_dist.logp(actions)) + - make_time_major(prev_action_dist.logp(actions)) + ) + + advantages = make_time_major(train_batch[Postprocessing.ADVANTAGES]) + surrogate_loss = tf.minimum( + advantages * logp_ratio, + advantages + * tf.clip_by_value( + logp_ratio, + 1 - self.config["clip_param"], + 1 + self.config["clip_param"], + ), + ) + + action_kl = ( + tf.reduce_mean(mean_kl, axis=0) if is_multidiscrete else mean_kl + ) + mean_kl_loss = reduce_mean_valid(action_kl) + mean_policy_loss = -reduce_mean_valid(surrogate_loss) + + # The value function loss. + value_targets = make_time_major( + train_batch[Postprocessing.VALUE_TARGETS] + ) + delta = values_time_major - value_targets + mean_vf_loss = 0.5 * reduce_mean_valid(tf.math.square(delta)) + + # The entropy loss. + mean_entropy = reduce_mean_valid( + make_time_major(action_dist.multi_entropy()) + ) + + # The summed weighted loss. + total_loss = mean_policy_loss - mean_entropy * self.entropy_coeff + # Optional KL loss. + if self.config["use_kl_loss"]: + total_loss += self.kl_coeff * mean_kl_loss + # Optional vf loss (or in a separate term due to separate + # optimizers/networks). 
+ loss_wo_vf = total_loss + if not self.config["_separate_vf_optimizer"]: + total_loss += mean_vf_loss * self.config["vf_loss_coeff"] + + # Store stats in policy for stats_fn. + self._total_loss = total_loss + self._loss_wo_vf = loss_wo_vf + self._mean_policy_loss = mean_policy_loss + # Backward compatibility: Deprecate policy._mean_kl. + self._mean_kl_loss = self._mean_kl = mean_kl_loss + self._mean_vf_loss = mean_vf_loss + self._mean_entropy = mean_entropy + self._value_targets = value_targets + + # Return one total loss or two losses: vf vs rest (policy + kl). + if self.config["_separate_vf_optimizer"]: + return loss_wo_vf, mean_vf_loss + else: + return total_loss + + @override(base) + def stats_fn(self, train_batch: SampleBatch) -> Dict[str, TensorType]: + values_batched = _make_time_major( + self, + train_batch.get(SampleBatch.SEQ_LENS), + self.model.value_function(), + ) + + stats_dict = { + "cur_lr": tf.cast(self.cur_lr, tf.float64), + "total_loss": self._total_loss, + "policy_loss": self._mean_policy_loss, + "entropy": self._mean_entropy, + "var_gnorm": tf.linalg.global_norm(self.model.trainable_variables()), + "vf_loss": self._mean_vf_loss, + "vf_explained_var": explained_variance( + tf.reshape(self._value_targets, [-1]), + tf.reshape(values_batched, [-1]), + ), + "entropy_coeff": tf.cast(self.entropy_coeff, tf.float64), + } + + if self.config["vtrace"]: + is_stat_mean, is_stat_var = tf.nn.moments(self._is_ratio, [0, 1]) + stats_dict["mean_IS"] = is_stat_mean + stats_dict["var_IS"] = is_stat_var + + if self.config["use_kl_loss"]: + stats_dict["kl"] = self._mean_kl_loss + stats_dict["KL_Coeff"] = self.kl_coeff + + return stats_dict + + @override(base) + def postprocess_trajectory( + self, + sample_batch: SampleBatch, + other_agent_batches: Optional[SampleBatch] = None, + episode=None, + ): + # Call super's postprocess_trajectory first. 
+ # sample_batch = super().postprocess_trajectory( + # sample_batch, other_agent_batches, episode + # ) + + if not self.config["vtrace"]: + sample_batch = compute_gae_for_sample_batch( + self, sample_batch, other_agent_batches, episode + ) + else: + # Add the Columns.VALUES_BOOTSTRAPPED column, which we'll need + # inside the loss for vtrace calculations. + sample_batch = compute_bootstrap_value(sample_batch, self) + + return sample_batch + + @override(base) + def get_batch_divisibility_req(self) -> int: + return self.config["rollout_fragment_length"] + + APPOTFPolicy.__name__ = name + APPOTFPolicy.__qualname__ = name + + return APPOTFPolicy + + +APPOTF1Policy = get_appo_tf_policy("APPOTF1Policy", DynamicTFPolicyV2) +APPOTF2Policy = get_appo_tf_policy("APPOTF2Policy", EagerTFPolicyV2) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/appo_torch_policy.py b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/appo_torch_policy.py new file mode 100644 index 0000000000000000000000000000000000000000..1d28138c8c25d066fd3f701fae8c5493c2cc55d9 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/appo_torch_policy.py @@ -0,0 +1,412 @@ +""" +PyTorch policy class used for APPO. + +Adapted from VTraceTFPolicy to use the PPO surrogate loss. +Keep in sync with changes to VTraceTFPolicy. 
+""" + +import gymnasium as gym +import numpy as np +import logging +from typing import Any, Dict, List, Optional, Type, Union + +import ray +from ray.rllib.algorithms.appo.utils import make_appo_models +import ray.rllib.algorithms.impala.vtrace_torch as vtrace +from ray.rllib.algorithms.impala.impala_torch_policy import ( + make_time_major, + VTraceOptimizer, +) +from ray.rllib.evaluation.postprocessing import ( + compute_bootstrap_value, + compute_gae_for_sample_batch, + Postprocessing, +) +from ray.rllib.models.action_dist import ActionDistribution +from ray.rllib.models.modelv2 import ModelV2 +from ray.rllib.models.torch.torch_action_dist import ( + TorchDistributionWrapper, + TorchCategorical, +) +from ray.rllib.models.torch.torch_modelv2 import TorchModelV2 +from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.policy.torch_mixins import ( + EntropyCoeffSchedule, + LearningRateSchedule, + KLCoeffMixin, + ValueNetworkMixin, + TargetNetworkMixin, +) +from ray.rllib.policy.torch_policy_v2 import TorchPolicyV2 +from ray.rllib.utils.annotations import override +from ray.rllib.utils.framework import try_import_torch +from ray.rllib.utils.numpy import convert_to_numpy +from ray.rllib.utils.torch_utils import ( + apply_grad_clipping, + explained_variance, + global_norm, + sequence_mask, +) +from ray.rllib.utils.typing import TensorType + +torch, nn = try_import_torch() + +logger = logging.getLogger(__name__) + + +# TODO (sven): Deprecate once APPO and IMPALA fully on RLModules/Learner APIs. 
+class APPOTorchPolicy( + VTraceOptimizer, + LearningRateSchedule, + EntropyCoeffSchedule, + KLCoeffMixin, + ValueNetworkMixin, + TargetNetworkMixin, + TorchPolicyV2, +): + """PyTorch policy class used with APPO.""" + + def __init__(self, observation_space, action_space, config): + config = dict(ray.rllib.algorithms.appo.appo.APPOConfig().to_dict(), **config) + config["enable_rl_module_and_learner"] = False + config["enable_env_runner_and_connector_v2"] = False + + # Although this is a no-op, we call __init__ here to make it clear + # that base.__init__ will use the make_model() call. + VTraceOptimizer.__init__(self) + + lr_schedule_additional_args = [] + if config.get("_separate_vf_optimizer"): + lr_schedule_additional_args = ( + [config["_lr_vf"][0][1], config["_lr_vf"]] + if isinstance(config["_lr_vf"], (list, tuple)) + else [config["_lr_vf"], None] + ) + LearningRateSchedule.__init__( + self, config["lr"], config["lr_schedule"], *lr_schedule_additional_args + ) + + TorchPolicyV2.__init__( + self, + observation_space, + action_space, + config, + max_seq_len=config["model"]["max_seq_len"], + ) + + EntropyCoeffSchedule.__init__( + self, config["entropy_coeff"], config["entropy_coeff_schedule"] + ) + ValueNetworkMixin.__init__(self, config) + KLCoeffMixin.__init__(self, config) + + self._initialize_loss_from_dummy_batch() + + # Initiate TargetNetwork ops after loss initialization. + TargetNetworkMixin.__init__(self) + + @override(TorchPolicyV2) + def init_view_requirements(self): + self.view_requirements = self._get_default_view_requirements() + + @override(TorchPolicyV2) + def make_model(self) -> ModelV2: + return make_appo_models(self) + + @override(TorchPolicyV2) + def loss( + self, + model: ModelV2, + dist_class: Type[ActionDistribution], + train_batch: SampleBatch, + ) -> Union[TensorType, List[TensorType]]: + """Constructs the loss for APPO. + + With IS modifications and V-trace for Advantage Estimation. 
+ + Args: + model (ModelV2): The Model to calculate the loss for. + dist_class (Type[ActionDistribution]): The action distr. class. + train_batch: The training data. + + Returns: + Union[TensorType, List[TensorType]]: A single loss tensor or a list + of loss tensors. + """ + target_model = self.target_models[model] + + model_out, _ = model(train_batch) + action_dist = dist_class(model_out, model) + + if isinstance(self.action_space, gym.spaces.Discrete): + is_multidiscrete = False + output_hidden_shape = [self.action_space.n] + elif isinstance(self.action_space, gym.spaces.multi_discrete.MultiDiscrete): + is_multidiscrete = True + output_hidden_shape = self.action_space.nvec.astype(np.int32) + else: + is_multidiscrete = False + output_hidden_shape = 1 + + def _make_time_major(*args, **kwargs): + return make_time_major( + self, train_batch.get(SampleBatch.SEQ_LENS), *args, **kwargs + ) + + actions = train_batch[SampleBatch.ACTIONS] + dones = train_batch[SampleBatch.TERMINATEDS] + rewards = train_batch[SampleBatch.REWARDS] + behaviour_logits = train_batch[SampleBatch.ACTION_DIST_INPUTS] + + target_model_out, _ = target_model(train_batch) + + prev_action_dist = dist_class(behaviour_logits, model) + values = model.value_function() + values_time_major = _make_time_major(values) + bootstrap_values_time_major = _make_time_major( + train_batch[SampleBatch.VALUES_BOOTSTRAPPED] + ) + bootstrap_value = bootstrap_values_time_major[-1] + + if self.is_recurrent(): + max_seq_len = torch.max(train_batch[SampleBatch.SEQ_LENS]) + mask = sequence_mask(train_batch[SampleBatch.SEQ_LENS], max_seq_len) + mask = torch.reshape(mask, [-1]) + mask = _make_time_major(mask) + num_valid = torch.sum(mask) + + def reduce_mean_valid(t): + return torch.sum(t[mask]) / num_valid + + else: + reduce_mean_valid = torch.mean + + if self.config["vtrace"]: + logger.debug("Using V-Trace surrogate loss (vtrace=True)") + + old_policy_behaviour_logits = target_model_out.detach() + old_policy_action_dist = 
dist_class(old_policy_behaviour_logits, model) + + if isinstance(output_hidden_shape, (list, tuple, np.ndarray)): + unpacked_behaviour_logits = torch.split( + behaviour_logits, list(output_hidden_shape), dim=1 + ) + unpacked_old_policy_behaviour_logits = torch.split( + old_policy_behaviour_logits, list(output_hidden_shape), dim=1 + ) + else: + unpacked_behaviour_logits = torch.chunk( + behaviour_logits, output_hidden_shape, dim=1 + ) + unpacked_old_policy_behaviour_logits = torch.chunk( + old_policy_behaviour_logits, output_hidden_shape, dim=1 + ) + + # Prepare actions for loss. + loss_actions = ( + actions if is_multidiscrete else torch.unsqueeze(actions, dim=1) + ) + + # Prepare KL for loss. + action_kl = _make_time_major(old_policy_action_dist.kl(action_dist)) + + # Compute vtrace on the CPU for better perf. + vtrace_returns = vtrace.multi_from_logits( + behaviour_policy_logits=_make_time_major(unpacked_behaviour_logits), + target_policy_logits=_make_time_major( + unpacked_old_policy_behaviour_logits + ), + actions=torch.unbind(_make_time_major(loss_actions), dim=2), + discounts=(1.0 - _make_time_major(dones).float()) + * self.config["gamma"], + rewards=_make_time_major(rewards), + values=values_time_major, + bootstrap_value=bootstrap_value, + dist_class=TorchCategorical if is_multidiscrete else dist_class, + model=model, + clip_rho_threshold=self.config["vtrace_clip_rho_threshold"], + clip_pg_rho_threshold=self.config["vtrace_clip_pg_rho_threshold"], + ) + + actions_logp = _make_time_major(action_dist.logp(actions)) + prev_actions_logp = _make_time_major(prev_action_dist.logp(actions)) + old_policy_actions_logp = _make_time_major( + old_policy_action_dist.logp(actions) + ) + is_ratio = torch.clamp( + torch.exp(prev_actions_logp - old_policy_actions_logp), 0.0, 2.0 + ) + logp_ratio = is_ratio * torch.exp(actions_logp - prev_actions_logp) + self._is_ratio = is_ratio + + advantages = vtrace_returns.pg_advantages.to(logp_ratio.device) + surrogate_loss = torch.min( 
+ advantages * logp_ratio, + advantages + * torch.clamp( + logp_ratio, + 1 - self.config["clip_param"], + 1 + self.config["clip_param"], + ), + ) + + mean_kl_loss = reduce_mean_valid(action_kl) + mean_policy_loss = -reduce_mean_valid(surrogate_loss) + + # The value function loss. + value_targets = vtrace_returns.vs.to(values_time_major.device) + delta = values_time_major - value_targets + mean_vf_loss = 0.5 * reduce_mean_valid(torch.pow(delta, 2.0)) + + # The entropy loss. + mean_entropy = reduce_mean_valid(_make_time_major(action_dist.entropy())) + + else: + logger.debug("Using PPO surrogate loss (vtrace=False)") + + # Prepare KL for Loss + action_kl = _make_time_major(prev_action_dist.kl(action_dist)) + + actions_logp = _make_time_major(action_dist.logp(actions)) + prev_actions_logp = _make_time_major(prev_action_dist.logp(actions)) + logp_ratio = torch.exp(actions_logp - prev_actions_logp) + + advantages = _make_time_major(train_batch[Postprocessing.ADVANTAGES]) + surrogate_loss = torch.min( + advantages * logp_ratio, + advantages + * torch.clamp( + logp_ratio, + 1 - self.config["clip_param"], + 1 + self.config["clip_param"], + ), + ) + + mean_kl_loss = reduce_mean_valid(action_kl) + mean_policy_loss = -reduce_mean_valid(surrogate_loss) + + # The value function loss. + value_targets = _make_time_major(train_batch[Postprocessing.VALUE_TARGETS]) + delta = values_time_major - value_targets + mean_vf_loss = 0.5 * reduce_mean_valid(torch.pow(delta, 2.0)) + + # The entropy loss. + mean_entropy = reduce_mean_valid(_make_time_major(action_dist.entropy())) + + # The summed weighted loss. + total_loss = mean_policy_loss - mean_entropy * self.entropy_coeff + # Optional additional KL Loss + if self.config["use_kl_loss"]: + total_loss += self.kl_coeff * mean_kl_loss + + # Optional vf loss (or in a separate term due to separate + # optimizers/networks). 
+ loss_wo_vf = total_loss + if not self.config["_separate_vf_optimizer"]: + total_loss += mean_vf_loss * self.config["vf_loss_coeff"] + + # Store values for stats function in model (tower), such that for + # multi-GPU, we do not override them during the parallel loss phase. + model.tower_stats["total_loss"] = total_loss + model.tower_stats["mean_policy_loss"] = mean_policy_loss + model.tower_stats["mean_kl_loss"] = mean_kl_loss + model.tower_stats["mean_vf_loss"] = mean_vf_loss + model.tower_stats["mean_entropy"] = mean_entropy + model.tower_stats["value_targets"] = value_targets + model.tower_stats["vf_explained_var"] = explained_variance( + torch.reshape(value_targets, [-1]), + torch.reshape(values_time_major, [-1]), + ) + + # Return one total loss or two losses: vf vs rest (policy + kl). + if self.config["_separate_vf_optimizer"]: + return loss_wo_vf, mean_vf_loss + else: + return total_loss + + @override(TorchPolicyV2) + def stats_fn(self, train_batch: SampleBatch) -> Dict[str, TensorType]: + """Stats function for APPO. Returns a dict with important loss stats. + + Args: + policy: The Policy to generate stats for. + train_batch: The SampleBatch (already) used for training. + + Returns: + Dict[str, TensorType]: The stats dict. 
+ """ + stats_dict = { + "cur_lr": self.cur_lr, + "total_loss": torch.mean(torch.stack(self.get_tower_stats("total_loss"))), + "policy_loss": torch.mean( + torch.stack(self.get_tower_stats("mean_policy_loss")) + ), + "entropy": torch.mean(torch.stack(self.get_tower_stats("mean_entropy"))), + "entropy_coeff": self.entropy_coeff, + "var_gnorm": global_norm(self.model.trainable_variables()), + "vf_loss": torch.mean(torch.stack(self.get_tower_stats("mean_vf_loss"))), + "vf_explained_var": torch.mean( + torch.stack(self.get_tower_stats("vf_explained_var")) + ), + } + + if self.config["vtrace"]: + is_stat_mean = torch.mean(self._is_ratio, [0, 1]) + is_stat_var = torch.var(self._is_ratio, [0, 1]) + stats_dict["mean_IS"] = is_stat_mean + stats_dict["var_IS"] = is_stat_var + + if self.config["use_kl_loss"]: + stats_dict["kl"] = torch.mean( + torch.stack(self.get_tower_stats("mean_kl_loss")) + ) + stats_dict["KL_Coeff"] = self.kl_coeff + + return convert_to_numpy(stats_dict) + + @override(TorchPolicyV2) + def extra_action_out( + self, + input_dict: Dict[str, TensorType], + state_batches: List[TensorType], + model: TorchModelV2, + action_dist: TorchDistributionWrapper, + ) -> Dict[str, TensorType]: + return {SampleBatch.VF_PREDS: model.value_function()} + + @override(TorchPolicyV2) + def postprocess_trajectory( + self, + sample_batch: SampleBatch, + other_agent_batches: Optional[Dict[Any, SampleBatch]] = None, + episode=None, + ): + # Call super's postprocess_trajectory first. + # sample_batch = super().postprocess_trajectory( + # sample_batch, other_agent_batches, episode + # ) + + # Do all post-processing always with no_grad(). + # Not using this here will introduce a memory leak + # in torch (issue #6962). 
+ with torch.no_grad(): + if not self.config["vtrace"]: + sample_batch = compute_gae_for_sample_batch( + self, sample_batch, other_agent_batches, episode + ) + else: + # Add the SampleBatch.VALUES_BOOTSTRAPPED column, which we'll need + # inside the loss for vtrace calculations. + sample_batch = compute_bootstrap_value(sample_batch, self) + + return sample_batch + + @override(TorchPolicyV2) + def extra_grad_process( + self, optimizer: "torch.optim.Optimizer", loss: TensorType + ) -> Dict[str, TensorType]: + return apply_grad_clipping(self, optimizer, loss) + + @override(TorchPolicyV2) + def get_batch_divisibility_req(self) -> int: + return self.config["rollout_fragment_length"] diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/default_appo_rl_module.py b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/default_appo_rl_module.py new file mode 100644 index 0000000000000000000000000000000000000000..ec9bfa5ab0a36b5908826953b2d6010401a0541a --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/default_appo_rl_module.py @@ -0,0 +1,59 @@ +import abc +from typing import Any, Dict, List, Tuple + +from ray.rllib.algorithms.ppo.default_ppo_rl_module import DefaultPPORLModule +from ray.rllib.core.learner.utils import make_target_network +from ray.rllib.core.models.base import ACTOR +from ray.rllib.core.models.tf.encoder import ENCODER_OUT +from ray.rllib.core.rl_module.apis import ( + TARGET_NETWORK_ACTION_DIST_INPUTS, + TargetNetworkAPI, +) +from ray.rllib.utils.typing import NetworkType + +from ray.rllib.utils.annotations import ( + override, + OverrideToImplementCustomLogic_CallToSuperRecommended, +) +from ray.util.annotations import DeveloperAPI + + +@DeveloperAPI +class DefaultAPPORLModule(DefaultPPORLModule, TargetNetworkAPI, abc.ABC): + """Default RLModule used by APPO, if user does not specify a custom RLModule. 
+ + Users who want to train their RLModules with APPO may implement any RLModule + (or TorchRLModule) subclass as long as the custom class also implements the + `ValueFunctionAPI` (see ray.rllib.core.rl_module.apis.value_function_api.py) + and the `TargetNetworkAPI` (see + ray.rllib.core.rl_module.apis.target_network_api.py). + """ + + @override(TargetNetworkAPI) + def make_target_networks(self): + self._old_encoder = make_target_network(self.encoder) + self._old_pi = make_target_network(self.pi) + + @override(TargetNetworkAPI) + def get_target_network_pairs(self) -> List[Tuple[NetworkType, NetworkType]]: + return [ + (self.encoder, self._old_encoder), + (self.pi, self._old_pi), + ] + + @override(TargetNetworkAPI) + def forward_target(self, batch: Dict[str, Any]) -> Dict[str, Any]: + old_pi_inputs_encoded = self._old_encoder(batch)[ENCODER_OUT][ACTOR] + old_action_dist_logits = self._old_pi(old_pi_inputs_encoded) + return {TARGET_NETWORK_ACTION_DIST_INPUTS: old_action_dist_logits} + + @OverrideToImplementCustomLogic_CallToSuperRecommended + @override(DefaultPPORLModule) + def get_non_inference_attributes(self) -> List[str]: + # Get the NON inference-only attributes from the parent class + # `PPOTorchRLModule`. + ret = super().get_non_inference_attributes() + # Add the two (APPO) target networks to it (NOT needed in + # inference-only mode). 
+ ret += ["_old_encoder", "_old_pi"] + return ret diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/torch/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/torch/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/torch/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/torch/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a1e025a80609d8bff4fed2839677ae85e39b88f2 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/torch/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/torch/__pycache__/appo_torch_learner.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/torch/__pycache__/appo_torch_learner.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2cc24c6d9c3841fd7cc2be830470e4be2f45f806 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/torch/__pycache__/appo_torch_learner.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/torch/__pycache__/appo_torch_rl_module.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/torch/__pycache__/appo_torch_rl_module.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0f096b901f806cae8f32dac1c89ffb4bb36a9ea2 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/torch/__pycache__/appo_torch_rl_module.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/torch/__pycache__/default_appo_torch_rl_module.cpython-311.pyc 
b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/torch/__pycache__/default_appo_torch_rl_module.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..50bd57b116861cafb917e2b583ee5eeed1f69d54 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/torch/__pycache__/default_appo_torch_rl_module.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/torch/appo_torch_learner.py b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/torch/appo_torch_learner.py new file mode 100644 index 0000000000000000000000000000000000000000..62a4198952ecd3ee08947f57b2f5d18d9bd20232 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/torch/appo_torch_learner.py @@ -0,0 +1,234 @@ +"""Asynchronous Proximal Policy Optimization (APPO) + +The algorithm is described in [1] (under the name of "IMPACT"): + +Detailed documentation: +https://docs.ray.io/en/master/rllib-algorithms.html#appo + +[1] IMPACT: Importance Weighted Asynchronous Architectures with Clipped Target Networks. +Luo et al. 
2020 +https://arxiv.org/pdf/1912.00167 +""" +from typing import Dict + +from ray.rllib.algorithms.appo.appo import ( + APPOConfig, + LEARNER_RESULTS_CURR_KL_COEFF_KEY, + LEARNER_RESULTS_KL_KEY, +) +from ray.rllib.algorithms.appo.appo_learner import APPOLearner +from ray.rllib.algorithms.impala.torch.impala_torch_learner import IMPALATorchLearner +from ray.rllib.algorithms.impala.torch.vtrace_torch_v2 import ( + make_time_major, + vtrace_torch, +) +from ray.rllib.core.columns import Columns +from ray.rllib.core.learner.learner import POLICY_LOSS_KEY, VF_LOSS_KEY, ENTROPY_KEY +from ray.rllib.core.rl_module.apis import ( + TARGET_NETWORK_ACTION_DIST_INPUTS, + TargetNetworkAPI, + ValueFunctionAPI, +) +from ray.rllib.utils.annotations import override +from ray.rllib.utils.framework import try_import_torch +from ray.rllib.utils.numpy import convert_to_numpy +from ray.rllib.utils.typing import ModuleID, TensorType + +torch, nn = try_import_torch() + + +class APPOTorchLearner(APPOLearner, IMPALATorchLearner): + """Implements APPO loss / update logic on top of IMPALATorchLearner.""" + + @override(IMPALATorchLearner) + def compute_loss_for_module( + self, + *, + module_id: ModuleID, + config: APPOConfig, + batch: Dict, + fwd_out: Dict[str, TensorType], + ) -> TensorType: + module = self.module[module_id].unwrapped() + assert isinstance(module, TargetNetworkAPI) + assert isinstance(module, ValueFunctionAPI) + + # TODO (sven): Now that we do the +1ts trick to be less vulnerable about + # bootstrap values at the end of rollouts in the new stack, we might make + # this a more flexible, configurable parameter for users, e.g. + # `v_trace_seq_len` (independent of `rollout_fragment_length`). Separation + # of concerns (sampling vs learning). 
+ rollout_frag_or_episode_len = config.get_rollout_fragment_length() + recurrent_seq_len = batch.get("seq_lens") + + loss_mask = batch[Columns.LOSS_MASK].float() + loss_mask_time_major = make_time_major( + loss_mask, + trajectory_len=rollout_frag_or_episode_len, + recurrent_seq_len=recurrent_seq_len, + ) + size_loss_mask = torch.sum(loss_mask) + + values = module.compute_values( + batch, embeddings=fwd_out.get(Columns.EMBEDDINGS) + ) + + action_dist_cls_train = module.get_train_action_dist_cls() + target_policy_dist = action_dist_cls_train.from_logits( + fwd_out[Columns.ACTION_DIST_INPUTS] + ) + + old_target_policy_dist = action_dist_cls_train.from_logits( + module.forward_target(batch)[TARGET_NETWORK_ACTION_DIST_INPUTS] + ) + old_target_policy_actions_logp = old_target_policy_dist.logp( + batch[Columns.ACTIONS] + ) + behaviour_actions_logp = batch[Columns.ACTION_LOGP] + target_actions_logp = target_policy_dist.logp(batch[Columns.ACTIONS]) + + behaviour_actions_logp_time_major = make_time_major( + behaviour_actions_logp, + trajectory_len=rollout_frag_or_episode_len, + recurrent_seq_len=recurrent_seq_len, + ) + target_actions_logp_time_major = make_time_major( + target_actions_logp, + trajectory_len=rollout_frag_or_episode_len, + recurrent_seq_len=recurrent_seq_len, + ) + old_actions_logp_time_major = make_time_major( + old_target_policy_actions_logp, + trajectory_len=rollout_frag_or_episode_len, + recurrent_seq_len=recurrent_seq_len, + ) + rewards_time_major = make_time_major( + batch[Columns.REWARDS], + trajectory_len=rollout_frag_or_episode_len, + recurrent_seq_len=recurrent_seq_len, + ) + values_time_major = make_time_major( + values, + trajectory_len=rollout_frag_or_episode_len, + recurrent_seq_len=recurrent_seq_len, + ) + assert Columns.VALUES_BOOTSTRAPPED not in batch + # Use as bootstrap values the vf-preds in the next "batch row", except + # for the very last row (which doesn't have a next row), for which the + # bootstrap value does not matter b/c it has a 
+1ts value at its end + # anyways. So we chose an arbitrary item (for simplicity of not having to + # move new data to the device). + bootstrap_values = torch.cat( + [ + values_time_major[0][1:], # 0th ts values from "next row" + values_time_major[0][0:1], # <- can use any arbitrary value here + ], + dim=0, + ) + + # The discount factor that is used should be gamma except for timesteps where + # the episode is terminated. In that case, the discount factor should be 0. + discounts_time_major = ( + 1.0 + - make_time_major( + batch[Columns.TERMINATEDS], + trajectory_len=rollout_frag_or_episode_len, + recurrent_seq_len=recurrent_seq_len, + ).float() + ) * config.gamma + + # Note that vtrace will compute the main loop on the CPU for better performance. + vtrace_adjusted_target_values, pg_advantages = vtrace_torch( + target_action_log_probs=old_actions_logp_time_major, + behaviour_action_log_probs=behaviour_actions_logp_time_major, + discounts=discounts_time_major, + rewards=rewards_time_major, + values=values_time_major, + bootstrap_values=bootstrap_values, + clip_pg_rho_threshold=config.vtrace_clip_pg_rho_threshold, + clip_rho_threshold=config.vtrace_clip_rho_threshold, + ) + pg_advantages = pg_advantages * loss_mask_time_major + + # The policy gradients loss. + is_ratio = torch.clip( + torch.exp(behaviour_actions_logp_time_major - old_actions_logp_time_major), + 0.0, + 2.0, + ) + logp_ratio = is_ratio * torch.exp( + target_actions_logp_time_major - behaviour_actions_logp_time_major + ) + + surrogate_loss = torch.minimum( + pg_advantages * logp_ratio, + pg_advantages + * torch.clip(logp_ratio, 1 - config.clip_param, 1 + config.clip_param), + ) + + if config.use_kl_loss: + action_kl = old_target_policy_dist.kl(target_policy_dist) * loss_mask + mean_kl_loss = torch.sum(action_kl) / size_loss_mask + else: + mean_kl_loss = 0.0 + mean_pi_loss = -(torch.sum(surrogate_loss) / size_loss_mask) + + # The baseline loss. 
    @override(APPOLearner)
    def _update_module_kl_coeff(self, module_id: ModuleID, config: APPOConfig) -> None:
        """Adapts the KL coefficient of `module_id` based on the last measured KL.

        Standard PPO-style adaptive-KL rule: if the most recently logged mean KL
        divergence is far above the configured target, increase the coefficient;
        if it is far below, decrease it. The (possibly updated) coefficient is
        then logged under `LEARNER_RESULTS_CURR_KL_COEFF_KEY`.

        Args:
            module_id: The RLModule whose KL coefficient should be updated.
            config: The APPOConfig for this module (provides `kl_target`).
        """
        # Update the current KL value based on the recently measured value.
        # Increase.
        kl = convert_to_numpy(self.metrics.peek((module_id, LEARNER_RESULTS_KL_KEY)))
        # NOTE(review): `kl_coeff_var` looks like a torch tensor/parameter
        # (uses `.data` and `.item()`) — confirm against the learner's build().
        kl_coeff_var = self.curr_kl_coeffs_per_module[module_id]

        if kl > 2.0 * config.kl_target:
            # TODO (Kourosh) why not *2.0?
            kl_coeff_var.data *= 1.5
        # Decrease.
        elif kl < 0.5 * config.kl_target:
            kl_coeff_var.data *= 0.5

        # Log the raw (un-smoothed) new coefficient value (window=1).
        self.metrics.log_value(
            (module_id, LEARNER_RESULTS_CURR_KL_COEFF_KEY),
            kl_coeff_var.item(),
            window=1,
        )
class CircularBuffer:
    """A circular batch-wise buffer as described in [1] for APPO.

    The buffer holds at most N batches, which are sampled at random (uniformly).
    If full and a new batch is added, the oldest batch is discarded. Also, each
    batch currently in the buffer can be sampled at most K times (after which it
    is also discarded).
    """

    def __init__(self, num_batches: int, iterations_per_batch: int):
        """Initializes a CircularBuffer instance.

        Args:
            num_batches: Maximum number of batches held at any time (N in [1]).
            iterations_per_batch: Maximum number of times each batch may be
                sampled before expiring (K, the "replay coefficient", in [1]).
        """
        # N from the paper (buffer size).
        self.num_batches = num_batches
        # K ("replay coefficient") from the paper.
        self.iterations_per_batch = iterations_per_batch

        # Each entry is a mutable list `[batch, k]` where k counts how often the
        # batch has been sampled so far. Expired entries become [None, None].
        self._buffer = deque(maxlen=self.num_batches)
        self._lock = threading.Lock()

        # The number of valid (not expired) entries in this buffer.
        self._num_valid_batches = 0

    def add(self, batch):
        """Adds a batch to the buffer, evicting the oldest one if full.

        Args:
            batch: The batch to add. Must expose an `env_steps()` method.

        Returns:
            The number of env timesteps dropped because a still-valid (not yet
            fully replayed) batch had to be evicted; 0 otherwise.
        """
        dropped_entry = None
        dropped_ts = 0

        # Add buffer and k=0 information to the deque.
        with self._lock:
            len_ = len(self._buffer)
            if len_ == self.num_batches:
                # Deque is full -> the append below evicts the leftmost entry.
                dropped_entry = self._buffer[0]
            self._buffer.append([batch, 0])
            self._num_valid_batches += 1

        # A valid entry (w/ a batch whose k had not reached K yet) was dropped.
        if dropped_entry is not None and dropped_entry[0] is not None:
            # Count the env steps that will now never be replayed again.
            dropped_ts += dropped_entry[0].env_steps() * (
                self.iterations_per_batch - dropped_entry[1]
            )
            self._num_valid_batches -= 1

        return dropped_ts

    def sample(self):
        """Samples a batch uniformly at random from the valid entries.

        Blocks (with a short sleep) while the buffer is empty. Every successful
        sample increments the batch's use-count k; once k reaches K
        (`iterations_per_batch`), the entry is invalidated in place.

        Returns:
            The sampled batch.
        """
        k = entry = batch = None

        while True:
            # Only initially, the buffer may be empty -> Just wait for some time.
            if len(self) == 0:
                time.sleep(0.001)
                continue
            # Sample a random buffer index.
            with self._lock:
                entry = self._buffer[random.randint(0, len(self._buffer) - 1)]
                batch, k = entry
            # Ignore batches that have already been invalidated.
            if batch is not None:
                break

        # Increase k += 1 for this batch.
        # NOTE(review): this increment happens outside the lock; concurrent
        # samplers could race on k — confirm single-sampler usage is intended.
        assert k is not None
        entry[1] += 1

        # This batch has been exhausted (k == K) -> Invalidate it in the buffer.
        if k == self.iterations_per_batch - 1:
            entry[0] = None
            entry[1] = None
            # BUGFIX: the entry just expired, so the number of valid batches
            # must DEcrease. The previous `+= 1` made `len()` grow without
            # bound and mis-report buffer fullness.
            self._num_valid_batches -= 1

        # Return the sampled batch.
        return batch

    def __len__(self) -> int:
        """Returns the number of actually valid (non-expired) batches in the buffer."""
        return self._num_valid_batches
+ return policy.model diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..db5a0179a8a748d588ca27bc66cb55f50c8bd08d Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/__pycache__/dreamerv3.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/__pycache__/dreamerv3.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..68cbdbd64d1308fe10f1f063014075998fe895cc Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/__pycache__/dreamerv3.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/__pycache__/dreamerv3_catalog.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/__pycache__/dreamerv3_catalog.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..079c5c126cf71aa5a06aef003e98a2af6d040709 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/__pycache__/dreamerv3_catalog.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/__pycache__/dreamerv3_learner.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/__pycache__/dreamerv3_learner.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c2b8a5ad22b5e8bea5e2fab3e44b0eba9060e722 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/__pycache__/dreamerv3_learner.cpython-311.pyc differ diff --git 
a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/__pycache__/dreamerv3_rl_module.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/__pycache__/dreamerv3_rl_module.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5c91f1e4dc3279bf2b480d86ef4e8e633135ac8c Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/__pycache__/dreamerv3_rl_module.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ba9e0c8d54b2804686807d3be42c62133ca5ed8c Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/__pycache__/actor_network.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/__pycache__/actor_network.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e1c3e3e919e8aa655c6f3685454476b8ab299eb7 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/__pycache__/actor_network.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/__pycache__/critic_network.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/__pycache__/critic_network.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ee5884634e4d2c4c187a4f34e8aa9202926f487b Binary files /dev/null and 
b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/__pycache__/critic_network.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/__pycache__/disagree_networks.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/__pycache__/disagree_networks.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..811f8b0071bdf62dae8fd1410049032b58aff8b9 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/__pycache__/disagree_networks.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/__pycache__/dreamer_model.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/__pycache__/dreamer_model.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ea5effa280b73ce346c0aaeda43caea6db343d89 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/__pycache__/dreamer_model.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/__pycache__/world_model.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/__pycache__/world_model.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d113329fa719fb91d3ec22c1eb24f30471faa65a Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/__pycache__/world_model.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__init__.py new file mode 100644 index 
0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..537d0c493d2b07e4cb85c7dd7ee1aaccb96572e0 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/cnn_atari.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/cnn_atari.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..da4d5c20cc76a49429b6677344efa919129d7b6b Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/cnn_atari.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/continue_predictor.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/continue_predictor.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ab985a244f2f365bb4170bc72856ac8434f56608 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/continue_predictor.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/conv_transpose_atari.cpython-311.pyc 
b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/conv_transpose_atari.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f0defad84e5a502bb2381f5c6dd6e0717b293961 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/conv_transpose_atari.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/dynamics_predictor.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/dynamics_predictor.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4e66ed73fb3301f87d29a69b49d698022540983d Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/dynamics_predictor.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/mlp.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/mlp.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f9ab1b129c590bbc7bc9f421bc1373d3c42f09c6 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/mlp.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/representation_layer.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/representation_layer.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..906d5cf8a0f720a330b484f1c4a2e573c973c35f Binary files /dev/null and 
b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/representation_layer.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/reward_predictor.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/reward_predictor.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..acdc5132fd7f65279b1d1611207eb25b8a10cf74 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/reward_predictor.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/reward_predictor_layer.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/reward_predictor_layer.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4c3c4e2905b0e2997b857685b25d1e6549034e33 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/reward_predictor_layer.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/sequence_model.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/sequence_model.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d60c5c39f813bf286717c64c48876b04a43b23e2 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/sequence_model.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/vector_decoder.cpython-311.pyc 
"""
[1] Mastering Diverse Domains through World Models - 2023
D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap
https://arxiv.org/pdf/2301.04104v1.pdf
"""
from ray.rllib.algorithms.dreamerv3.tf.models.components.mlp import MLP
from ray.rllib.algorithms.dreamerv3.utils import (
    get_gru_units,
    get_num_z_classes,
    get_num_z_categoricals,
)
from ray.rllib.utils.framework import try_import_tf, try_import_tfp

_, tf, _ = try_import_tf()
tfp = try_import_tfp()


class ContinuePredictor(tf.keras.Model):
    """The world-model network sub-component used to predict the `continue` flags.

    Predicted continue flags are used to produce "dream data" to learn the policy in.

    The continue flags are predicted via a linear output used to parameterize a
    Bernoulli distribution, from which simply the mode is used (no stochastic
    sampling!). In other words, if the sigmoid of the output of the linear layer is
    >0.5, we predict a continuation of the episode, otherwise we predict an episode
    terminal.
    """

    def __init__(self, *, model_size: str = "XS"):
        """Initializes a ContinuePredictor instance.

        Args:
            model_size: The "Model Size" used according to [1] Appendix B.
                Determines the exact size of the underlying MLP.
        """
        super().__init__(name="continue_predictor")
        self.model_size = model_size
        # Single-output MLP producing the Bernoulli logit.
        self.mlp = MLP(model_size=model_size, output_layer_size=1)

        # Trace self.call with a fixed input signature:
        # h: [B, gru_units], z: [B, num_categoricals, num_classes].
        dl_type = tf.keras.mixed_precision.global_policy().compute_dtype or tf.float32
        self.call = tf.function(
            input_signature=[
                tf.TensorSpec(shape=[None, get_gru_units(model_size)], dtype=dl_type),
                tf.TensorSpec(
                    shape=[
                        None,
                        get_num_z_categoricals(model_size),
                        get_num_z_classes(model_size),
                    ],
                    dtype=dl_type,
                ),
            ]
        )(self.call)

    def call(self, h, z):
        """Performs a forward pass through the continue predictor.

        Args:
            h: The deterministic hidden state of the sequence model. [B, dim(h)].
            z: The stochastic discrete representations of the original
                observation input. [B, num_categoricals, num_classes].

        Returns:
            Tuple of the distribution's mode (1.0 -> continue, 0.0 -> terminal;
            float32, shape [B]) and the Bernoulli distribution object itself.
        """
        # Flatten last two dims of z.
        assert len(z.shape) == 3
        z_shape = tf.shape(z)
        z = tf.reshape(z, shape=(z_shape[0], -1))
        assert len(z.shape) == 2
        out = tf.concat([h, z], axis=-1)
        # Pin the static (known at trace time) feature dimension of h-cat-z.
        out.set_shape(
            [
                None,
                (
                    get_num_z_categoricals(self.model_size)
                    * get_num_z_classes(self.model_size)
                    + get_gru_units(self.model_size)
                ),
            ]
        )
        # Send h-cat-z through MLP.
        out = self.mlp(out)
        # Remove the extra [B, 1] dimension at the end to get a proper Bernoulli
        # distribution. Otherwise, tfp will think that the batch dims are [B, 1]
        # where they should be just [B].
        logits = tf.cast(tf.squeeze(out, axis=-1), tf.float32)
        # Create the Bernoulli distribution object.
        bernoulli = tfp.distributions.Bernoulli(logits=logits, dtype=tf.float32)

        # Take the mode (greedy, deterministic "sample").
        continue_ = bernoulli.mode()

        # Return BOTH the greedy continue prediction AND the distribution object.
        return continue_, bernoulli
class DynamicsPredictor(tf.keras.Model):
    """The dynamics (or "prior") network described in [1], producing prior z-states.

    The dynamics net is used to:
    - compute the initial z-state (from the tanh'd initial h-state variable) at the
    beginning of a sequence.
    - compute prior-z-states during dream data generation. Note that during dreaming,
    no actual observations are available and thus no posterior z-states can be
    computed.
    """

    def __init__(
        self,
        *,
        model_size: Optional[str] = "XS",
        num_categoricals: Optional[int] = None,
        num_classes_per_categorical: Optional[int] = None,
    ):
        """Initializes a DynamicsPredictor instance.

        Args:
            model_size: The "Model Size" used according to [1] Appendix B.
                Use None for manually setting the different parameters.
            num_categoricals: Overrides the number of categoricals used in the
                z-states. In [1], 32 is used for any model size.
            num_classes_per_categorical: Overrides the number of classes within each
                categorical used for the z-states. In [1], 32 is used for any model
                dimension.
        """
        super().__init__(name="dynamics_predictor")

        self.mlp = MLP(
            # In author's original code, the Dynamics Net only has a single layer, no
            # matter the model size.
            num_dense_layers=1,
            model_size=model_size,
            output_layer_size=None,
        )
        # The (prior) z-state generating layer.
        self.representation_layer = RepresentationLayer(
            model_size=model_size,
            num_categoricals=num_categoricals,
            num_classes_per_categorical=num_classes_per_categorical,
        )

        # Trace self.call with a fixed input signature: h is [B, gru_units].
        dl_type = tf.keras.mixed_precision.global_policy().compute_dtype or tf.float32
        self.call = tf.function(
            input_signature=[
                tf.TensorSpec(shape=[None, get_gru_units(model_size)], dtype=dl_type),
            ]
        )(self.call)

    def call(self, h):
        """Performs a forward pass through the dynamics (or "prior") network.

        Args:
            h: The deterministic hidden state of the sequence model.

        Returns:
            Tuple consisting of a differentiable z-sample and the probabilities for
            the categorical distribution (in the shape of [B, num_categoricals,
            num_classes]) that created this sample.
        """
        # Send internal state through MLP.
        out = self.mlp(h)
        # Generate a z vector (stochastic, discrete sample).
        return self.representation_layer(out)
class MLP(tf.keras.Model):
    """An MLP primitive used by several DreamerV3 components and described in [1] Fig 5.

    MLP=multi-layer perceptron.

    See Appendix B in [1] for the MLP sizes depending on the given `model_size`.
    """

    def __init__(
        self,
        *,
        model_size: Optional[str] = "XS",
        num_dense_layers: Optional[int] = None,
        dense_hidden_units: Optional[int] = None,
        output_layer_size=None,
        trainable: bool = True,
        name: Optional[str] = None
    ):
        """Initializes an MLP instance.

        Args:
            model_size: The "Model Size" used according to [1] Appendix B.
                Use None for manually setting the different network sizes.
            num_dense_layers: The number of hidden layers in the MLP. If None,
                will use `model_size` and appendix B to figure out this value.
            dense_hidden_units: The number of nodes in each hidden layer. If None,
                will use `model_size` and appendix B to figure out this value.
            output_layer_size: The size of an optional linear (no activation) output
                layer. If None, no output layer will be added on top of the MLP dense
                stack.
            trainable: Whether the MLP is trainable (updated by an optimizer) or not.
            name: An optional name for the MLP keras model.
        """
        super().__init__(name=name or "mlp")

        n_layers = get_num_dense_layers(model_size, override=num_dense_layers)
        n_units = get_dense_hidden_units(model_size, override=dense_hidden_units)

        # Hidden dense layers carry neither bias nor activation of their own:
        # each is always followed by a LayerNormalization, after which the
        # activation (SiLU) is applied.
        self.dense_layers = [
            tf.keras.layers.Dense(
                n_units,
                trainable=trainable,
                activation=None,
                use_bias=False,
            )
            for _ in range(n_layers)
        ]
        # One LayerNormalization per hidden dense layer.
        self.layer_normalizations = [
            tf.keras.layers.LayerNormalization(trainable=trainable)
            for _ in range(n_layers)
        ]

        # Optional linear (no activation) output layer on top of the stack.
        self.output_layer = (
            tf.keras.layers.Dense(
                output_layer_size, activation=None, trainable=trainable
            )
            if output_layer_size
            else None
        )

    def call(self, input_):
        """Performs a forward pass through this MLP.

        Args:
            input_: The input tensor for the MLP dense stack.
        """
        out = input_
        # In this order per hidden layer: dense, normalization, activation.
        for dense, norm in zip(self.dense_layers, self.layer_normalizations):
            out = tf.nn.silu(norm(dense(out)))
        return out if self.output_layer is None else self.output_layer(out)
"""
[1] Mastering Diverse Domains through World Models - 2023
D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap
https://arxiv.org/pdf/2301.04104v1.pdf

[2] Mastering Atari with Discrete World Models - 2021
D. Hafner, T. Lillicrap, M. Norouzi, J. Ba
https://arxiv.org/pdf/2010.02193.pdf
"""
from typing import Optional

from ray.rllib.algorithms.dreamerv3.utils import (
    get_num_z_categoricals,
    get_num_z_classes,
)
from ray.rllib.utils.framework import try_import_tf, try_import_tfp

_, tf, _ = try_import_tf()
tfp = try_import_tfp()


class RepresentationLayer(tf.keras.layers.Layer):
    """A representation (z-state) generating layer.

    The value for z is the result of sampling from a categorical distribution with
    shape B x `num_classes`. So a computed z-state consists of `num_categoricals`
    one-hot vectors, each of size `num_classes_per_categorical`.
    """

    def __init__(
        self,
        *,
        model_size: Optional[str] = "XS",
        num_categoricals: Optional[int] = None,
        num_classes_per_categorical: Optional[int] = None,
    ):
        """Initializes a RepresentationLayer instance.

        Args:
            model_size: The "Model Size" used according to [1] Appendix B.
                Use None for manually setting the different parameters.
            num_categoricals: Overrides the number of categoricals used in the
                z-states. In [1], 32 is used for any model size.
            num_classes_per_categorical: Overrides the number of classes within each
                categorical used for the z-states. In [1], 32 is used for any model
                dimension.
        """
        # Resolve sizes before super().__init__ so they can be baked into the
        # layer name below.
        self.num_categoricals = get_num_z_categoricals(
            model_size, override=num_categoricals
        )
        self.num_classes_per_categorical = get_num_z_classes(
            model_size, override=num_classes_per_categorical
        )

        super().__init__(
            name=f"z{self.num_categoricals}x{self.num_classes_per_categorical}"
        )

        # Linear layer producing the flat logits for all categoricals at once.
        self.z_generating_layer = tf.keras.layers.Dense(
            self.num_categoricals * self.num_classes_per_categorical,
            activation=None,
        )

    def call(self, inputs):
        """Produces a discrete, differentiable z-sample from some 1D input tensor.

        Pushes the input_ tensor through our dense layer, which outputs
        32(B=num categoricals)*32(c=num classes) logits. Logits are used to:

        1) sample stochastically
        2) compute probs (via softmax)
        3) make sure the sampling step is differentiable (see [2] Algorithm 1):
            sample=one_hot(draw(logits))
            probs=softmax(logits)
            sample=sample + probs - stop_grad(probs)
            -> Now sample has the gradients of the probs.

        Args:
            inputs: The input to our z-generating layer. This might be a) the combined
                (concatenated) outputs of the (image?) encoder + the last hidden
                deterministic state, or b) the output of the dynamics predictor MLP
                network.

        Returns:
            Tuple consisting of a differentiable z-sample and the probabilities for the
            categorical distribution (in the shape of [B, num_categoricals,
            num_classes]) that created this sample.
        """
        # Compute the logits (no activation) for our `num_categoricals` Categorical
        # distributions (with `num_classes_per_categorical` classes each).
        logits = self.z_generating_layer(inputs)
        # Reshape the logits to [B, num_categoricals, num_classes]
        logits = tf.reshape(
            logits,
            shape=(-1, self.num_categoricals, self.num_classes_per_categorical),
        )
        # Compute the probs (based on logits) via softmax (in float32 for
        # numerical stability under mixed precision).
        probs = tf.nn.softmax(tf.cast(logits, tf.float32))
        # Add the unimix weighting (1% uniform) to the probs.
        # See [1]: "Unimix categoricals: We parameterize the categorical distributions
        # for the world model representations and dynamics, as well as for the actor
        # network, as mixtures of 1% uniform and 99% neural network output to ensure
        # a minimal amount of probability mass on every class and thus keep log
        # probabilities and KL divergences well behaved."
        probs = 0.99 * probs + 0.01 * (1.0 / self.num_classes_per_categorical)

        # Danijar's code does: distr = [Distr class](logits=tf.log(probs)).
        # Not sure why we don't directly use the already available probs instead.
        logits = tf.math.log(probs)

        # Create the distribution object using the unimix'd logits.
        # Independent(..., reinterpreted_batch_ndims=1) folds the categoricals
        # dim into the event shape, leaving batch shape [B].
        distribution = tfp.distributions.Independent(
            tfp.distributions.OneHotCategorical(logits=logits),
            reinterpreted_batch_ndims=1,
        )

        # Draw a one-hot sample (B, num_categoricals, num_classes).
        sample = tf.cast(distribution.sample(), tf.float32)
        # Make sure we can take gradients "straight-through" the sampling step
        # by adding the probs and subtracting the sg(probs). Note that `sample`
        # does not have any gradients as it's the result of a Categorical sample step,
        # which is non-differentiable (other than say a Gaussian sample step).
        # [1] "The representations are sampled from a vector of softmax distributions
        # and we take straight-through gradients through the sampling step."
        # [2] Algorithm 1.
        differentiable_sample = tf.cast(
            (tf.stop_gradient(sample) + probs - tf.stop_gradient(probs)),
            tf.keras.mixed_precision.global_policy().compute_dtype or tf.float32,
        )
        return differentiable_sample, probs
Lillicrap +https://arxiv.org/pdf/2301.04104v1.pdf +""" +from ray.rllib.algorithms.dreamerv3.tf.models.components.mlp import MLP +from ray.rllib.algorithms.dreamerv3.tf.models.components.reward_predictor_layer import ( + RewardPredictorLayer, +) +from ray.rllib.algorithms.dreamerv3.utils import ( + get_gru_units, + get_num_z_categoricals, + get_num_z_classes, +) +from ray.rllib.utils.framework import try_import_tf + +_, tf, _ = try_import_tf() + + +class RewardPredictor(tf.keras.Model): + """Wrapper of MLP and RewardPredictorLayer to predict rewards for the world model. + + Predicted rewards are used to produce "dream data" to learn the policy in. + """ + + def __init__( + self, + *, + model_size: str = "XS", + num_buckets: int = 255, + lower_bound: float = -20.0, + upper_bound: float = 20.0, + ): + """Initializes a RewardPredictor instance. + + Args: + model_size: The "Model Size" used according to [1] Appendinx B. + Determines the exact size of the underlying MLP. + num_buckets: The number of buckets to create. Note that the number of + possible symlog'd outcomes from the used distribution is + `num_buckets` + 1: + lower_bound --bucket-- o[1] --bucket-- o[2] ... --bucket-- upper_bound + o=outcomes + lower_bound=o[0] + upper_bound=o[num_buckets] + lower_bound: The symlog'd lower bound for a possible reward value. + Note that a value of -20.0 here already allows individual (actual env) + rewards to be as low as -400M. Buckets will be created between + `lower_bound` and `upper_bound`. + upper_bound: The symlog'd upper bound for a possible reward value. + Note that a value of +20.0 here already allows individual (actual env) + rewards to be as high as 400M. Buckets will be created between + `lower_bound` and `upper_bound`. 
+ """ + super().__init__(name="reward_predictor") + self.model_size = model_size + + self.mlp = MLP( + model_size=model_size, + output_layer_size=None, + ) + self.reward_layer = RewardPredictorLayer( + num_buckets=num_buckets, + lower_bound=lower_bound, + upper_bound=upper_bound, + ) + + # Trace self.call. + dl_type = tf.keras.mixed_precision.global_policy().compute_dtype or tf.float32 + self.call = tf.function( + input_signature=[ + tf.TensorSpec(shape=[None, get_gru_units(model_size)], dtype=dl_type), + tf.TensorSpec( + shape=[ + None, + get_num_z_categoricals(model_size), + get_num_z_classes(model_size), + ], + dtype=dl_type, + ), + ] + )(self.call) + + def call(self, h, z): + """Computes the expected reward using N equal sized buckets of possible values. + + Args: + h: The deterministic hidden state of the sequence model. [B, dim(h)]. + z: The stochastic discrete representations of the original + observation input. [B, num_categoricals, num_classes]. + """ + # Flatten last two dims of z. + assert len(z.shape) == 3 + z_shape = tf.shape(z) + z = tf.reshape(z, shape=(z_shape[0], -1)) + assert len(z.shape) == 2 + out = tf.concat([h, z], axis=-1) + out.set_shape( + [ + None, + ( + get_num_z_categoricals(self.model_size) + * get_num_z_classes(self.model_size) + + get_gru_units(self.model_size) + ), + ] + ) + # Send h-cat-z through MLP. + out = self.mlp(out) + # Return a) mean reward OR b) a tuple: (mean reward, logits over the reward + # buckets). 
+ return self.reward_layer(out) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/reward_predictor_layer.py b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/reward_predictor_layer.py new file mode 100644 index 0000000000000000000000000000000000000000..d68f62cb6780f2ef044bb8f091c727b49b16e390 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/reward_predictor_layer.py @@ -0,0 +1,110 @@ +""" +[1] Mastering Diverse Domains through World Models - 2023 +D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap +https://arxiv.org/pdf/2301.04104v1.pdf + +[2] Mastering Atari with Discrete World Models - 2021 +D. Hafner, T. Lillicrap, M. Norouzi, J. Ba +https://arxiv.org/pdf/2010.02193.pdf +""" +from ray.rllib.utils.framework import try_import_tf + +_, tf, _ = try_import_tf() + + +class RewardPredictorLayer(tf.keras.layers.Layer): + """A layer outputting reward predictions using K bins and two-hot encoding. + + This layer is used in two models in DreamerV3: The reward predictor of the world + model and the value function. K is 255 by default (see [1]) and doesn't change + with the model size. + + Possible predicted reward/values range from symexp(-20.0) to symexp(20.0), which + should cover any possible environment. Outputs of this layer are generated by + generating logits/probs via a single linear layer, then interpreting the probs + as weights for a weighted average of the different possible reward (binned) values. + """ + + def __init__( + self, + *, + num_buckets: int = 255, + lower_bound: float = -20.0, + upper_bound: float = 20.0, + trainable: bool = True, + ): + """Initializes a RewardPredictorLayer instance. + + Args: + num_buckets: The number of buckets to create. Note that the number of + possible symlog'd outcomes from the used distribution is + `num_buckets` + 1: + lower_bound --bucket-- o[1] --bucket-- o[2] ... 
--bucket-- upper_bound + o=outcomes + lower_bound=o[0] + upper_bound=o[num_buckets] + lower_bound: The symlog'd lower bound for a possible reward value. + Note that a value of -20.0 here already allows individual (actual env) + rewards to be as low as -400M. Buckets will be created between + `lower_bound` and `upper_bound`. + upper_bound: The symlog'd upper bound for a possible reward value. + Note that a value of +20.0 here already allows individual (actual env) + rewards to be as high as 400M. Buckets will be created between + `lower_bound` and `upper_bound`. + """ + self.num_buckets = num_buckets + super().__init__(name=f"reward_layer_{self.num_buckets}buckets") + + self.lower_bound = lower_bound + self.upper_bound = upper_bound + self.reward_buckets_layer = tf.keras.layers.Dense( + units=self.num_buckets, + activation=None, + # From [1]: + # "We further noticed that the randomly initialized reward predictor and + # critic networks at the start of training can result in large predicted + # rewards that can delay the onset of learning. We initialize the output + # weights of the reward predictor and critic to zeros, which effectively + # alleviates the problem and accelerates early learning." + kernel_initializer="zeros", + bias_initializer="zeros", # zero-bias is default anyways + trainable=trainable, + ) + + def call(self, inputs): + """Computes the expected reward using N equal sized buckets of possible values. + + Args: + inputs: The input tensor for the layer, which computes the reward bucket + weights (logits). [B, dim]. + + Returns: + A tuple consisting of the expected rewards and the logits that parameterize + the tfp `FiniteDiscrete` distribution object. To get the individual bucket + probs, do `[FiniteDiscrete object].probs`. + """ + # Compute the `num_buckets` weights. + assert len(inputs.shape) == 2 + logits = tf.cast(self.reward_buckets_layer(inputs), tf.float32) + # out=[B, `num_buckets`] + + # Compute the expected(!) 
reward using the formula: + # `softmax(Linear(x))` [vectordot] `possible_outcomes`, where + # `possible_outcomes` is the even-spaced (binned) encoding of all possible + # symexp'd reward/values. + # [2]: "The mean of the reward predictor pφ(ˆrt | zˆt) is used as reward + # sequence rˆ1:H." + probs = tf.nn.softmax(logits) + possible_outcomes = tf.linspace( + self.lower_bound, + self.upper_bound, + self.num_buckets, + ) + # probs=possible_outcomes=[B, `num_buckets`] + + # Simple vector dot product (over last dim) to get the mean reward + # weighted sum, where all weights sum to 1.0. + expected_rewards = tf.reduce_sum(probs * possible_outcomes, axis=-1) + # expected_rewards=[B] + + return expected_rewards, logits diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/sequence_model.py b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/sequence_model.py new file mode 100644 index 0000000000000000000000000000000000000000..fa9666029ce30de815b817e2c956a0bae97b816d --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/sequence_model.py @@ -0,0 +1,144 @@ +""" +[1] Mastering Diverse Domains through World Models - 2023 +D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap +https://arxiv.org/pdf/2301.04104v1.pdf +""" +from typing import Optional + +import gymnasium as gym +import numpy as np + +from ray.rllib.algorithms.dreamerv3.tf.models.components.mlp import MLP +from ray.rllib.algorithms.dreamerv3.utils import ( + get_gru_units, + get_num_z_classes, + get_num_z_categoricals, +) +from ray.rllib.utils.framework import try_import_tf + +_, tf, _ = try_import_tf() + + +class SequenceModel(tf.keras.Model): + """The "sequence model" of the RSSM, computing ht+1 given (ht, zt, at). 
+ + Note: The "internal state" always consists of: + The actions `a` (initially, this is a zeroed-out action), `h`-states (deterministic, + continuous), and `z`-states (stochastic, discrete). + There are two versions of z-states: "posterior" for world model training and "prior" + for creating the dream data. + + Initial internal state values (`a`, `h`, and `z`) are used where ever a new episode + starts within a batch row OR at the beginning of each train batch's B rows, + regardless of whether there was an actual episode boundary or not. Thus, internal + states are not required to be stored in or retrieved from the replay buffer AND + retrieved batches from the buffer must not be zero padded. + + Initial `a` is the zero "one hot" action, e.g. [0.0, 0.0] for Discrete(2), initial + `h` is a separate learned variable, and initial `z` are computed by the "dynamics" + (or "prior") net, using only the initial-h state as input. + + The GRU in this SequenceModel always produces the next h-state, then. + """ + + def __init__( + self, + *, + model_size: Optional[str] = "XS", + action_space: gym.Space, + num_gru_units: Optional[int] = None, + ): + """Initializes a SequenceModel instance. + + Args: + model_size: The "Model Size" used according to [1] Appendinx B. + Use None for manually setting the number of GRU units used. + action_space: The action space of the environment used. + num_gru_units: Overrides the number of GRU units (dimension of the h-state). + If None, use the value given through `model_size` + (see [1] Appendix B). + """ + super().__init__(name="sequence_model") + + self.model_size = model_size + self.action_space = action_space + num_gru_units = get_gru_units(self.model_size, override=num_gru_units) + + # In Danijar's code, there is an additional layer (units=[model_size]) + # prior to the GRU (but always only with 1 layer), which is not mentioned in + # the paper. 
+ self.pre_gru_layer = MLP( + num_dense_layers=1, + model_size=self.model_size, + output_layer_size=None, + ) + self.gru_unit = tf.keras.layers.GRU( + num_gru_units, + return_sequences=False, + return_state=False, + # Note: Changing these activations is most likely a bad idea! + # In experiments, setting one of both of them to silu deteriorated + # performance significantly. + # activation=tf.nn.silu, + # recurrent_activation=tf.nn.silu, + ) + + # Trace self.call. + dl_type = tf.keras.mixed_precision.global_policy().compute_dtype or tf.float32 + self.call = tf.function( + input_signature=[ + tf.TensorSpec( + shape=[None] + + ( + [action_space.n] + if isinstance(action_space, gym.spaces.Discrete) + else list(action_space.shape) + ), + dtype=dl_type, + ), + tf.TensorSpec(shape=[None, num_gru_units], dtype=dl_type), + tf.TensorSpec( + shape=[ + None, + get_num_z_categoricals(self.model_size), + get_num_z_classes(self.model_size), + ], + dtype=dl_type, + ), + ] + )(self.call) + + def call(self, a, h, z): + """ + + Args: + a: The previous action (already one-hot'd if applicable). (B, ...). + h: The previous deterministic hidden state of the sequence model. + (B, num_gru_units) + z: The previous stochastic discrete representations of the original + observation input. (B, num_categoricals, num_classes_per_categorical). + """ + # Flatten last two dims of z. + z_shape = tf.shape(z) + z = tf.reshape(z, shape=(z_shape[0], -1)) + out = tf.concat([z, a], axis=-1) + out.set_shape( + [ + None, + ( + get_num_z_categoricals(self.model_size) + * get_num_z_classes(self.model_size) + + ( + self.action_space.n + if isinstance(self.action_space, gym.spaces.Discrete) + else int(np.prod(self.action_space.shape)) + ) + ), + ] + ) + # Pass through pre-GRU layer. + out = self.pre_gru_layer(out) + # Pass through (batch-major) GRU (expand axis=1 as the time axis). + h_next = self.gru_unit(tf.expand_dims(out, axis=1), initial_state=h) + # Return the GRU's output (the next h-state). 
+ return h_next diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/utils/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..fe7b58cf515ee0c01b83dac8b515a07489c11c91 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/utils/__init__.py @@ -0,0 +1,168 @@ +""" +Utility functions for the DreamerV3 ([1]) algorithm. + +[1] Mastering Diverse Domains through World Models - 2023 +D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap +https://arxiv.org/pdf/2301.04104v1.pdf +""" + +_ALLOWED_MODEL_DIMS = [ + # RLlib debug sizes (not mentioned in [1]). + "nano", + "micro", + "mini", + "XXS", + # Regular sizes (listed in table B in [1]). + "XS", + "S", + "M", + "L", + "XL", +] + + +def get_cnn_multiplier(model_size, override=None): + if override is not None: + return override + + assert model_size in _ALLOWED_MODEL_DIMS + cnn_multipliers = { + "nano": 2, + "micro": 4, + "mini": 8, + "XXS": 16, + "XS": 24, + "S": 32, + "M": 48, + "L": 64, + "XL": 96, + } + return cnn_multipliers[model_size] + + +def get_dense_hidden_units(model_size, override=None): + if override is not None: + return override + + assert model_size in _ALLOWED_MODEL_DIMS + dense_units = { + "nano": 16, + "micro": 32, + "mini": 64, + "XXS": 128, + "XS": 256, + "S": 512, + "M": 640, + "L": 768, + "XL": 1024, + } + return dense_units[model_size] + + +def get_gru_units(model_size, override=None): + if override is not None: + return override + + assert model_size in _ALLOWED_MODEL_DIMS + gru_units = { + "nano": 16, + "micro": 32, + "mini": 64, + "XXS": 128, + "XS": 256, + "S": 512, + "M": 1024, + "L": 2048, + "XL": 4096, + } + return gru_units[model_size] + + +def get_num_z_categoricals(model_size, override=None): + if override is not None: + return override + + assert model_size in _ALLOWED_MODEL_DIMS + gru_units = { + "nano": 4, + "micro": 8, + 
"mini": 16, + "XXS": 32, + "XS": 32, + "S": 32, + "M": 32, + "L": 32, + "XL": 32, + } + return gru_units[model_size] + + +def get_num_z_classes(model_size, override=None): + if override is not None: + return override + + assert model_size in _ALLOWED_MODEL_DIMS + gru_units = { + "nano": 4, + "micro": 8, + "mini": 16, + "XXS": 32, + "XS": 32, + "S": 32, + "M": 32, + "L": 32, + "XL": 32, + } + return gru_units[model_size] + + +def get_num_curiosity_nets(model_size, override=None): + if override is not None: + return override + + assert model_size in _ALLOWED_MODEL_DIMS + num_curiosity_nets = { + "nano": 8, + "micro": 8, + "mini": 8, + "XXS": 8, + "XS": 8, + "S": 8, + "M": 8, + "L": 8, + "XL": 8, + } + return num_curiosity_nets[model_size] + + +def get_num_dense_layers(model_size, override=None): + if override is not None: + return override + + assert model_size in _ALLOWED_MODEL_DIMS + num_dense_layers = { + "nano": 1, + "micro": 1, + "mini": 1, + "XXS": 1, + "XS": 1, + "S": 2, + "M": 3, + "L": 4, + "XL": 5, + } + return num_dense_layers[model_size] + + +def do_symlog_obs(observation_space, symlog_obs_user_setting): + # If our symlog_obs setting is NOT set specifically (it's set to "auto"), return + # True if we don't have an image observation space, otherwise return False. + + # TODO (sven): Support mixed observation spaces. 
+ + is_image_space = len(observation_space.shape) in [2, 3] + return ( + not is_image_space + if symlog_obs_user_setting == "auto" + else symlog_obs_user_setting + ) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/utils/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/utils/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9393df4c3a8af2417004658b1d27dbc7b1f4a5fb Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/utils/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/utils/__pycache__/debugging.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/utils/__pycache__/debugging.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1a2f73d569ddc398c568080cdd0e4c6570181059 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/utils/__pycache__/debugging.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/utils/__pycache__/env_runner.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/utils/__pycache__/env_runner.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..179b333f6f91f763fc37fa02f71d96fddfe6db26 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/utils/__pycache__/env_runner.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/utils/__pycache__/summaries.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/utils/__pycache__/summaries.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e2309af589b96456fb235432d0bc68e60cd592b5 Binary files 
/dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/utils/__pycache__/summaries.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/utils/debugging.py b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/utils/debugging.py new file mode 100644 index 0000000000000000000000000000000000000000..7ddbd8341ddb883a8be02ec7db90733141800a89 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/utils/debugging.py @@ -0,0 +1,190 @@ +import gymnasium as gym +import numpy as np +from PIL import Image, ImageDraw + +from gymnasium.envs.classic_control.cartpole import CartPoleEnv + +from ray.rllib.utils.framework import try_import_tf + +_, tf, _ = try_import_tf() + + +class CartPoleDebug(CartPoleEnv): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + low = np.concatenate([np.array([0.0]), self.observation_space.low]) + high = np.concatenate([np.array([1000.0]), self.observation_space.high]) + + self.observation_space = gym.spaces.Box(low, high, shape=(5,), dtype=np.float32) + + self.timesteps_ = 0 + self._next_action = 0 + self._seed = 1 + + def reset(self, *, seed=None, options=None): + ret = super().reset(seed=self._seed) + self._seed += 1 + self.timesteps_ = 0 + self._next_action = 0 + obs = np.concatenate([np.array([self.timesteps_]), ret[0]]) + return obs, ret[1] + + def step(self, action): + ret = super().step(self._next_action) + + self.timesteps_ += 1 + self._next_action = 0 if self._next_action else 1 + + obs = np.concatenate([np.array([self.timesteps_]), ret[0]]) + reward = 0.1 * self.timesteps_ + return (obs, reward) + ret[2:] + + +gym.register("CartPoleDebug-v0", CartPoleDebug) +cartpole_env = gym.make("CartPoleDebug-v0", render_mode="rgb_array") +cartpole_env.reset() + +frozenlake_env = gym.make( + "FrozenLake-v1", render_mode="rgb_array", is_slippery=False, map_name="4x4" +) # desc=["SF", "HG"]) 
+frozenlake_env.reset() + + +def create_cartpole_dream_image( + dreamed_obs, # real space (not symlog'd) + dreamed_V, # real space (not symlog'd) + dreamed_a, + dreamed_r_tp1, # real space (not symlog'd) + dreamed_ri_tp1, # intrinsic reward + dreamed_c_tp1, # continue flag + value_target, # real space (not symlog'd) + initial_h, + as_tensor=False, +): + # CartPoleDebug + if dreamed_obs.shape == (5,): + # Set the state of our env to the given observation. + cartpole_env.unwrapped.state = np.array(dreamed_obs[1:], dtype=np.float32) + # Normal CartPole-v1 + else: + cartpole_env.unwrapped.state = np.array(dreamed_obs, dtype=np.float32) + + # Produce an RGB-image of the current state. + rgb_array = cartpole_env.render() + + # Add value-, action-, reward-, and continue-prediction information. + image = Image.fromarray(rgb_array) + draw_obj = ImageDraw.Draw(image) + + # fnt = ImageFont.load_default(size=40) + + draw_obj.text( + (5, 6), f"Vt={dreamed_V:.2f} (Rt={value_target:.2f})", fill=(0, 0, 0) + ) # , font=fnt.font, size=30) + draw_obj.text( + (5, 18), + f"at={'<--' if dreamed_a == 0 else '-->'} ({dreamed_a})", + fill=(0, 0, 0), + ) + draw_obj.text((5, 30), f"rt+1={dreamed_r_tp1:.2f}", fill=(0, 0, 0)) + if dreamed_ri_tp1 is not None: + draw_obj.text((5, 42), f"rit+1={dreamed_ri_tp1:.6f}", fill=(0, 0, 0)) + draw_obj.text((5, 54), f"ct+1={dreamed_c_tp1}", fill=(0, 0, 0)) + draw_obj.text((5, 66), f"|h|t={np.mean(np.abs(initial_h)):.5f}", fill=(0, 0, 0)) + + if dreamed_obs.shape == (5,): + draw_obj.text((20, 100), f"t={dreamed_obs[0]}", fill=(0, 0, 0)) + + # Return image. 
+ np_img = np.asarray(image) + if as_tensor: + return tf.convert_to_tensor(np_img, dtype=tf.uint8) + return np_img + + +def create_frozenlake_dream_image( + dreamed_obs, # real space (not symlog'd) + dreamed_V, # real space (not symlog'd) + dreamed_a, + dreamed_r_tp1, # real space (not symlog'd) + dreamed_ri_tp1, # intrinsic reward + dreamed_c_tp1, # continue flag + value_target, # real space (not symlog'd) + initial_h, + as_tensor=False, +): + frozenlake_env.unwrapped.s = np.argmax(dreamed_obs, axis=0) + + # Produce an RGB-image of the current state. + rgb_array = frozenlake_env.render() + + # Add value-, action-, reward-, and continue-prediction information. + image = Image.fromarray(rgb_array) + draw_obj = ImageDraw.Draw(image) + + draw_obj.text((5, 6), f"Vt={dreamed_V:.2f} (Rt={value_target:.2f})", fill=(0, 0, 0)) + action_arrow = ( + "<--" + if dreamed_a == 0 + else "v" + if dreamed_a == 1 + else "-->" + if dreamed_a == 2 + else "^" + ) + draw_obj.text((5, 18), f"at={action_arrow} ({dreamed_a})", fill=(0, 0, 0)) + draw_obj.text((5, 30), f"rt+1={dreamed_r_tp1:.2f}", fill=(0, 0, 0)) + if dreamed_ri_tp1 is not None: + draw_obj.text((5, 42), f"rit+1={dreamed_ri_tp1:.6f}", fill=(0, 0, 0)) + draw_obj.text((5, 54), f"ct+1={dreamed_c_tp1}", fill=(0, 0, 0)) + draw_obj.text((5, 66), f"|h|t={np.mean(np.abs(initial_h)):.5f}", fill=(0, 0, 0)) + + # Return image. + np_img = np.asarray(image) + if as_tensor: + return tf.convert_to_tensor(np_img, dtype=tf.uint8) + return np_img + + +if __name__ == "__main__": + # CartPole debug. + rgb_array = create_cartpole_dream_image( + dreamed_obs=np.array([100.0, 1.0, -0.01, 1.5, 0.02]), + dreamed_V=4.3, + dreamed_a=1, + dreamed_r_tp1=1.0, + dreamed_c_tp1=True, + initial_h=0.0, + value_target=8.0, + ) + # ImageFont.load("arial.pil") + image = Image.fromarray(rgb_array) + image.show() + + # Normal CartPole. 
+ rgb_array = create_cartpole_dream_image( + dreamed_obs=np.array([1.0, -0.01, 1.5, 0.02]), + dreamed_V=4.3, + dreamed_a=1, + dreamed_r_tp1=1.0, + dreamed_c_tp1=True, + initial_h=0.1, + value_target=8.0, + ) + # ImageFont.load("arial.pil") + image = Image.fromarray(rgb_array) + image.show() + + # Frozenlake + rgb_array = create_frozenlake_dream_image( + dreamed_obs=np.array([1.0] + [0.0] * (frozenlake_env.observation_space.n - 1)), + dreamed_V=4.3, + dreamed_a=1, + dreamed_r_tp1=1.0, + dreamed_c_tp1=True, + initial_h=0.1, + value_target=8.0, + ) + image = Image.fromarray(rgb_array) + image.show() diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/utils/env_runner.py b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/utils/env_runner.py new file mode 100644 index 0000000000000000000000000000000000000000..62932738fc1f8bc0ba033f57b8a79bd7cf267b77 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/utils/env_runner.py @@ -0,0 +1,694 @@ +""" +[1] Mastering Diverse Domains through World Models - 2023 +D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap +https://arxiv.org/pdf/2301.04104v1.pdf + +[2] Mastering Atari with Discrete World Models - 2021 +D. Hafner, T. Lillicrap, M. Norouzi, J. 
Ba +https://arxiv.org/pdf/2010.02193.pdf +""" +from collections import defaultdict +from functools import partial +from typing import Collection, List, Optional, Tuple, Union + +import gymnasium as gym +from gymnasium.wrappers.vector import DictInfoToList +import numpy as np +import tree # pip install dm_tree + +import ray +from ray.rllib.algorithms.algorithm_config import AlgorithmConfig +from ray.rllib.core import COMPONENT_RL_MODULE, DEFAULT_AGENT_ID, DEFAULT_MODULE_ID +from ray.rllib.core.columns import Columns +from ray.rllib.env import INPUT_ENV_SPACES +from ray.rllib.env.env_runner import EnvRunner +from ray.rllib.env.single_agent_episode import SingleAgentEpisode +from ray.rllib.env.wrappers.atari_wrappers import NoopResetEnv, MaxAndSkipEnv +from ray.rllib.env.wrappers.dm_control_wrapper import DMCEnv +from ray.rllib.env.utils import _gym_env_creator +from ray.rllib.utils.annotations import override +from ray.rllib.utils.deprecation import Deprecated +from ray.rllib.utils.framework import try_import_tf, try_import_torch +from ray.rllib.utils.metrics import ( + EPISODE_DURATION_SEC_MEAN, + EPISODE_LEN_MAX, + EPISODE_LEN_MEAN, + EPISODE_LEN_MIN, + EPISODE_RETURN_MAX, + EPISODE_RETURN_MEAN, + EPISODE_RETURN_MIN, + NUM_AGENT_STEPS_SAMPLED, + NUM_AGENT_STEPS_SAMPLED_LIFETIME, + NUM_EPISODES, + NUM_ENV_STEPS_SAMPLED, + NUM_ENV_STEPS_SAMPLED_LIFETIME, + NUM_MODULE_STEPS_SAMPLED, + NUM_MODULE_STEPS_SAMPLED_LIFETIME, +) +from ray.rllib.utils.metrics.metrics_logger import MetricsLogger +from ray.rllib.utils.numpy import convert_to_numpy, one_hot +from ray.rllib.utils.spaces.space_utils import batch, unbatch +from ray.rllib.utils.torch_utils import convert_to_torch_tensor +from ray.rllib.utils.typing import ResultDict, StateDict +from ray.tune.registry import ENV_CREATOR, _global_registry + +_, tf, _ = try_import_tf() +torch, _ = try_import_torch() + + +# TODO (sven): Use SingleAgentEnvRunner instead of this as soon as we have the new +# ConnectorV2 example classes to 
make Atari work properly with these (w/o requiring the +# classes at the bottom of this file here, e.g. `ActionClip`). +class DreamerV3EnvRunner(EnvRunner): + """An environment runner to collect data from vectorized gymnasium environments.""" + + def __init__( + self, + config: AlgorithmConfig, + **kwargs, + ): + """Initializes a DreamerV3EnvRunner instance. + + Args: + config: The config to use to setup this EnvRunner. + """ + super().__init__(config=config) + + # Create the gym.vector.Env object. + # Atari env. + if self.config.env.startswith("ale_py:ALE/"): + # TODO (sven): This import currently causes a Tune test to fail. Either way, + # we need to figure out how to properly setup the CI environment with + # the correct versions of all gymnasium-related packages. + from supersuit.generic_wrappers import resize_v1 + + # [2]: "We down-scale the 84 × 84 grayscale images to 64 × 64 pixels so that + # we can apply the convolutional architecture of DreamerV1." + # ... + # "We follow the evaluation protocol of Machado et al. (2018) with 200M + # environment steps, action repeat of 4, a time limit of 108,000 steps per + # episode that correspond to 30 minutes of game play, no access to life + # information, full action space, and sticky actions. Because the world + # model integrates information over time, DreamerV2 does not use frame + # stacking." + # However, in Danijar's repo, Atari100k experiments are configured as: + # noop=30, 64x64x3 (no grayscaling), sticky actions=False, + # full action space=False, + + def _entry_point(): + return gym.make( + self.config.env, + **dict( + self.config.env_config, + **{ + # "sticky actions" but not according to Danijar's 100k + # configs. + "repeat_action_probability": 0.0, + # "full action space" but not according to Danijar's 100k + # configs. + "full_action_space": False, + # Already done by MaxAndSkip wrapper: "action repeat" == 4. 
+ "frameskip": 1, + }, + ), + ) + + gym.register("rllib-single-agent-env-v0", entry_point=_entry_point) + + self.env = DictInfoToList( + gym.make_vec( + "rllib-single-agent-env-v0", + num_envs=self.config.num_envs_per_env_runner, + vectorization_mode=( + "async" if self.config.remote_worker_envs else "sync" + ), + wrappers=[ + partial(gym.wrappers.TimeLimit, max_episode_steps=108000), + partial(resize_v1, x_size=64, y_size=64), # resize to 64x64 + NormalizedImageEnv, + NoopResetEnv, + MaxAndSkipEnv, + ], + ) + ) + # DeepMind Control. + elif self.config.env.startswith("DMC/"): + parts = self.config.env.split("/") + assert len(parts) == 3, ( + "ERROR: DMC env must be formatted as 'DMC/[task]/[domain]', e.g. " + f"'DMC/cartpole/swingup'! You provided '{self.config.env}'." + ) + gym.register( + "dmc_env-v0", + lambda from_pixels=True: DMCEnv( + parts[1], parts[2], from_pixels=from_pixels, channels_first=False + ), + ) + self.env = DictInfoToList( + gym.make_vec( + "dmc_env-v0", + wrappers=[ActionClip], + num_envs=self.config.num_envs_per_env_runner, + vectorization_mode=( + "async" if self.config.remote_worker_envs else "sync" + ), + **dict(self.config.env_config), + ) + ) + # All other envs (gym or `tune.register_env()`'d by the user). + else: + # Register the env in this local context here. + gym.register( + "dreamerv3-custom-env-v0", + partial( + _global_registry.get(ENV_CREATOR, self.config.env), + self.config.env_config, + ) + if _global_registry.contains(ENV_CREATOR, self.config.env) + else partial( + _gym_env_creator, + env_context=self.config.env_config, + env_descriptor=self.config.env, + ), + ) + # Wrap into `DictInfoToList` wrapper to get infos as lists. 
+ self.env = DictInfoToList( + gym.make_vec( + "dreamerv3-custom-env-v0", + num_envs=self.config.num_envs_per_env_runner, + vectorization_mode=( + "async" if self.config.remote_worker_envs else "sync" + ), + ) + ) + self.num_envs = self.env.num_envs + assert self.num_envs == self.config.num_envs_per_env_runner + + # Create our RLModule to compute actions with. + policy_dict, _ = self.config.get_multi_agent_setup(env=self.env) + self.multi_rl_module_spec = self.config.get_multi_rl_module_spec( + policy_dict=policy_dict + ) + if self.config.share_module_between_env_runner_and_learner: + # DreamerV3 Algorithm will set this to the local Learner's module. + self.module = None + # Create our own instance of a DreamerV3RLModule (which then needs to be + # weight-synched each iteration). + else: + # TODO (sven): DreamerV3 is currently single-agent only. + self.module = self.multi_rl_module_spec.build()[DEFAULT_MODULE_ID] + + self._cached_to_module = None + + self.metrics = MetricsLogger() + + self._device = None + if ( + torch + and torch.cuda.is_available() + and self.config.framework_str == "torch" + and self.config.share_module_between_env_runner_and_learner + and self.config.num_gpus_per_learner > 0 + ): + gpu_ids = ray.get_gpu_ids() + self._device = f"cuda:{gpu_ids[0]}" + self.convert_to_tensor = ( + partial(convert_to_torch_tensor, device=self._device) + if self.config.framework_str == "torch" + else tf.convert_to_tensor + ) + + self._needs_initial_reset = True + self._episodes = [None for _ in range(self.num_envs)] + self._states = [None for _ in range(self.num_envs)] + + # TODO (sven): Move metrics temp storage and collection out of EnvRunner + # and RolloutWorkers. These classes should not continue tracking some data + # that they have already returned (in a call to `sample()`). Instead, the + # episode data should be analyzed where it was sent to (the Algorithm itself + # via its replay buffer, etc..). 
+ self._done_episodes_for_metrics = [] + self._ongoing_episodes_for_metrics = defaultdict(list) + + @override(EnvRunner) + def sample( + self, + *, + num_timesteps: int = None, + num_episodes: int = None, + explore: bool = True, + random_actions: bool = False, + ) -> Tuple[List[SingleAgentEpisode], List[SingleAgentEpisode]]: + """Runs and returns a sample (n timesteps or m episodes) on the environment(s). + + Timesteps or episodes are counted in total (across all vectorized + sub-environments). For example, if self.num_envs=2 and num_timesteps=10, each + sub-environment will be sampled for 5 steps. If self.num_envs=3 and + num_episodes=30, each sub-environment will be sampled for 10 episodes. + + Args: + num_timesteps: The number of timesteps to sample from the environment(s). + Note that only exactly one of `num_timesteps` or `num_episodes` must be + provided. + num_episodes: The number of full episodes to sample from the environment(s). + Note that only exactly one of `num_timesteps` or `num_episodes` must be + provided. + explore: Indicates whether to utilize exploration when picking actions. + random_actions: Whether to only use random actions. If True, the value of + `explore` is ignored. + force_reset: Whether to reset the environment(s) before starting to sample. + If False, will still reset the environment(s) if they were left in + a terminated or truncated state during previous sample calls. + + Returns: + A tuple consisting of a) list of Episode instances that are done and + b) list of Episode instances that are still ongoing. + """ + # If no execution details are provided, use self.config. + if num_timesteps is None and num_episodes is None: + if self.config.batch_mode == "truncate_episodes": + num_timesteps = self.config.rollout_fragment_length * self.num_envs + else: + num_episodes = self.num_envs + + # Sample n timesteps. 
+ if num_timesteps is not None: + return self._sample( + num_timesteps=num_timesteps, + explore=explore, + random_actions=random_actions, + force_reset=False, + ) + # Sample n episodes. + else: + # `_sample_episodes` returns only one list (with completed episodes) + # return empty list for incomplete ones. + return ( + self._sample( + num_episodes=num_episodes, + explore=explore, + random_actions=random_actions, + ), + [], + ) + + def _sample( + self, + *, + num_timesteps: Optional[int] = None, + num_episodes: Optional[int] = None, + explore: bool = True, + random_actions: bool = False, + force_reset: bool = False, + ) -> List[SingleAgentEpisode]: + """Helper method to sample n timesteps or m episodes.""" + + done_episodes_to_return: List[SingleAgentEpisode] = [] + + # Get initial states for all `batch_size_B` rows in the forward batch. + initial_states = tree.map_structure( + lambda s: np.repeat(s, self.num_envs, axis=0), + convert_to_numpy(self.module.get_initial_state()), + ) + + # Have to reset the env (on all vector sub-envs). + if force_reset or num_episodes is not None or self._needs_initial_reset: + episodes = self._episodes = [None for _ in range(self.num_envs)] + self._reset_envs(episodes, initial_states) + # We just reset the env. Don't have to force this again in the next + # call to `self._sample()`. + self._needs_initial_reset = False + + # Set initial obs and states in the episodes. + for i in range(self.num_envs): + self._states[i] = None + else: + episodes = self._episodes + + # Loop through `num_timesteps` timesteps or `num_episodes` episodes. + ts = 0 + eps = 0 + while ( + (ts < num_timesteps) if num_timesteps is not None else (eps < num_episodes) + ): + # Act randomly. + if random_actions: + actions = self.env.action_space.sample() + # Compute an action using the RLModule. + else: + # Env-to-module connector (already cached). 
+ to_module = self._cached_to_module + assert to_module is not None + self._cached_to_module = None + + # RLModule forward pass: Explore or not. + if explore: + to_env = self.module.forward_exploration(to_module) + else: + to_env = self.module.forward_inference(to_module) + + # Model outputs one-hot actions (if discrete). Convert to int actions + # as well. + actions = convert_to_numpy(to_env[Columns.ACTIONS]) + if isinstance(self.env.single_action_space, gym.spaces.Discrete): + actions = np.argmax(actions, axis=-1) + self._states = unbatch(convert_to_numpy(to_env[Columns.STATE_OUT])) + + observations, rewards, terminateds, truncateds, infos = self.env.step( + actions + ) + + call_on_episode_start = set() + for env_index in range(self.num_envs): + # Episode has no data in it yet -> Was just reset and needs to be called + # with its `add_env_reset()` method. + if not episodes[env_index].is_reset: + episodes[env_index].add_env_reset( + observation=observations[env_index], + infos=infos[env_index], + ) + call_on_episode_start.add(env_index) + self._states[env_index] = None + + # Call `add_env_step()` method on episode. + else: + # Only increase ts when we actually stepped (not reset'd as a reset + # does not count as a timestep). + ts += 1 + episodes[env_index].add_env_step( + observation=observations[env_index], + action=actions[env_index], + reward=rewards[env_index], + infos=infos[env_index], + terminated=terminateds[env_index], + truncated=truncateds[env_index], + ) + + # Cache results as we will do the RLModule forward pass only in the next + # `while`-iteration. 
+ if self.module is not None: + is_first = np.zeros((self.num_envs,)) + for env_index, episode in enumerate(episodes): + if self._states[env_index] is None: + is_first[env_index] = 1.0 + self._states[env_index] = { + k: s[env_index] for k, s in initial_states.items() + } + self._cached_to_module = { + Columns.STATE_IN: tree.map_structure( + lambda s: self.convert_to_tensor(s), batch(self._states) + ), + Columns.OBS: self.convert_to_tensor(observations), + "is_first": self.convert_to_tensor(is_first), + } + + for env_index in range(self.num_envs): + # Episode is not done. + if not episodes[env_index].is_done: + continue + + eps += 1 + + # Then numpy'ize the episode. + done_episodes_to_return.append(episodes[env_index].to_numpy()) + + # Also early-out if we reach the number of episodes within this + # for-loop. + if eps == num_episodes: + break + + # Create a new episode object with no data in it and execute + # `on_episode_created` callback (before the `env.reset()` call). + episodes[env_index] = SingleAgentEpisode( + observation_space=self.env.single_observation_space, + action_space=self.env.single_action_space, + ) + + # Return done episodes ... + # TODO (simon): Check, how much memory this attribute uses. + self._done_episodes_for_metrics.extend(done_episodes_to_return) + # ... and all ongoing episode chunks. + + # Also, make sure we start new episode chunks (continuing the ongoing episodes + # from the to-be-returned chunks). + ongoing_episodes_to_return = [] + # Only if we are doing individual timesteps: We have to maybe cut an ongoing + # episode and continue building it on the next call to `sample()`. + if num_timesteps is not None: + ongoing_episodes_continuations = [ + episode.cut(len_lookback_buffer=self.config.episode_lookback_horizon) + for episode in episodes + ] + + for episode in episodes: + # Just started Episodes do not have to be returned. There is no data + # in them anyway. 
+ if episode.t == 0: + continue + episode.validate() + self._ongoing_episodes_for_metrics[episode.id_].append(episode) + # Return numpy'ized Episodes. + ongoing_episodes_to_return.append(episode.to_numpy()) + + # Continue collecting into the cut Episode chunks. + self._episodes = ongoing_episodes_continuations + + self._increase_sampled_metrics(ts) + + # Return collected episode data. + return done_episodes_to_return + ongoing_episodes_to_return + + def get_spaces(self): + return { + INPUT_ENV_SPACES: (self.env.observation_space, self.env.action_space), + DEFAULT_MODULE_ID: ( + self.env.single_observation_space, + self.env.single_action_space, + ), + } + + def get_metrics(self) -> ResultDict: + # Compute per-episode metrics (only on already completed episodes). + for eps in self._done_episodes_for_metrics: + assert eps.is_done + + episode_length = len(eps) + episode_return = eps.get_return() + episode_duration_s = eps.get_duration_s() + + # Don't forget about the already returned chunks of this episode. + if eps.id_ in self._ongoing_episodes_for_metrics: + for eps2 in self._ongoing_episodes_for_metrics[eps.id_]: + episode_length += len(eps2) + episode_return += eps2.get_return() + del self._ongoing_episodes_for_metrics[eps.id_] + + self._log_episode_metrics( + episode_length, episode_return, episode_duration_s + ) + + # Log num episodes counter for this iteration. + self.metrics.log_value( + NUM_EPISODES, + len(self._done_episodes_for_metrics), + reduce="sum", + # Reset internal data on `reduce()` call below (not a lifetime count). + clear_on_reduce=True, + ) + + # Now that we have logged everything, clear cache of done episodes. + self._done_episodes_for_metrics.clear() + + # Return reduced metrics. 
+ return self.metrics.reduce() + + def get_state( + self, + components: Optional[Union[str, Collection[str]]] = None, + *, + not_components: Optional[Union[str, Collection[str]]] = None, + **kwargs, + ) -> StateDict: + """Returns the weights of our (single-agent) RLModule.""" + if self.module is None: + assert self.config.share_module_between_env_runner_and_learner + return {} + else: + return { + COMPONENT_RL_MODULE: { + DEFAULT_MODULE_ID: self.module.get_state(**kwargs), + }, + } + + def set_state(self, state: StateDict) -> None: + """Writes the weights of our (single-agent) RLModule.""" + if self.module is None: + assert self.config.share_module_between_env_runner_and_learner + else: + self.module.set_state(state[COMPONENT_RL_MODULE][DEFAULT_MODULE_ID]) + + @override(EnvRunner) + def assert_healthy(self): + # Make sure, we have built our gym.vector.Env and RLModule properly. + assert self.env and self.module + + @override(EnvRunner) + def stop(self): + # Close our env object via gymnasium's API. + self.env.close() + + def _reset_envs(self, episodes, initial_states): + # Create n new episodes and make the `on_episode_created` callbacks. + for env_index in range(self.num_envs): + self._new_episode(env_index, episodes) + + # Erase all cached ongoing episodes (these will never be completed and + # would thus never be returned/cleaned by `get_metrics` and cause a memory + # leak). + self._ongoing_episodes_for_metrics.clear() + + observations, infos = self.env.reset() + observations = unbatch(observations) + + # Set initial obs and infos in the episodes. + for env_index in range(self.num_envs): + episodes[env_index].add_env_reset( + observation=observations[env_index], + infos=infos[env_index], + ) + + # Run the env-to-module connector to make sure the reset-obs/infos have + # properly been processed (if applicable). 
+ self._cached_to_module = None + if self.module: + is_first = np.zeros((self.num_envs,)) + for i, eps in enumerate(self._episodes): + if self._states[i] is None: + is_first[i] = 1.0 + self._states[i] = {k: s[i] for k, s in initial_states.items()} + self._cached_to_module = { + Columns.STATE_IN: tree.map_structure( + lambda s: self.convert_to_tensor(s), batch(self._states) + ), + Columns.OBS: self.convert_to_tensor(observations), + "is_first": self.convert_to_tensor(is_first), + } + # self._cached_to_module = TODO!! + + def _new_episode(self, env_index, episodes=None): + episodes = episodes if episodes is not None else self._episodes + episodes[env_index] = SingleAgentEpisode( + observation_space=self.env.single_observation_space, + action_space=self.env.single_action_space, + ) + + def _increase_sampled_metrics(self, num_steps): + # Per sample cycle stats. + self.metrics.log_value( + NUM_ENV_STEPS_SAMPLED, num_steps, reduce="sum", clear_on_reduce=True + ) + self.metrics.log_value( + (NUM_AGENT_STEPS_SAMPLED, DEFAULT_AGENT_ID), + num_steps, + reduce="sum", + clear_on_reduce=True, + ) + self.metrics.log_value( + (NUM_MODULE_STEPS_SAMPLED, DEFAULT_MODULE_ID), + num_steps, + reduce="sum", + clear_on_reduce=True, + ) + # Lifetime stats. + self.metrics.log_value(NUM_ENV_STEPS_SAMPLED_LIFETIME, num_steps, reduce="sum") + self.metrics.log_value( + (NUM_AGENT_STEPS_SAMPLED_LIFETIME, DEFAULT_AGENT_ID), + num_steps, + reduce="sum", + ) + self.metrics.log_value( + (NUM_MODULE_STEPS_SAMPLED_LIFETIME, DEFAULT_MODULE_ID), + num_steps, + reduce="sum", + ) + return num_steps + + def _log_episode_metrics(self, length, ret, sec): + # Log general episode metrics. + # To mimick the old API stack behavior, we'll use `window` here for + # these particular stats (instead of the default EMA). 
+ win = self.config.metrics_num_episodes_for_smoothing + self.metrics.log_value(EPISODE_LEN_MEAN, length, window=win) + self.metrics.log_value(EPISODE_RETURN_MEAN, ret, window=win) + self.metrics.log_value(EPISODE_DURATION_SEC_MEAN, sec, window=win) + + # For some metrics, log min/max as well. + self.metrics.log_value(EPISODE_LEN_MIN, length, reduce="min") + self.metrics.log_value(EPISODE_RETURN_MIN, ret, reduce="min") + self.metrics.log_value(EPISODE_LEN_MAX, length, reduce="max") + self.metrics.log_value(EPISODE_RETURN_MAX, ret, reduce="max") + + @Deprecated( + new="DreamerV3EnvRunner.get_state(components='rl_module')", + error=True, + ) + def get_weights(self, *args, **kwargs): + pass + + @Deprecated( + new="DreamerV3EnvRunner.get_state()", + error=True, + ) + def set_weights(self, *args, **kwargs): + pass + + +class NormalizedImageEnv(gym.ObservationWrapper): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.observation_space = gym.spaces.Box( + -1.0, + 1.0, + shape=self.observation_space.shape, + dtype=np.float32, + ) + + # Divide by scale and center around 0.0, such that observations are in the range + # of -1.0 and 1.0. 
+ def observation(self, observation): + return (observation.astype(np.float32) / 128.0) - 1.0 + + +class OneHot(gym.ObservationWrapper): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.observation_space = gym.spaces.Box( + 0.0, 1.0, shape=(self.observation_space.n,), dtype=np.float32 + ) + + def reset(self, **kwargs): + ret = self.env.reset(**kwargs) + return self._get_obs(ret[0]), ret[1] + + def step(self, action): + ret = self.env.step(action) + return self._get_obs(ret[0]), ret[1], ret[2], ret[3], ret[4] + + def _get_obs(self, obs): + return one_hot(obs, depth=self.observation_space.shape[0]) + + +class ActionClip(gym.ActionWrapper): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._low = -1.0 + self._high = 1.0 + self.action_space = gym.spaces.Box( + self._low, + self._high, + self.action_space.shape, + self.action_space.dtype, + ) + + def action(self, action): + return np.clip(action, self._low, self._high) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/utils/summaries.py b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/utils/summaries.py new file mode 100644 index 0000000000000000000000000000000000000000..c8b0ea753d4d2c078a2bfd14d80590b5161e2469 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/utils/summaries.py @@ -0,0 +1,408 @@ +""" +[1] Mastering Diverse Domains through World Models - 2023 +D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap +https://arxiv.org/pdf/2301.04104v1.pdf + +[2] Mastering Atari with Discrete World Models - 2021 +D. Hafner, T. Lillicrap, M. Norouzi, J. 
Ba +https://arxiv.org/pdf/2010.02193.pdf +""" +import numpy as np + +from ray.rllib.algorithms.dreamerv3.utils.debugging import ( + create_cartpole_dream_image, + create_frozenlake_dream_image, +) +from ray.rllib.core import DEFAULT_MODULE_ID +from ray.rllib.core.columns import Columns +from ray.rllib.utils.framework import try_import_torch +from ray.rllib.utils.metrics import ( + LEARNER_RESULTS, + REPLAY_BUFFER_RESULTS, +) +from ray.rllib.utils.tf_utils import inverse_symlog + +torch, _ = try_import_torch() + + +def reconstruct_obs_from_h_and_z( + h_t0_to_H, + z_t0_to_H, + dreamer_model, + obs_dims_shape, + framework="torch", +): + """Returns""" + shape = h_t0_to_H.shape + T = shape[0] # inputs are time-major + B = shape[1] + # Compute actual observations using h and z and the decoder net. + # Note that the last h-state (T+1) is NOT used here as it's already part of + # a new trajectory. + # Use mean() of the Gaussian, no sample! -> No need to construct dist object here. + if framework == "torch": + device = next(iter(dreamer_model.world_model.decoder.parameters())).device + reconstructed_obs_distr_means_TxB = ( + dreamer_model.world_model.decoder( + # Fold time rank. + h=torch.from_numpy(h_t0_to_H).reshape((T * B, -1)).to(device), + z=torch.from_numpy(z_t0_to_H) + .reshape((T * B,) + z_t0_to_H.shape[2:]) + .to(device), + ) + .detach() + .cpu() + .numpy() + ) + else: + reconstructed_obs_distr_means_TxB = dreamer_model.world_model.decoder( + # Fold time rank. + h=h_t0_to_H.reshape((T * B, -1)), + z=z_t0_to_H.reshape((T * B,) + z_t0_to_H.shape[2:]), + ) + + # Unfold time rank again. + reconstructed_obs_T_B = np.reshape( + reconstructed_obs_distr_means_TxB, (T, B) + obs_dims_shape + ) + # Return inverse symlog'd (real env obs space) reconstructed observations. 
+ return reconstructed_obs_T_B + + +def report_dreamed_trajectory( + *, + results, + env, + dreamer_model, + obs_dims_shape, + batch_indices=(0,), + desc=None, + include_images=True, + framework="torch", +): + if not include_images: + return + + dream_data = results["dream_data"] + dreamed_obs_H_B = reconstruct_obs_from_h_and_z( + h_t0_to_H=dream_data["h_states_t0_to_H_BxT"], + z_t0_to_H=dream_data["z_states_prior_t0_to_H_BxT"], + dreamer_model=dreamer_model, + obs_dims_shape=obs_dims_shape, + framework=framework, + ) + func = ( + create_cartpole_dream_image + if env.startswith("CartPole") + else create_frozenlake_dream_image + ) + # Take 0th dreamed trajectory and produce series of images. + for b in batch_indices: + images = [] + for t in range(len(dreamed_obs_H_B) - 1): + images.append( + func( + dreamed_obs=dreamed_obs_H_B[t][b], + dreamed_V=dream_data["values_dreamed_t0_to_H_BxT"][t][b], + dreamed_a=(dream_data["actions_ints_dreamed_t0_to_H_BxT"][t][b]), + dreamed_r_tp1=(dream_data["rewards_dreamed_t0_to_H_BxT"][t + 1][b]), + # `DISAGREE_intrinsic_rewards_H_B` are shifted by 1 already + # (from t1 to H, not t0 to H like all other data here). + dreamed_ri_tp1=( + results["DISAGREE_intrinsic_rewards_H_BxT"][t][b] + if "DISAGREE_intrinsic_rewards_H_BxT" in results + else None + ), + dreamed_c_tp1=( + dream_data["continues_dreamed_t0_to_H_BxT"][t + 1][b] + ), + value_target=results["VALUE_TARGETS_H_BxT"][t][b], + initial_h=dream_data["h_states_t0_to_H_BxT"][t][b], + as_tensor=True, + ).numpy() + ) + # Concat images along width-axis (so they show as a "film sequence" next to each + # other). + results.update( + { + f"dreamed_trajectories{('_'+desc) if desc else ''}_B{b}": ( + np.concatenate(images, axis=1) + ), + } + ) + + +def report_predicted_vs_sampled_obs( + *, + metrics, + sample, + batch_size_B, + batch_length_T, + symlog_obs: bool = True, + do_report: bool = True, +): + """Summarizes sampled data (from the replay buffer) vs world-model predictions. 
+ + World model predictions are based on the posterior states (z computed from actual + observation encoder input + the current h-states). + + Observations: Computes MSE (sampled vs predicted/recreated) over all features. + For image observations, also creates direct image comparisons (sampled images + vs predicted (posterior) ones). + Rewards: Compute MSE (sampled vs predicted). + Continues: Compute MSE (sampled vs predicted). + + Args: + metrics: The MetricsLogger object of the DreamerV3 algo. + sample: The sampled data (dict) from the replay buffer. Already tf-tensor + converted. + batch_size_B: The batch size (B). This is the number of trajectories sampled + from the buffer. + batch_length_T: The batch length (T). This is the length of an individual + trajectory sampled from the buffer. + do_report: Whether to actually log the report (default). If this is set to + False, this function serves as a clean-up on the given metrics, making sure + they do NOT contain anymore any (spacious) data relevant for producing + the report/videos. + """ + fwd_output_key = ( + LEARNER_RESULTS, + DEFAULT_MODULE_ID, + "WORLD_MODEL_fwd_out_obs_distribution_means_b0xT", + ) + # logged as a non-reduced item (still a list) + predicted_observation_means_single_example = metrics.peek( + fwd_output_key, default=[None] + )[-1] + metrics.delete(fwd_output_key, key_error=False) + + final_result_key = ( + f"WORLD_MODEL_sampled_vs_predicted_posterior_b0x{batch_length_T}_videos" + ) + if not do_report: + metrics.delete(final_result_key, key_error=False) + return + + _report_obs( + metrics=metrics, + computed_float_obs_B_T_dims=np.reshape( + predicted_observation_means_single_example, + # WandB videos need to be channels first. 
+ (1, batch_length_T) + sample[Columns.OBS].shape[2:], + ), + sampled_obs_B_T_dims=sample[Columns.OBS][0:1], + metrics_key=final_result_key, + symlog_obs=symlog_obs, + ) + + +def report_dreamed_eval_trajectory_vs_samples( + *, + metrics, + sample, + burn_in_T, + dreamed_T, + dreamer_model, + symlog_obs: bool = True, + do_report: bool = True, + framework="torch", +) -> None: + """Logs dreamed observations, rewards, continues and compares them vs sampled data. + + For obs, we'll try to create videos (side-by-side comparison) of the dreamed, + recreated-from-prior obs vs the sampled ones (over dreamed_T timesteps). + + Args: + metrics: The MetricsLogger object of the DreamerV3 algo. + sample: The sampled data (dict) from the replay buffer. Already tf-tensor + converted. + burn_in_T: The number of burn-in timesteps (these will be skipped over in the + reported video comparisons and MSEs). + dreamed_T: The number of timesteps to produce dreamed data for. + dreamer_model: The DreamerModel to use to create observation vectors/images + from dreamed h- and (prior) z-states. + symlog_obs: Whether to inverse-symlog the computed observations or not. Set this + to True for environments, in which we should symlog the observations. + do_report: Whether to actually log the report (default). If this is set to + False, this function serves as a clean-up on the given metrics, making sure + they do NOT contain anymore any (spacious) data relevant for producing + the report/videos. 
+ """ + dream_data = metrics.peek( + (LEARNER_RESULTS, DEFAULT_MODULE_ID, "dream_data"), + default={}, + ) + metrics.delete(LEARNER_RESULTS, DEFAULT_MODULE_ID, "dream_data", key_error=False) + + final_result_key_obs = f"EVALUATION_sampled_vs_dreamed_prior_H{dreamed_T}_obs" + final_result_key_rew = ( + f"EVALUATION_sampled_vs_dreamed_prior_H{dreamed_T}_rewards_MSE" + ) + final_result_key_cont = ( + f"EVALUATION_sampled_vs_dreamed_prior_H{dreamed_T}_continues_MSE" + ) + if not do_report: + metrics.delete(final_result_key_obs, key_error=False) + metrics.delete(final_result_key_rew, key_error=False) + metrics.delete(final_result_key_cont, key_error=False) + return + + # Obs MSE. + dreamed_obs_H_B = reconstruct_obs_from_h_and_z( + h_t0_to_H=dream_data["h_states_t0_to_H_Bx1"][0], # [0] b/c reduce=None (list) + z_t0_to_H=dream_data["z_states_prior_t0_to_H_Bx1"][0], + dreamer_model=dreamer_model, + obs_dims_shape=sample[Columns.OBS].shape[2:], + framework=framework, + ) + t0 = burn_in_T + tH = t0 + dreamed_T + # Observation MSE and - if applicable - images comparisons. + _report_obs( + metrics=metrics, + # WandB videos need to be 5D (B, L, c, h, w) -> transpose/swap H and B axes. + computed_float_obs_B_T_dims=np.swapaxes(dreamed_obs_H_B, 0, 1)[ + 0:1 + ], # for now: only B=1 + sampled_obs_B_T_dims=sample[Columns.OBS][0:1, t0:tH], + metrics_key=final_result_key_obs, + symlog_obs=symlog_obs, + ) + + # Reward MSE. + _report_rewards( + metrics=metrics, + computed_rewards=dream_data["rewards_dreamed_t0_to_H_Bx1"][0], + sampled_rewards=sample[Columns.REWARDS][:, t0:tH], + metrics_key=final_result_key_rew, + ) + + # Continues MSE. 
+ _report_continues( + metrics=metrics, + computed_continues=dream_data["continues_dreamed_t0_to_H_Bx1"][0], + sampled_continues=(1.0 - sample["is_terminated"])[:, t0:tH], + metrics_key=final_result_key_cont, + ) + + +def report_sampling_and_replay_buffer(*, metrics, replay_buffer): + episodes_in_buffer = replay_buffer.get_num_episodes() + ts_in_buffer = replay_buffer.get_num_timesteps() + replayed_steps = replay_buffer.get_sampled_timesteps() + added_steps = replay_buffer.get_added_timesteps() + + # Summarize buffer, sampling, and train ratio stats. + metrics.log_dict( + { + "capacity": replay_buffer.capacity, + "size_num_episodes": episodes_in_buffer, + "size_timesteps": ts_in_buffer, + "replayed_steps": replayed_steps, + "added_steps": added_steps, + }, + key=REPLAY_BUFFER_RESULTS, + window=1, + ) # window=1 b/c these are current (total count/state) values. + + +def _report_obs( + *, + metrics, + computed_float_obs_B_T_dims, + sampled_obs_B_T_dims, + metrics_key, + symlog_obs, +): + """Summarizes computed- vs sampled observations: MSE and (if applicable) images. + + Args: + metrics: The MetricsLogger object of the DreamerV3 algo. + computed_float_obs_B_T_dims: Computed float observations + (not clipped, not cast'd). Shape=(B, T, [dims ...]). + sampled_obs_B_T_dims: Sampled observations (as-is from the environment, meaning + this could be uint8, 0-255 clipped images). Shape=(B, T, [dims ...]). + metrics_key: The metrics key (or key sequence) under which to log ths resulting + video sequence. + symlog_obs: Whether to inverse-symlog the computed observations or not. Set this + to True for environments, in which we should symlog the observations. + + """ + # Videos: Create summary, comparing computed images with actual sampled ones. + # 4=[B, T, w, h] grayscale image; 5=[B, T, w, h, C] RGB image. + if len(sampled_obs_B_T_dims.shape) in [4, 5]: + # WandB videos need to be channels first. 
+ transpose_axes = ( + (0, 1, 4, 2, 3) if len(sampled_obs_B_T_dims.shape) == 5 else (0, 3, 1, 2) + ) + + if symlog_obs: + computed_float_obs_B_T_dims = inverse_symlog(computed_float_obs_B_T_dims) + + # Restore image pixels from normalized (non-symlog'd) data. + if not symlog_obs: + computed_float_obs_B_T_dims = (computed_float_obs_B_T_dims + 1.0) * 128 + sampled_obs_B_T_dims = (sampled_obs_B_T_dims + 1.0) * 128 + sampled_obs_B_T_dims = np.clip(sampled_obs_B_T_dims, 0.0, 255.0).astype( + np.uint8 + ) + sampled_obs_B_T_dims = np.transpose(sampled_obs_B_T_dims, transpose_axes) + computed_images = np.clip(computed_float_obs_B_T_dims, 0.0, 255.0).astype( + np.uint8 + ) + computed_images = np.transpose(computed_images, transpose_axes) + # Concat sampled and computed images along the height axis (3) such that + # real images show below respective predicted ones. + # (B, T, C, h, w) + sampled_vs_computed_images = np.concatenate( + [computed_images, sampled_obs_B_T_dims], + axis=-1, # concat on width axis (looks nicer) + ) + # Add grayscale dim, if necessary. + if len(sampled_obs_B_T_dims.shape) == 2 + 2: + sampled_vs_computed_images = np.expand_dims(sampled_vs_computed_images, -1) + + metrics.log_value( + metrics_key, + sampled_vs_computed_images, + reduce=None, # No reduction, we want the obs tensor to stay in-tact. + window=1, + ) + + +def _report_rewards( + *, + metrics, + computed_rewards, + sampled_rewards, + metrics_key, +): + mse_sampled_vs_computed_rewards = np.mean( + np.square(computed_rewards - sampled_rewards) + ) + mse_sampled_vs_computed_rewards = np.mean(mse_sampled_vs_computed_rewards) + metrics.log_value( + metrics_key, + mse_sampled_vs_computed_rewards, + window=1, + ) + + +def _report_continues( + *, + metrics, + computed_continues, + sampled_continues, + metrics_key, +): + # Continue MSE. 
+ mse_sampled_vs_computed_continues = np.mean( + np.square( + computed_continues - sampled_continues.astype(computed_continues.dtype) + ) + ) + metrics.log_value( + metrics_key, + mse_sampled_vs_computed_continues, + window=1, + ) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..913c1b77198e0ee941198fdfefbd8bc29ef875cf --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/__init__.py @@ -0,0 +1,23 @@ +from ray.rllib.algorithms.impala.impala import ( + IMPALA, + IMPALAConfig, + Impala, + ImpalaConfig, +) +from ray.rllib.algorithms.impala.impala_tf_policy import ( + ImpalaTF1Policy, + ImpalaTF2Policy, +) +from ray.rllib.algorithms.impala.impala_torch_policy import ImpalaTorchPolicy + +__all__ = [ + "IMPALA", + "IMPALAConfig", + # @OldAPIStack + "ImpalaTF1Policy", + "ImpalaTF2Policy", + "ImpalaTorchPolicy", + # Deprecated names (lowercase) + "ImpalaConfig", + "Impala", +] diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/__pycache__/impala.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/__pycache__/impala.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4c1676ec6dd734f8c0375b14bb15504e9536b743 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/__pycache__/impala.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/__pycache__/impala_learner.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/__pycache__/impala_learner.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6a69353656329c85d7524dd5cd68b5e30537f94d Binary files /dev/null and 
b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/__pycache__/impala_learner.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/__pycache__/impala_tf_policy.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/__pycache__/impala_tf_policy.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e0a70eace69986b103bdfe025adc200952a75cf1 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/__pycache__/impala_tf_policy.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/__pycache__/impala_torch_policy.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/__pycache__/impala_torch_policy.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bb00d80a8aa8864f2f25fafbc1dfe38e3653f229 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/__pycache__/impala_torch_policy.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/__pycache__/vtrace_tf.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/__pycache__/vtrace_tf.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f42b72d297ff8c319eb74540c659864cf6f701e5 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/__pycache__/vtrace_tf.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/impala.py b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/impala.py new file mode 100644 index 0000000000000000000000000000000000000000..ad1436ac81ee75ca3776d671556aacdb5371c650 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/impala.py @@ -0,0 +1,1362 @@ +import copy +import functools +import logging +import 
queue +import time +from typing import Dict, List, Optional, Set, Tuple, Type, Union + +import numpy as np +import tree # pip install dm_tree + +import ray +from ray import ObjectRef +from ray.rllib import SampleBatch +from ray.rllib.algorithms.algorithm import Algorithm +from ray.rllib.algorithms.algorithm_config import AlgorithmConfig, NotProvided +from ray.rllib.connectors.learner import AddOneTsToEpisodesAndTruncate +from ray.rllib.core import ( + COMPONENT_ENV_TO_MODULE_CONNECTOR, + COMPONENT_MODULE_TO_ENV_CONNECTOR, +) +from ray.rllib.core.rl_module.rl_module import RLModuleSpec +from ray.rllib.execution.buffers.mixin_replay_buffer import MixInMultiAgentReplayBuffer +from ray.rllib.execution.learner_thread import LearnerThread +from ray.rllib.execution.multi_gpu_learner_thread import MultiGPULearnerThread +from ray.rllib.policy.policy import Policy +from ray.rllib.policy.sample_batch import concat_samples +from ray.rllib.utils.annotations import OldAPIStack, override +from ray.rllib.utils.deprecation import DEPRECATED_VALUE, deprecation_warning +from ray.rllib.utils.metrics import ( + ALL_MODULES, + ENV_RUNNER_RESULTS, + LEARNER_GROUP, + LEARNER_RESULTS, + LEARNER_UPDATE_TIMER, + MEAN_NUM_EPISODE_LISTS_RECEIVED, + MEAN_NUM_LEARNER_GROUP_RESULTS_RECEIVED, + MEAN_NUM_LEARNER_GROUP_UPDATE_CALLED, + NUM_AGENT_STEPS_SAMPLED, + NUM_AGENT_STEPS_TRAINED, + NUM_ENV_STEPS_SAMPLED, + NUM_ENV_STEPS_SAMPLED_LIFETIME, + NUM_ENV_STEPS_TRAINED, + NUM_ENV_STEPS_TRAINED_LIFETIME, + NUM_MODULE_STEPS_TRAINED, + NUM_SYNCH_WORKER_WEIGHTS, + NUM_TRAINING_STEP_CALLS_SINCE_LAST_SYNCH_WORKER_WEIGHTS, + SYNCH_WORKER_WEIGHTS_TIMER, + SAMPLE_TIMER, + TIMERS, +) +from ray.rllib.utils.metrics.learner_info import LearnerInfoBuilder +from ray.rllib.utils.replay_buffers.multi_agent_replay_buffer import ReplayMode +from ray.rllib.utils.replay_buffers.replay_buffer import _ALL_POLICIES +from ray.rllib.utils.schedules.scheduler import Scheduler +from ray.rllib.utils.typing import ( + 
LearningRateOrSchedule, + PartialAlgorithmConfigDict, + PolicyID, + ResultDict, + SampleBatchType, +) +from ray.tune.execution.placement_groups import PlacementGroupFactory + + +logger = logging.getLogger(__name__) + + +LEARNER_RESULTS_CURR_ENTROPY_COEFF_KEY = "curr_entropy_coeff" + + +class IMPALAConfig(AlgorithmConfig): + """Defines a configuration class from which an Impala can be built. + + .. testcode:: + + from ray.rllib.algorithms.impala import IMPALAConfig + + config = ( + IMPALAConfig() + .environment("CartPole-v1") + .env_runners(num_env_runners=1) + .training(lr=0.0003, train_batch_size_per_learner=512) + .learners(num_learners=1) + ) + # Build a Algorithm object from the config and run 1 training iteration. + algo = config.build() + algo.train() + del algo + + .. testcode:: + + from ray.rllib.algorithms.impala import IMPALAConfig + from ray import air + from ray import tune + + config = ( + IMPALAConfig() + .environment("CartPole-v1") + .env_runners(num_env_runners=1) + .training(lr=tune.grid_search([0.0001, 0.0002]), grad_clip=20.0) + .learners(num_learners=1) + ) + # Run with tune. + tune.Tuner( + "IMPALA", + param_space=config, + run_config=air.RunConfig(stop={"training_iteration": 1}), + ).fit() + """ + + def __init__(self, algo_class=None): + """Initializes a IMPALAConfig instance.""" + self.exploration_config = { # @OldAPIstack + # The Exploration class to use. In the simplest case, this is the name + # (str) of any class present in the `rllib.utils.exploration` package. + # You can also provide the python class directly or the full location + # of your class (e.g. "ray.rllib.utils.exploration.epsilon_greedy. + # EpsilonGreedy"). + "type": "StochasticSampling", + # Add constructor kwargs here (if any). 
+ } + + super().__init__(algo_class=algo_class or IMPALA) + + # fmt: off + # __sphinx_doc_begin__ + + # IMPALA specific settings: + self.vtrace = True + self.vtrace_clip_rho_threshold = 1.0 + self.vtrace_clip_pg_rho_threshold = 1.0 + self.learner_queue_size = 3 + self.timeout_s_sampler_manager = 0.0 + self.timeout_s_aggregator_manager = 0.0 + self.broadcast_interval = 1 + self.num_gpu_loader_threads = 8 + + self.grad_clip = 40.0 + # Note: Only when using enable_rl_module_and_learner=True can the clipping mode + # be configured by the user. On the old API stack, RLlib will always clip by + # global_norm, no matter the value of `grad_clip_by`. + self.grad_clip_by = "global_norm" + + self.vf_loss_coeff = 0.5 + self.entropy_coeff = 0.01 + + # Override some of AlgorithmConfig's default values with IMPALA-specific values. + self.num_learners = 1 + self.num_aggregator_actors_per_learner = 0 + self.rollout_fragment_length = 50 + self.train_batch_size = 500 # @OldAPIstack + self.num_env_runners = 2 + self.lr = 0.0005 + self.min_time_s_per_iteration = 10 + # __sphinx_doc_end__ + # fmt: on + + # IMPALA takes care of its own EnvRunner (weights, connector, metrics) synching. + self._dont_auto_sync_env_runner_states = True + + self.lr_schedule = None # @OldAPIStack + self.entropy_coeff_schedule = None # @OldAPIStack + self.num_multi_gpu_tower_stacks = 1 # @OldAPIstack + self.minibatch_buffer_size = 1 # @OldAPIstack + self.replay_proportion = 0.0 # @OldAPIstack + self.replay_buffer_num_slots = 0 # @OldAPIstack + self.learner_queue_timeout = 300 # @OldAPIstack + self.opt_type = "adam" # @OldAPIstack + self.decay = 0.99 # @OldAPIstack + self.momentum = 0.0 # @OldAPIstack + self.epsilon = 0.1 # @OldAPIstack + self._separate_vf_optimizer = False # @OldAPIstack + self._lr_vf = 0.0005 # @OldAPIstack + self.num_gpus = 1 # @OldAPIstack + self._tf_policy_handles_more_than_one_loss = True # @OldAPIstack + + # Deprecated settings. 
+ self.num_aggregation_workers = DEPRECATED_VALUE + self.max_requests_in_flight_per_aggregator_worker = DEPRECATED_VALUE + + @override(AlgorithmConfig) + def training( + self, + *, + vtrace: Optional[bool] = NotProvided, + vtrace_clip_rho_threshold: Optional[float] = NotProvided, + vtrace_clip_pg_rho_threshold: Optional[float] = NotProvided, + num_gpu_loader_threads: Optional[int] = NotProvided, + num_multi_gpu_tower_stacks: Optional[int] = NotProvided, + minibatch_buffer_size: Optional[int] = NotProvided, + replay_proportion: Optional[float] = NotProvided, + replay_buffer_num_slots: Optional[int] = NotProvided, + learner_queue_size: Optional[int] = NotProvided, + learner_queue_timeout: Optional[float] = NotProvided, + timeout_s_sampler_manager: Optional[float] = NotProvided, + timeout_s_aggregator_manager: Optional[float] = NotProvided, + broadcast_interval: Optional[int] = NotProvided, + grad_clip: Optional[float] = NotProvided, + opt_type: Optional[str] = NotProvided, + lr_schedule: Optional[List[List[Union[int, float]]]] = NotProvided, + decay: Optional[float] = NotProvided, + momentum: Optional[float] = NotProvided, + epsilon: Optional[float] = NotProvided, + vf_loss_coeff: Optional[float] = NotProvided, + entropy_coeff: Optional[LearningRateOrSchedule] = NotProvided, + entropy_coeff_schedule: Optional[List[List[Union[int, float]]]] = NotProvided, + _separate_vf_optimizer: Optional[bool] = NotProvided, + _lr_vf: Optional[float] = NotProvided, + # Deprecated args. + num_aggregation_workers=DEPRECATED_VALUE, + max_requests_in_flight_per_aggregator_worker=DEPRECATED_VALUE, + **kwargs, + ) -> "IMPALAConfig": + """Sets the training related configuration. + + Args: + vtrace: V-trace params (see vtrace_tf/torch.py). + vtrace_clip_rho_threshold: + vtrace_clip_pg_rho_threshold: + num_gpu_loader_threads: The number of GPU-loader threads (per Learner + worker), used to load incoming (CPU) batches to the GPU, if applicable. 
+ The incoming batches are produced by each Learner's LearnerConnector + pipeline. After loading the batches on the GPU, the threads place them + on yet another queue for the Learner thread (only one per Learner + worker) to pick up and perform `forward_train/loss` computations. + num_multi_gpu_tower_stacks: For each stack of multi-GPU towers, how many + slots should we reserve for parallel data loading? Set this to >1 to + load data into GPUs in parallel. This will increase GPU memory usage + proportionally with the number of stacks. + Example: + 2 GPUs and `num_multi_gpu_tower_stacks=3`: + - One tower stack consists of 2 GPUs, each with a copy of the + model/graph. + - Each of the stacks will create 3 slots for batch data on each of its + GPUs, increasing memory requirements on each GPU by 3x. + - This enables us to preload data into these stacks while another stack + is performing gradient calculations. + minibatch_buffer_size: How many train batches should be retained for + minibatching. This conf only has an effect if `num_epochs > 1`. + replay_proportion: Set >0 to enable experience replay. Saved samples will + be replayed with a p:1 proportion to new data samples. + replay_buffer_num_slots: Number of sample batches to store for replay. + The number of transitions saved total will be + (replay_buffer_num_slots * rollout_fragment_length). + learner_queue_size: Max queue size for train batches feeding into the + learner. + learner_queue_timeout: Wait for train batches to be available in minibatch + buffer queue this many seconds. This may need to be increased e.g. when + training with a slow environment. + timeout_s_sampler_manager: The timeout for waiting for sampling results + for workers -- typically if this is too low, the manager won't be able + to retrieve ready sampling results. 
+ timeout_s_aggregator_manager: The timeout for waiting for replay worker + results -- typically if this is too low, the manager won't be able to + retrieve ready replay requests. + broadcast_interval: Number of training step calls before weights are + broadcasted to rollout workers that are sampled during any iteration. + grad_clip: If specified, clip the global norm of gradients by this amount. + opt_type: Either "adam" or "rmsprop". + lr_schedule: Learning rate schedule. In the format of + [[timestep, lr-value], [timestep, lr-value], ...] + Intermediary timesteps will be assigned to interpolated learning rate + values. A schedule should normally start from timestep 0. + decay: Decay setting for the RMSProp optimizer, in case `opt_type=rmsprop`. + momentum: Momentum setting for the RMSProp optimizer, in case + `opt_type=rmsprop`. + epsilon: Epsilon setting for the RMSProp optimizer, in case + `opt_type=rmsprop`. + vf_loss_coeff: Coefficient for the value function term in the loss function. + entropy_coeff: Coefficient for the entropy regularizer term in the loss + function. + entropy_coeff_schedule: Decay schedule for the entropy regularizer. + _separate_vf_optimizer: Set this to true to have two separate optimizers + optimize the policy-and value networks. Only supported for some + algorithms (APPO, IMPALA) on the old API stack. + _lr_vf: If _separate_vf_optimizer is True, define separate learning rate + for the value network. + + Returns: + This updated AlgorithmConfig object. + """ + if num_aggregation_workers != DEPRECATED_VALUE: + deprecation_warning( + old="config.training(num_aggregation_workers=..)", + help="Aggregator workers are no longer supported on the old API " + "stack! To use aggregation (and GPU pre-loading) on the new API " + "stack, activate the new API stack, then set " + "`config.learners(num_aggregator_actors_per_learner=..)`. 
Good " + "choices are normally 1 or 2, but this depends on your overall " + "setup, especially your `EnvRunner` throughput.", + error=True, + ) + if max_requests_in_flight_per_aggregator_worker != DEPRECATED_VALUE: + deprecation_warning( + old="config.training(max_requests_in_flight_per_aggregator_worker=..)", + help="Aggregator workers are no longer supported on the old API " + "stack! To use aggregation (and GPU pre-loading) on the new API " + "stack, activate the new API stack and THEN set " + "`config.learners(max_requests_in_flight_per_aggregator_actor=..)" + "`.", + error=True, + ) + + # Pass kwargs onto super's `training()` method. + super().training(**kwargs) + + if vtrace is not NotProvided: + self.vtrace = vtrace + if vtrace_clip_rho_threshold is not NotProvided: + self.vtrace_clip_rho_threshold = vtrace_clip_rho_threshold + if vtrace_clip_pg_rho_threshold is not NotProvided: + self.vtrace_clip_pg_rho_threshold = vtrace_clip_pg_rho_threshold + if num_gpu_loader_threads is not NotProvided: + self.num_gpu_loader_threads = num_gpu_loader_threads + if num_multi_gpu_tower_stacks is not NotProvided: + self.num_multi_gpu_tower_stacks = num_multi_gpu_tower_stacks + if minibatch_buffer_size is not NotProvided: + self.minibatch_buffer_size = minibatch_buffer_size + if replay_proportion is not NotProvided: + self.replay_proportion = replay_proportion + if replay_buffer_num_slots is not NotProvided: + self.replay_buffer_num_slots = replay_buffer_num_slots + if learner_queue_size is not NotProvided: + self.learner_queue_size = learner_queue_size + if learner_queue_timeout is not NotProvided: + self.learner_queue_timeout = learner_queue_timeout + if broadcast_interval is not NotProvided: + self.broadcast_interval = broadcast_interval + if timeout_s_sampler_manager is not NotProvided: + self.timeout_s_sampler_manager = timeout_s_sampler_manager + if timeout_s_aggregator_manager is not NotProvided: + self.timeout_s_aggregator_manager = timeout_s_aggregator_manager + if 
grad_clip is not NotProvided: + self.grad_clip = grad_clip + if opt_type is not NotProvided: + self.opt_type = opt_type + if lr_schedule is not NotProvided: + self.lr_schedule = lr_schedule + if decay is not NotProvided: + self.decay = decay + if momentum is not NotProvided: + self.momentum = momentum + if epsilon is not NotProvided: + self.epsilon = epsilon + if vf_loss_coeff is not NotProvided: + self.vf_loss_coeff = vf_loss_coeff + if entropy_coeff is not NotProvided: + self.entropy_coeff = entropy_coeff + if entropy_coeff_schedule is not NotProvided: + self.entropy_coeff_schedule = entropy_coeff_schedule + if _separate_vf_optimizer is not NotProvided: + self._separate_vf_optimizer = _separate_vf_optimizer + if _lr_vf is not NotProvided: + self._lr_vf = _lr_vf + + return self + + @override(AlgorithmConfig) + def validate(self) -> None: + # Call the super class' validation method first. + super().validate() + + # IMPALA and APPO need vtrace (A3C Policies no longer exist). + if not self.vtrace: + self._value_error( + "IMPALA and APPO do NOT support vtrace=False anymore! Set " + "`config.training(vtrace=True)`." + ) + + # New API stack checks. + if self.enable_env_runner_and_connector_v2: + # Does NOT support aggregation workers yet or a mixin replay buffer. + if self.replay_ratio != 0.0: + self._value_error( + "The new API stack in combination with the new EnvRunner API " + "does NOT support a mixin replay buffer yet for " + f"{self} (set `config.replay_proportion` to 0.0)!" + ) + # `lr_schedule` checking. + if self.lr_schedule is not None: + self._value_error( + "`lr_schedule` is deprecated and must be None! Use the " + "`lr` setting to setup a schedule." + ) + # Entropy coeff schedule checking. + if self.entropy_coeff_schedule is not None: + self._value_error( + "`entropy_coeff_schedule` is deprecated and must be None! Use the " + "`entropy_coeff` setting to setup a schedule." 
+ ) + Scheduler.validate( + fixed_value_or_schedule=self.entropy_coeff, + setting_name="entropy_coeff", + description="entropy coefficient", + ) + # Learner API specific checks. + # GPU-bound single Learner must be local (faster than remote Learner, + # b/c GPU can update in parallel through the learner thread). + if self.num_gpus_per_learner > 0 and self.num_learners == 1: + self._value_error( + "When running with 1 GPU Learner, this Learner should be local! " + "Set `config.learners(num_learners=0)` to configure a local " + "Learner instance." + ) + # CPU-bound single Learner must be remote (faster than local Learner, + # b/c learner thread would compete with main thread for resources). + elif self.num_gpus_per_learner == 0 and self.num_learners == 0: + self._value_error( + "When running with a CPU Learner, this Learner should be remote! " + "Set `config.learners(num_learners=1)` to configure a single " + "remote Learner instance." + ) + + if self.minibatch_size is not None and not ( + (self.minibatch_size % self.rollout_fragment_length == 0) + and self.minibatch_size <= self.total_train_batch_size + ): + self._value_error( + f"`minibatch_size` ({self._minibatch_size}) must either be None " + "or a multiple of `rollout_fragment_length` " + f"({self.rollout_fragment_length}) while at the same time smaller " + "than or equal to `total_train_batch_size` " + f"({self.total_train_batch_size})!" + ) + # Old API stack checks. + else: + if isinstance(self.entropy_coeff, float) and self.entropy_coeff < 0.0: + self._value_error("`entropy_coeff` must be >= 0.0") + + # If two separate optimizers/loss terms used for tf, must also set + # `_tf_policy_handles_more_than_one_loss` to True. 
+ if ( + self.framework_str in ["tf", "tf2"] + and self._separate_vf_optimizer is True + and self._tf_policy_handles_more_than_one_loss is False + ): + self._value_error( + "`_tf_policy_handles_more_than_one_loss` must be set to True, for " + "TFPolicy to support more than one loss term/optimizer! Try setting " + "config.training(_tf_policy_handles_more_than_one_loss=True)." + ) + + @property + def replay_ratio(self) -> float: + """Returns replay ratio (between 0.0 and 1.0) based off self.replay_proportion. + + Formula: ratio = 1 / proportion + """ + return (1 / self.replay_proportion) if self.replay_proportion > 0 else 0.0 + + @override(AlgorithmConfig) + def get_default_learner_class(self): + if self.framework_str == "torch": + from ray.rllib.algorithms.impala.torch.impala_torch_learner import ( + IMPALATorchLearner, + ) + + return IMPALATorchLearner + elif self.framework_str in ["tf2", "tf"]: + raise ValueError( + "TensorFlow is no longer supported on the new API stack! " + "Use `framework='torch'`." + ) + else: + raise ValueError( + f"The framework {self.framework_str} is not supported. " + "Use `framework='torch'`." + ) + + @override(AlgorithmConfig) + def get_default_rl_module_spec(self) -> RLModuleSpec: + if self.framework_str == "torch": + from ray.rllib.algorithms.ppo.torch.default_ppo_torch_rl_module import ( + DefaultPPOTorchRLModule, + ) + + return RLModuleSpec(module_class=DefaultPPOTorchRLModule) + else: + raise ValueError( + f"The framework {self.framework_str} is not supported. " + "Use either 'torch' or 'tf2'." + ) + + @override(AlgorithmConfig) + def build_learner_connector( + self, + input_observation_space, + input_action_space, + device=None, + ): + connector = super().build_learner_connector( + input_observation_space, + input_action_space, + device, + ) + # Extend all episodes by one artificial timestep to allow the value function net + # to compute the bootstrap values (and add a mask to the batch to know, which + # slots to mask out). 
+ if self.add_default_connectors_to_learner_pipeline: + connector.prepend(AddOneTsToEpisodesAndTruncate()) + return connector + + +ImpalaConfig = IMPALAConfig + + +class IMPALA(Algorithm): + """Importance weighted actor/learner architecture (IMPALA) Algorithm + + == Overview of data flow in IMPALA == + 1. Policy evaluation in parallel across `num_env_runners` actors produces + batches of size `rollout_fragment_length * num_envs_per_env_runner`. + 2. If enabled, the replay buffer stores and produces batches of size + `rollout_fragment_length * num_envs_per_env_runner`. + 3. If enabled, the minibatch ring buffer stores and replays batches of + size `train_batch_size` up to `num_epochs` times per batch. + 4. The learner thread executes data parallel SGD across `num_gpus` GPUs + on batches of size `train_batch_size`. + """ + + @classmethod + @override(Algorithm) + def get_default_config(cls) -> AlgorithmConfig: + return IMPALAConfig() + + @classmethod + @override(Algorithm) + def get_default_policy_class( + cls, config: AlgorithmConfig + ) -> Optional[Type[Policy]]: + if config.framework_str == "torch": + from ray.rllib.algorithms.impala.impala_torch_policy import ( + ImpalaTorchPolicy, + ) + + return ImpalaTorchPolicy + + elif config.framework_str == "tf": + from ray.rllib.algorithms.impala.impala_tf_policy import ( + ImpalaTF1Policy, + ) + + return ImpalaTF1Policy + else: + from ray.rllib.algorithms.impala.impala_tf_policy import ( + ImpalaTF2Policy, + ) + + return ImpalaTF2Policy + + @override(Algorithm) + def setup(self, config: AlgorithmConfig): + super().setup(config) + + # Queue of data to be sent to the Learner. + self.data_to_place_on_learner = [] + self._batch_being_built = [] # @OldAPIStack + + # Create extra aggregation workers and assign each rollout worker to + # one of them. 
+ self._episode_packs_being_built = [] + self._ma_batches_being_built: Dict[int, list] = { + i: [] for i in range(self.config.num_learners or 1) + } + + # Create our local mixin buffer. + if not self.config.enable_rl_module_and_learner: + self.local_mixin_buffer = MixInMultiAgentReplayBuffer( + capacity=( + self.config.replay_buffer_num_slots + if self.config.replay_buffer_num_slots > 0 + else 1 + ), + replay_ratio=self.config.replay_ratio, + replay_mode=ReplayMode.LOCKSTEP, + ) + + # This variable is used to keep track of the statistics from the most recent + # update of the learner group + self._results = {} + + if not self.config.enable_rl_module_and_learner: + # Create and start the learner thread. + self._learner_thread = make_learner_thread(self.env_runner, self.config) + self._learner_thread.start() + + @override(Algorithm) + def training_step(self): + # Old API stack. + if not self.config.enable_rl_module_and_learner: + return self._training_step_old_api_stack() + + do_async_updates = self.config.num_learners > 0 + + # Asynchronously request all EnvRunners to sample and return their current + # (e.g. ConnectorV2) states and sampling metrics/stats. + # Note that each item in `episode_refs` is a reference to a list of Episodes. + with self.metrics.log_time((TIMERS, SAMPLE_TIMER)): + ( + episode_refs, + connector_states, + env_runner_metrics, + env_runner_indices_to_update, + ) = self._sample_and_get_connector_states() + # Reduce EnvRunner metrics over the n EnvRunners. + self.metrics.merge_and_log_n_dicts( + env_runner_metrics, + key=ENV_RUNNER_RESULTS, + ) + + # Log the average number of sample results (list of episodes) received. + self.metrics.log_value(MEAN_NUM_EPISODE_LISTS_RECEIVED, len(episode_refs)) + + time.sleep(0.01) + + # "Batch" collected episode refs into groups, such that exactly + # `total_train_batch_size` timesteps are sent to + # `LearnerGroup.update_from_episodes()`. 
+ if self.config.num_aggregator_actors_per_learner > 0: + data_packages_for_aggregators = self._pre_queue_episode_refs( + episode_refs, package_size=self.config.train_batch_size_per_learner + ) + ma_batches_refs_remote_results = ( + self._aggregator_actor_manager.fetch_ready_async_reqs( + timeout_seconds=0.0, + return_obj_refs=True, + tags="batches", + ) + ) + ma_batches_refs = [] + for call_result in ma_batches_refs_remote_results: + ma_batches_refs.append((call_result.actor_id, call_result.get())) + + while data_packages_for_aggregators: + + def _func(actor, p): + return actor.get_batch(p) + + num_agg = self.config.num_aggregator_actors_per_learner * ( + self.config.num_learners or 1 + ) + packs = data_packages_for_aggregators[:num_agg] + self._aggregator_actor_manager.foreach_actor_async( + func=[functools.partial(_func, p=p) for p in packs], + tag="batches", + ) + data_packages_for_aggregators = data_packages_for_aggregators[num_agg:] + + # Get n lists of m ObjRef[MABatch] (m=num_learners) to perform n calls to + # all learner workers with the already GPU-located batches. + data_packages_for_learner_group = self._pre_queue_batch_refs( + ma_batches_refs + ) + + else: + data_packages_for_learner_group = self._pre_queue_episode_refs( + episode_refs, package_size=self.config.total_train_batch_size + ) + + time.sleep(0.01) + + # Call the LearnerGroup's `update_from_episodes` method. 
+ with self.metrics.log_time((TIMERS, LEARNER_UPDATE_TIMER)): + self.metrics.log_value( + key=MEAN_NUM_LEARNER_GROUP_UPDATE_CALLED, + value=len(data_packages_for_learner_group), + ) + rl_module_state = None + num_learner_group_results_received = 0 + + for batch_ref_or_episode_list_ref in data_packages_for_learner_group: + return_state = ( + self.metrics.peek( + NUM_TRAINING_STEP_CALLS_SINCE_LAST_SYNCH_WORKER_WEIGHTS, + default=0, + ) + >= self.config.broadcast_interval + ) + timesteps = { + NUM_ENV_STEPS_SAMPLED_LIFETIME: self.metrics.peek( + (ENV_RUNNER_RESULTS, NUM_ENV_STEPS_SAMPLED_LIFETIME), default=0 + ), + NUM_ENV_STEPS_TRAINED_LIFETIME: self.metrics.peek( + (LEARNER_RESULTS, ALL_MODULES, NUM_ENV_STEPS_TRAINED_LIFETIME), + default=0, + ), + } + if self.config.num_aggregator_actors_per_learner > 0: + learner_results = self.learner_group.update_from_batch( + batch=batch_ref_or_episode_list_ref, + async_update=do_async_updates, + return_state=return_state, + timesteps=timesteps, + num_epochs=self.config.num_epochs, + minibatch_size=self.config.minibatch_size, + shuffle_batch_per_epoch=self.config.shuffle_batch_per_epoch, + ) + else: + learner_results = self.learner_group.update_from_episodes( + episodes=batch_ref_or_episode_list_ref, + async_update=do_async_updates, + return_state=return_state, + timesteps=timesteps, + num_epochs=self.config.num_epochs, + minibatch_size=self.config.minibatch_size, + shuffle_batch_per_epoch=self.config.shuffle_batch_per_epoch, + ) + # TODO (sven): Rename this metric into a more fitting name: ex. 
+ # `NUM_LEARNER_UPDATED_SINCE_LAST_WEIGHTS_SYNC` + self.metrics.log_value( + NUM_TRAINING_STEP_CALLS_SINCE_LAST_SYNCH_WORKER_WEIGHTS, + 1, + reduce="sum", + ) + if not do_async_updates: + learner_results = [learner_results] + + for results_from_n_learners in learner_results: + if not results_from_n_learners[0]: + continue + num_learner_group_results_received += 1 + for r in results_from_n_learners: + rl_module_state = r.pop( + "_rl_module_state_after_update", rl_module_state + ) + self.metrics.merge_and_log_n_dicts( + stats_dicts=results_from_n_learners, + key=LEARNER_RESULTS, + ) + self.metrics.log_value( + key=MEAN_NUM_LEARNER_GROUP_RESULTS_RECEIVED, + value=num_learner_group_results_received, + ) + + # Update LearnerGroup's own stats. + self.metrics.log_dict(self.learner_group.get_stats(), key=LEARNER_GROUP) + + time.sleep(0.01) + + # Figure out, whether we should sync/broadcast the (remote) EnvRunner states. + # Note: `learner_results` is a List of n (num async calls) Lists of m + # (num Learner workers) ResultDicts each. + if rl_module_state is not None: + self.metrics.set_value( + NUM_TRAINING_STEP_CALLS_SINCE_LAST_SYNCH_WORKER_WEIGHTS, 0 + ) + self.metrics.log_value(NUM_SYNCH_WORKER_WEIGHTS, 1, reduce="sum") + with self.metrics.log_time((TIMERS, SYNCH_WORKER_WEIGHTS_TIMER)): + self.env_runner_group.sync_env_runner_states( + config=self.config, + connector_states=connector_states, + rl_module_state=rl_module_state, + ) + + time.sleep(0.01) + + def _sample_and_get_connector_states(self): + def _remote_sample_get_state_and_metrics(_worker): + _episodes = _worker.sample() + # Get the EnvRunner's connector states. + _connector_states = _worker.get_state( + components=[ + COMPONENT_ENV_TO_MODULE_CONNECTOR, + COMPONENT_MODULE_TO_ENV_CONNECTOR, + ] + ) + _metrics = _worker.get_metrics() + # Return episode lists by reference so we don't have to send them to the + # main algo process, but to the Learner workers directly. 
+ return ray.put(_episodes), _connector_states, _metrics + + env_runner_indices_to_update = set() + episode_refs = [] + connector_states = [] + env_runner_metrics = [] + num_healthy_remote_workers = self.env_runner_group.num_healthy_remote_workers() + + # Perform asynchronous sampling on all (healthy) remote rollout workers. + if num_healthy_remote_workers > 0: + async_results: List[ + Tuple[int, ObjectRef] + ] = self.env_runner_group.fetch_ready_async_reqs( + timeout_seconds=self.config.timeout_s_sampler_manager, + return_obj_refs=False, + ) + self.env_runner_group.foreach_env_runner_async( + _remote_sample_get_state_and_metrics + ) + # Get results from the n different async calls and store those EnvRunner + # indices we should update. + results = [] + for r in async_results: + env_runner_indices_to_update.add(r[0]) + results.append(r[1]) + + for (episodes, states, metrics) in results: + episode_refs.append(episodes) + connector_states.append(states) + env_runner_metrics.append(metrics) + # Sample from the local EnvRunner. + else: + episodes = self.env_runner.sample() + env_runner_metrics = [self.env_runner.get_metrics()] + episode_refs = [ray.put(episodes)] + connector_states = [ + self.env_runner.get_state( + components=[ + COMPONENT_ENV_TO_MODULE_CONNECTOR, + COMPONENT_MODULE_TO_ENV_CONNECTOR, + ] + ) + ] + + return ( + episode_refs, + connector_states, + env_runner_metrics, + env_runner_indices_to_update, + ) + + def _pre_queue_episode_refs( + self, episode_refs: List[ObjectRef], package_size: int + ) -> List[List[ObjectRef]]: + # Each element in this list is itself a list of ObjRef[Episodes]. + # Each ObjRef was returned by one EnvRunner from a single sample() call. 
+ episodes: List[List[ObjectRef]] = [] + + for ref in episode_refs: + self._episode_packs_being_built.append(ref) + if ( + len(self._episode_packs_being_built) + * self.config.num_envs_per_env_runner + * self.config.get_rollout_fragment_length() + >= package_size + ): + episodes.append(self._episode_packs_being_built) + self._episode_packs_being_built = [] + + return episodes + + def _pre_queue_batch_refs( + self, batch_refs: List[Tuple[int, ObjectRef]] + ) -> List[List[ObjectRef]]: + # `batch_refs` is a list of tuple(aggregator_actor_id, ObjRef[MABatch]). + + # Each ObjRef[MABatch] was returned by one AggregatorActor from a single + # `get_batch()` call and the underlying MABatch is already located on a + # particular GPU (matching one particular Learner). + for agg_actor_id, ma_batch_ref in batch_refs: + learner_actor_id = self._aggregator_actor_to_learner[agg_actor_id] + self._ma_batches_being_built[learner_actor_id].append(ma_batch_ref) + + # Construct a n-group of batches (n=num_learners) as long as we still have + # at least one batch per learner in our queue. + batch_refs_for_learner_group: List[List[ObjectRef]] = [] + while all( + learner_list for learner_list in self._ma_batches_being_built.values() + ): + batch_refs_for_learner_group.append( + [ + learner_list.pop(0) + for learner_list in self._ma_batches_being_built.values() + ] + ) + + return batch_refs_for_learner_group + + @classmethod + @override(Algorithm) + def default_resource_request( + cls, + config: Union[AlgorithmConfig, PartialAlgorithmConfigDict], + ): + if isinstance(config, AlgorithmConfig): + cf: IMPALAConfig = config + else: + cf: IMPALAConfig = cls.get_default_config().update_from_dict(config) + + eval_config = cf.get_evaluation_config_object() + + bundles = [] + + # Main process (old API stack). + if not cf.enable_rl_module_and_learner: + bundles.append( + { + "CPU": cf.num_cpus_for_main_process, + "GPU": 0 if cf._fake_gpus else cf.num_gpus, + } + ) + # Main process (no local learner). 
+ elif cf.num_learners > 0: + bundles.append({"CPU": cf.num_cpus_for_main_process}) + # Main process (local learner). + else: + bundles.append( + { + "CPU": max( + cf.num_cpus_for_main_process, + cf.num_cpus_per_learner if cf.num_gpus_per_learner == 0 else 0, + ), + "GPU": max( + 0, + cf.num_gpus_per_learner + - 0.01 * cf.num_aggregator_actors_per_learner, + ), + } + ) + # Aggregation actors (for the local learner). + bundles += [ + {"CPU": 1, "GPU": 0.01 if cf.num_gpus_per_learner > 0 else 0} + for _ in range(cf.num_aggregator_actors_per_learner) + ] + + # EnvRunners. + bundles += [ + { + "CPU": cf.num_cpus_per_env_runner, + "GPU": cf.num_gpus_per_env_runner, + **cf.custom_resources_per_env_runner, + } + for _ in range(cf.num_env_runners) + ] + + # Evaluation (remote) workers. + bundles += ( + [ + { + # Note: The local eval worker is located on the driver + # CPU or not even created iff >0 eval workers. + "CPU": eval_config.num_cpus_per_env_runner, + "GPU": eval_config.num_gpus_per_env_runner, + **eval_config.custom_resources_per_env_runner, + } + for _ in range(cf.evaluation_num_env_runners) + ] + if cf.evaluation_interval + else [] + ) + # TODO (avnishn): Remove this once we have a way to extend placement group + # factories. + # Only if we have actual (remote) learner workers. In case of a local learner, + # the resource has already been taken care of above. + if cf.enable_rl_module_and_learner and cf.num_learners > 0: + bundles += cls._get_learner_bundles(cf) + + # Return PlacementGroupFactory containing all needed resources + # (already properly defined as device bundles). + return PlacementGroupFactory( + bundles=bundles, + strategy=cf.placement_strategy, + ) + + @OldAPIStack + def _training_step_old_api_stack(self): + # First, check, whether our learner thread is still healthy. 
+ if not self._learner_thread.is_alive(): + raise RuntimeError("The learner thread died while training!") + + # Get sampled SampleBatches from our workers (by ray references if we use + # tree-aggregation). + unprocessed_sample_batches = self._get_samples_from_workers_old_api_stack( + return_object_refs=False, + ) + # Tag workers that actually produced ready sample batches this iteration. + # Those workers will have to get updated at the end of the iteration. + workers_that_need_updates = { + worker_id for worker_id, _ in unprocessed_sample_batches + } + + # Resolve collected batches here on local process (using the mixin buffer). + batches = self._process_experiences_old_api_stack(unprocessed_sample_batches) + + # Increase sampling counters now that we have the actual SampleBatches on + # the local process (and can measure their sizes). + for batch in batches: + self._counters[NUM_ENV_STEPS_SAMPLED] += batch.count + self._counters[NUM_AGENT_STEPS_SAMPLED] += batch.agent_steps() + # Concatenate single batches into batches of size `total_train_batch_size`. + self._concatenate_batches_and_pre_queue(batches) + # Move train batches (of size `total_train_batch_size`) onto learner queue. + self._place_processed_samples_on_learner_thread_queue() + # Extract most recent train results from learner thread. + train_results = self._process_trained_results() + + # Sync worker weights (only those policies that were actually updated). + with self._timers[SYNCH_WORKER_WEIGHTS_TIMER]: + pids = list(train_results.keys()) + self._update_workers_old_api_stack( + workers_that_need_updates=workers_that_need_updates, + policy_ids=pids, + ) + + # With a training step done, try to bring any aggregators back to life + # if necessary. + # Aggregation workers are stateless, so we do not need to restore any + # state here. 
+ if self._aggregator_actor_manager: + self._aggregator_actor_manager.probe_unhealthy_actors( + timeout_seconds=self.config.env_runner_health_probe_timeout_s, + mark_healthy=True, + ) + + return train_results + + @OldAPIStack + def _get_samples_from_workers_old_api_stack( + self, + return_object_refs: Optional[bool] = False, + ) -> List[Tuple[int, Union[ObjectRef, SampleBatchType]]]: + """Get samples from rollout workers for training. + + Args: + return_object_refs: If True, return ObjectRefs instead of the samples + directly. This is useful when using aggregator workers so that data + collected on rollout workers is directly de referenced on the aggregator + workers instead of first in the driver and then on the aggregator + workers. + + Returns: + a list of tuples of (worker_index, sample batch or ObjectRef to a sample + batch) + + """ + with self._timers[SAMPLE_TIMER]: + # Sample from healthy remote workers by default. If there is no healthy + # worker (either because they have all died, or because there was none to + # begin) check if the local_worker exists. If the local worker has an + # env_instance (either because there are no remote workers or + # self.config.create_env_on_local_worker == True), then sample from the + # local worker. Otherwise just return an empty list. + if self.env_runner_group.num_healthy_remote_workers() > 0: + # Perform asynchronous sampling on all (remote) rollout workers. 
+ self.env_runner_group.foreach_env_runner_async( + lambda worker: worker.sample() + ) + sample_batches: List[ + Tuple[int, ObjectRef] + ] = self.env_runner_group.fetch_ready_async_reqs( + timeout_seconds=self.config.timeout_s_sampler_manager, + return_obj_refs=return_object_refs, + ) + elif self.config.num_env_runners == 0 or ( + self.env_runner and self.env_runner.async_env is not None + ): + # Sampling from the local worker + sample_batch = self.env_runner.sample() + if return_object_refs: + sample_batch = ray.put(sample_batch) + sample_batches = [(0, sample_batch)] + else: + # Not much we can do. Return empty list and wait. + sample_batches = [] + + return sample_batches + + @OldAPIStack + def _process_experiences_old_api_stack( + self, + worker_to_sample_batches: List[Tuple[int, SampleBatch]], + ) -> List[SampleBatchType]: + """Process sample batches directly on the driver, for training. + + Args: + worker_to_sample_batches: List of (worker_id, sample_batch) tuples. + + Returns: + Batches that have been processed by the mixin buffer. + + """ + batches = [b for _, b in worker_to_sample_batches] + processed_batches = [] + + for batch in batches: + assert not isinstance( + batch, ObjectRef + ), "`IMPALA._process_experiences_old_api_stack` can not handle ObjectRefs!" + batch = batch.decompress_if_needed() + # Only make a pass through the buffer, if replay proportion is > 0.0 (and + # we actually have one). + self.local_mixin_buffer.add(batch) + batch = self.local_mixin_buffer.replay(_ALL_POLICIES) + if batch: + processed_batches.append(batch) + + return processed_batches + + @OldAPIStack + def _concatenate_batches_and_pre_queue(self, batches: List[SampleBatch]) -> None: + """Concatenate batches that are being returned from rollout workers + + Args: + batches: List of batches of experiences from EnvRunners. 
+ """ + + def aggregate_into_larger_batch(): + if ( + sum(b.count for b in self._batch_being_built) + >= self.config.total_train_batch_size + ): + batch_to_add = concat_samples(self._batch_being_built) + self.data_to_place_on_learner.append(batch_to_add) + self._batch_being_built = [] + + for batch in batches: + # TODO (sven): Strange bug after a RolloutWorker crash and proper + # restart. The bug is related to (old, non-V2) connectors being used and + # seems to happen inside the AgentCollector's `add_action_reward_next_obs` + # method, at the end of which the number of vf_preds (and all other + # extra action outs) in the batch is one smaller than the number of obs/ + # actions/rewards, which then leads to a malformed train batch. + # IMPALA/APPO crash inside the loss function (during v-trace operations) + # b/c of the resulting shape mismatch. The following if-block prevents + # this from happening and it can be removed once we are on the new API + # stack for good (and use the new connectors and also no longer + # AgentCollectors, RolloutWorkers, Policies, TrajectoryView API, etc..): + if ( + self.config.batch_mode == "truncate_episodes" + and self.config.restart_failed_env_runners + ): + if any( + SampleBatch.VF_PREDS in pb + and ( + pb[SampleBatch.VF_PREDS].shape[0] + != pb[SampleBatch.REWARDS].shape[0] + ) + for pb in batch.policy_batches.values() + ): + continue + + self._batch_being_built.append(batch) + aggregate_into_larger_batch() + + @OldAPIStack + def _learn_on_processed_samples(self) -> ResultDict: + """Update the learner group with the latest batch of processed samples. + + Returns: + Aggregated results from the learner group after an update is completed. + + """ + # Nothing on the queue -> Don't send requests to learner group + # or no results ready (from previous `self.learner_group.update()` calls) for + # reducing. + if not self.data_to_place_on_learner: + return {} + + # There are batches on the queue -> Send them all to the learner group. 
+ batches = self.data_to_place_on_learner[:] + self.data_to_place_on_learner.clear() + + # If there are no learner workers and learning is directly on the driver + # Then we can't do async updates, so we need to block. + async_update = self.config.num_learners > 0 + results = [] + for batch in batches: + results = self.learner_group.update_from_batch( + batch=batch, + timesteps={ + NUM_ENV_STEPS_SAMPLED_LIFETIME: ( + self.metrics.peek(NUM_ENV_STEPS_SAMPLED_LIFETIME) + ), + }, + async_update=async_update, + num_epochs=self.config.num_epochs, + minibatch_size=self.config.minibatch_size, + ) + if not async_update: + results = [results] + + for r in results: + self._counters[NUM_ENV_STEPS_TRAINED] += r[ALL_MODULES].pop( + NUM_ENV_STEPS_TRAINED + ) + self._counters[NUM_AGENT_STEPS_TRAINED] += r[ALL_MODULES].pop( + NUM_MODULE_STEPS_TRAINED + ) + + self._counters.update(self.learner_group.get_stats()) + # If there are results, reduce-mean over each individual value and return. + if results: + return tree.map_structure(lambda *x: np.mean(x), *results) + + # Nothing on the queue -> Don't send requests to learner group + # or no results ready (from previous `self.learner_group.update_from_batch()` + # calls) for reducing. + return {} + + @OldAPIStack + def _place_processed_samples_on_learner_thread_queue(self) -> None: + """Place processed samples on the learner queue for training.""" + for i, batch in enumerate(self.data_to_place_on_learner): + try: + self._learner_thread.inqueue.put( + batch, + # Setting block = True for the very last item in our list prevents + # the learner thread, this main thread, and the GPU loader threads + # from thrashing when there are more samples than the learner can + # reasonably process. 
+ # see https://github.com/ray-project/ray/pull/26581#issuecomment-1187877674 # noqa + block=i == len(self.data_to_place_on_learner) - 1, + ) + self._counters["num_samples_added_to_queue"] += ( + batch.agent_steps() + if self.config.count_steps_by == "agent_steps" + else batch.count + ) + except queue.Full: + self._counters["num_times_learner_queue_full"] += 1 + + self.data_to_place_on_learner.clear() + + @OldAPIStack + def _process_trained_results(self) -> ResultDict: + """Process training results that are outputed by the learner thread. + + Returns: + Aggregated results from the learner thread after an update is completed. + + """ + # Get learner outputs/stats from output queue. + num_env_steps_trained = 0 + num_agent_steps_trained = 0 + learner_infos = [] + # Loop through output queue and update our counts. + for _ in range(self._learner_thread.outqueue.qsize()): + ( + env_steps, + agent_steps, + learner_results, + ) = self._learner_thread.outqueue.get(timeout=0.001) + num_env_steps_trained += env_steps + num_agent_steps_trained += agent_steps + if learner_results: + learner_infos.append(learner_results) + # Nothing new happened since last time, use the same learner stats. + if not learner_infos: + final_learner_info = copy.deepcopy(self._learner_thread.learner_info) + # Accumulate learner stats using the `LearnerInfoBuilder` utility. + else: + builder = LearnerInfoBuilder() + for info in learner_infos: + builder.add_learn_on_batch_results_multi_agent(info) + final_learner_info = builder.finalize() + + # Update the steps trained counters. + self._counters[NUM_ENV_STEPS_TRAINED] += num_env_steps_trained + self._counters[NUM_AGENT_STEPS_TRAINED] += num_agent_steps_trained + + return final_learner_info + + @OldAPIStack + def _update_workers_old_api_stack( + self, + workers_that_need_updates: Set[int], + policy_ids: Optional[List[PolicyID]] = None, + ) -> None: + """Updates all RolloutWorkers that require updating. 
+ + Updates only if NUM_TRAINING_STEP_CALLS_SINCE_LAST_SYNCH_WORKER_WEIGHTS has been + reached and the worker has sent samples in this iteration. Also only updates + those policies, whose IDs are given via `policies` (if None, update all + policies). + + Args: + workers_that_need_updates: Set of worker IDs that need to be updated. + policy_ids: Optional list of Policy IDs to update. If None, will update all + policies on the to-be-updated workers. + """ + # Update global vars of the local worker. + if self.config.policy_states_are_swappable: + self.env_runner.lock() + global_vars = { + "timestep": self._counters[NUM_AGENT_STEPS_TRAINED], + "num_grad_updates_per_policy": { + pid: self.env_runner.policy_map[pid].num_grad_updates + for pid in policy_ids or [] + }, + } + self.env_runner.set_global_vars(global_vars, policy_ids=policy_ids) + if self.config.policy_states_are_swappable: + self.env_runner.unlock() + + # Only need to update workers if there are remote workers. + self._counters[NUM_TRAINING_STEP_CALLS_SINCE_LAST_SYNCH_WORKER_WEIGHTS] += 1 + if ( + self.env_runner_group.num_remote_workers() > 0 + and self._counters[NUM_TRAINING_STEP_CALLS_SINCE_LAST_SYNCH_WORKER_WEIGHTS] + >= self.config.broadcast_interval + and workers_that_need_updates + ): + if self.config.policy_states_are_swappable: + self.env_runner.lock() + weights = self.env_runner.get_weights(policy_ids) + if self.config.policy_states_are_swappable: + self.env_runner.unlock() + weights_ref = ray.put(weights) + + self._learner_thread.policy_ids_updated.clear() + self._counters[NUM_TRAINING_STEP_CALLS_SINCE_LAST_SYNCH_WORKER_WEIGHTS] = 0 + self._counters[NUM_SYNCH_WORKER_WEIGHTS] += 1 + self.env_runner_group.foreach_env_runner( + func=lambda w: w.set_weights(ray.get(weights_ref), global_vars), + local_env_runner=False, + remote_worker_ids=list(workers_that_need_updates), + timeout_seconds=0, # Don't wait for the workers to finish. 
+ ) + + @override(Algorithm) + def _compile_iteration_results_old_api_stack(self, *args, **kwargs): + result = super()._compile_iteration_results_old_api_stack(*args, **kwargs) + if not self.config.enable_rl_module_and_learner: + result = self._learner_thread.add_learner_metrics( + result, overwrite_learner_info=False + ) + return result + + +Impala = IMPALA + + +@OldAPIStack +def make_learner_thread(local_worker, config): + if not config["simple_optimizer"]: + logger.info( + "Enabling multi-GPU mode, {} GPUs, {} parallel tower-stacks".format( + config["num_gpus"], config["num_multi_gpu_tower_stacks"] + ) + ) + num_stacks = config["num_multi_gpu_tower_stacks"] + buffer_size = config["minibatch_buffer_size"] + if num_stacks < buffer_size: + logger.warning( + "In multi-GPU mode you should have at least as many " + "multi-GPU tower stacks (to load data into on one device) as " + "you have stack-index slots in the buffer! You have " + f"configured {num_stacks} stacks and a buffer of size " + f"{buffer_size}. Setting " + f"`minibatch_buffer_size={num_stacks}`." 
+ ) + config["minibatch_buffer_size"] = num_stacks + + learner_thread = MultiGPULearnerThread( + local_worker, + num_gpus=config["num_gpus"], + lr=config["lr"], + train_batch_size=config["train_batch_size"], + num_multi_gpu_tower_stacks=config["num_multi_gpu_tower_stacks"], + num_sgd_iter=config["num_epochs"], + learner_queue_size=config["learner_queue_size"], + learner_queue_timeout=config["learner_queue_timeout"], + num_data_load_threads=config["num_gpu_loader_threads"], + ) + else: + learner_thread = LearnerThread( + local_worker, + minibatch_buffer_size=config["minibatch_buffer_size"], + num_sgd_iter=config["num_epochs"], + learner_queue_size=config["learner_queue_size"], + learner_queue_timeout=config["learner_queue_timeout"], + ) + return learner_thread diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/impala_learner.py b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/impala_learner.py new file mode 100644 index 0000000000000000000000000000000000000000..48e04636d003c9d92ed664f901bc9d692308aa33 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/impala_learner.py @@ -0,0 +1,211 @@ +from collections import deque +import threading +import time +from typing import Any, Dict, Union + +import ray +from ray.rllib.algorithms.appo.utils import CircularBuffer +from ray.rllib.algorithms.impala.impala import LEARNER_RESULTS_CURR_ENTROPY_COEFF_KEY +from ray.rllib.core.learner.learner import Learner +from ray.rllib.core.rl_module.apis import ValueFunctionAPI +from ray.rllib.utils.annotations import ( + override, + OverrideToImplementCustomLogic_CallToSuperRecommended, +) +from ray.rllib.utils.framework import try_import_torch +from ray.rllib.utils.lambda_defaultdict import LambdaDefaultDict +from ray.rllib.utils.metrics import ( + ALL_MODULES, + NUM_ENV_STEPS_SAMPLED_LIFETIME, +) +from ray.rllib.utils.metrics.metrics_logger import MetricsLogger +from ray.rllib.utils.schedules.scheduler import 
Scheduler +from ray.rllib.utils.typing import ModuleID, ResultDict + +torch, _ = try_import_torch() + +GPU_LOADER_QUEUE_WAIT_TIMER = "gpu_loader_queue_wait_timer" +GPU_LOADER_LOAD_TO_GPU_TIMER = "gpu_loader_load_to_gpu_timer" +LEARNER_THREAD_IN_QUEUE_WAIT_TIMER = "learner_thread_in_queue_wait_timer" +LEARNER_THREAD_ENV_STEPS_DROPPED = "learner_thread_env_steps_dropped" +LEARNER_THREAD_UPDATE_TIMER = "learner_thread_update_timer" +RAY_GET_EPISODES_TIMER = "ray_get_episodes_timer" + +QUEUE_SIZE_GPU_LOADER_QUEUE = "queue_size_gpu_loader_queue" +QUEUE_SIZE_LEARNER_THREAD_QUEUE = "queue_size_learner_thread_queue" +QUEUE_SIZE_RESULTS_QUEUE = "queue_size_results_queue" + +_CURRENT_GLOBAL_TIMESTEPS = None + + +class IMPALALearner(Learner): + @override(Learner) + def build(self) -> None: + super().build() + + # TODO (sven): We replace the dummy RLock here for APPO/IMPALA, b/c these algos + # require this for thread safety reasons. + # An RLock breaks our current OfflineData and OfflinePreLearner logic, in which + # the Learner (which contains a MetricsLogger) is serialized and deserialized. + # We will have to fix this offline RL logic first, then can remove this hack + # here and return to always using the RLock. + self.metrics._threading_lock = threading.RLock() + + # Dict mapping module IDs to the respective entropy Scheduler instance. + self.entropy_coeff_schedulers_per_module: Dict[ + ModuleID, Scheduler + ] = LambdaDefaultDict( + lambda module_id: Scheduler( + fixed_value_or_schedule=( + self.config.get_config_for_module(module_id).entropy_coeff + ), + framework=self.framework, + device=self._device, + ) + ) + + # Default is to have a learner thread. + if not hasattr(self, "_learner_thread_in_queue"): + self._learner_thread_in_queue = deque(maxlen=self.config.learner_queue_size) + + # Create and start the Learner thread. 
+ self._learner_thread = _LearnerThread( + update_method=self._update_from_batch_or_episodes, + in_queue=self._learner_thread_in_queue, + metrics_logger=self.metrics, + ) + self._learner_thread.start() + + @override(Learner) + def update_from_batch( + self, + batch: Any, + *, + timesteps: Dict[str, Any], + **kwargs, + ) -> ResultDict: + global _CURRENT_GLOBAL_TIMESTEPS + _CURRENT_GLOBAL_TIMESTEPS = timesteps or {} + + if isinstance(batch, ray.ObjectRef): + batch = ray.get(batch) + + self.before_gradient_based_update(timesteps=timesteps or {}) + + if isinstance(self._learner_thread_in_queue, CircularBuffer): + ts_dropped = self._learner_thread_in_queue.add(batch) + self.metrics.log_value( + (ALL_MODULES, LEARNER_THREAD_ENV_STEPS_DROPPED), + ts_dropped, + reduce="sum", + ) + # Enqueue to Learner thread's in-queue. + else: + _LearnerThread.enqueue(self._learner_thread_in_queue, batch, self.metrics) + + return self.metrics.reduce() + + @OverrideToImplementCustomLogic_CallToSuperRecommended + def before_gradient_based_update(self, *, timesteps: Dict[str, Any]) -> None: + super().before_gradient_based_update(timesteps=timesteps) + + for module_id in self.module.keys(): + # Update entropy coefficient via our Scheduler. 
+ new_entropy_coeff = self.entropy_coeff_schedulers_per_module[ + module_id + ].update(timestep=timesteps.get(NUM_ENV_STEPS_SAMPLED_LIFETIME, 0)) + self.metrics.log_value( + (module_id, LEARNER_RESULTS_CURR_ENTROPY_COEFF_KEY), + new_entropy_coeff, + window=1, + ) + + @override(Learner) + def remove_module(self, module_id: str): + super().remove_module(module_id) + self.entropy_coeff_schedulers_per_module.pop(module_id) + + @classmethod + @override(Learner) + def rl_module_required_apis(cls) -> list[type]: + # In order for a PPOLearner to update an RLModule, it must implement the + # following APIs: + return [ValueFunctionAPI] + + +ImpalaLearner = IMPALALearner + + +class _LearnerThread(threading.Thread): + def __init__( + self, + *, + update_method, + in_queue: deque, + metrics_logger, + ): + super().__init__() + self.daemon = True + self.metrics: MetricsLogger = metrics_logger + self.stopped = False + + self._update_method = update_method + self._in_queue: Union[deque, CircularBuffer] = in_queue + + def run(self) -> None: + while not self.stopped: + self.step() + + def step(self): + global _CURRENT_GLOBAL_TIMESTEPS + + # Get a new batch from the GPU-data (deque.pop -> newest item first). + with self.metrics.log_time((ALL_MODULES, LEARNER_THREAD_IN_QUEUE_WAIT_TIMER)): + # Get a new batch from the GPU-data (learner queue OR circular buffer). + if isinstance(self._in_queue, CircularBuffer): + ma_batch_on_gpu = self._in_queue.sample() + else: + # Queue is empty: Sleep a tiny bit to avoid CPU-thrashing. + if not self._in_queue: + time.sleep(0.001) + return + # Consume from the left (oldest batches first). + # If we consumed from the right, we would run into the danger of + # learning from newer batches (left side) most times, BUT sometimes + # grabbing older batches (right area of deque). + ma_batch_on_gpu = self._in_queue.popleft() + + # Call the update method on the batch. 
+ with self.metrics.log_time((ALL_MODULES, LEARNER_THREAD_UPDATE_TIMER)): + # TODO (sven): For multi-agent AND SGD iter > 1, we need to make sure + # this thread has the information about the min minibatches necessary + # (due to different agents taking different steps in the env, e.g. + # MA-CartPole). + self._update_method( + batch=ma_batch_on_gpu, + timesteps=_CURRENT_GLOBAL_TIMESTEPS, + ) + + @staticmethod + def enqueue(learner_queue: deque, batch, metrics): + # Right-append to learner queue (a deque). If full, drops the leftmost + # (oldest) item in the deque. + # Note that we consume from the left (oldest first), which is why the queue size + # should probably always be small'ish (<< 10), otherwise we run into the danger + # of training with very old samples. + # If we consumed from the right, we would run into the danger of learning + # from newer batches (left side) most times, BUT sometimes grabbing a + # really old batches (right area of deque). + if len(learner_queue) == learner_queue.maxlen: + metrics.log_value( + (ALL_MODULES, LEARNER_THREAD_ENV_STEPS_DROPPED), + learner_queue.popleft().env_steps(), + reduce="sum", + ) + learner_queue.append(batch) + + # Log current queue size. + metrics.log_value( + (ALL_MODULES, QUEUE_SIZE_LEARNER_THREAD_QUEUE), + len(learner_queue), + ) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/impala_tf_policy.py b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/impala_tf_policy.py new file mode 100644 index 0000000000000000000000000000000000000000..d06d0065b124c7e45b3b69194d34b23c9eddc66f --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/impala_tf_policy.py @@ -0,0 +1,443 @@ +"""Adapted from A3CTFPolicy to add V-trace. 
+ +Keep in sync with changes to A3CTFPolicy and VtraceSurrogatePolicy.""" + +import numpy as np +import logging +import gymnasium as gym +from typing import Dict, List, Optional, Type, Union + +from ray.rllib.algorithms.impala import vtrace_tf as vtrace +from ray.rllib.evaluation.postprocessing import compute_bootstrap_value +from ray.rllib.models.modelv2 import ModelV2 +from ray.rllib.models.tf.tf_action_dist import Categorical, TFActionDistribution +from ray.rllib.policy.dynamic_tf_policy_v2 import DynamicTFPolicyV2 +from ray.rllib.policy.eager_tf_policy_v2 import EagerTFPolicyV2 +from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.policy.tf_mixins import LearningRateSchedule, EntropyCoeffSchedule +from ray.rllib.utils import force_list +from ray.rllib.utils.annotations import override +from ray.rllib.utils.framework import try_import_tf +from ray.rllib.utils.tf_utils import explained_variance +from ray.rllib.policy.tf_mixins import GradStatsMixin, ValueNetworkMixin +from ray.rllib.utils.typing import ( + LocalOptimizer, + ModelGradients, + TensorType, + TFPolicyV2Type, +) + +tf1, tf, tfv = try_import_tf() + +logger = logging.getLogger(__name__) + + +class VTraceLoss: + def __init__( + self, + actions, + actions_logp, + actions_entropy, + dones, + behaviour_action_logp, + behaviour_logits, + target_logits, + discount, + rewards, + values, + bootstrap_value, + dist_class, + model, + valid_mask, + config, + vf_loss_coeff=0.5, + entropy_coeff=0.01, + clip_rho_threshold=1.0, + clip_pg_rho_threshold=1.0, + ): + """Policy gradient loss with vtrace importance weighting. + + VTraceLoss takes tensors of shape [T, B, ...], where `B` is the + batch_size. The reason we need to know `B` is for V-trace to properly + handle episode cut boundaries. + + Args: + actions: An int|float32 tensor of shape [T, B, ACTION_SPACE]. + actions_logp: A float32 tensor of shape [T, B]. + actions_entropy: A float32 tensor of shape [T, B]. + dones: A bool tensor of shape [T, B]. 
+ behaviour_action_logp: Tensor of shape [T, B]. + behaviour_logits: A list with length of ACTION_SPACE of float32 + tensors of shapes + [T, B, ACTION_SPACE[0]], + ..., + [T, B, ACTION_SPACE[-1]] + target_logits: A list with length of ACTION_SPACE of float32 + tensors of shapes + [T, B, ACTION_SPACE[0]], + ..., + [T, B, ACTION_SPACE[-1]] + discount: A float32 scalar. + rewards: A float32 tensor of shape [T, B]. + values: A float32 tensor of shape [T, B]. + bootstrap_value: A float32 tensor of shape [B]. + dist_class: action distribution class for logits. + valid_mask: A bool tensor of valid RNN input elements (#2992). + config: Algorithm config dict. + """ + + # Compute vtrace on the CPU for better performance. + with tf.device("/cpu:0"): + self.vtrace_returns = vtrace.multi_from_logits( + behaviour_action_log_probs=behaviour_action_logp, + behaviour_policy_logits=behaviour_logits, + target_policy_logits=target_logits, + actions=tf.unstack(actions, axis=2), + discounts=tf.cast(~tf.cast(dones, tf.bool), tf.float32) * discount, + rewards=rewards, + values=values, + bootstrap_value=bootstrap_value, + dist_class=dist_class, + model=model, + clip_rho_threshold=tf.cast(clip_rho_threshold, tf.float32), + clip_pg_rho_threshold=tf.cast(clip_pg_rho_threshold, tf.float32), + ) + self.value_targets = self.vtrace_returns.vs + + # The policy gradients loss. + masked_pi_loss = tf.boolean_mask( + actions_logp * self.vtrace_returns.pg_advantages, valid_mask + ) + self.pi_loss = -tf.reduce_sum(masked_pi_loss) + self.mean_pi_loss = -tf.reduce_mean(masked_pi_loss) + + # The baseline loss. + delta = tf.boolean_mask(values - self.vtrace_returns.vs, valid_mask) + delta_squarred = tf.math.square(delta) + self.vf_loss = 0.5 * tf.reduce_sum(delta_squarred) + self.mean_vf_loss = 0.5 * tf.reduce_mean(delta_squarred) + + # The entropy loss. 
+ masked_entropy = tf.boolean_mask(actions_entropy, valid_mask) + self.entropy = tf.reduce_sum(masked_entropy) + self.mean_entropy = tf.reduce_mean(masked_entropy) + + # The summed weighted loss. + self.total_loss = self.pi_loss - self.entropy * entropy_coeff + + # Optional vf loss (or in a separate term due to separate + # optimizers/networks). + self.loss_wo_vf = self.total_loss + if not config["_separate_vf_optimizer"]: + self.total_loss += self.vf_loss * vf_loss_coeff + + +def _make_time_major(policy, seq_lens, tensor): + """Swaps batch and trajectory axis. + + Args: + policy: Policy reference + seq_lens: Sequence lengths if recurrent or None + tensor: A tensor or list of tensors to reshape. + trajectory item. + + Returns: + res: A tensor with swapped axes or a list of tensors with + swapped axes. + """ + if isinstance(tensor, list): + return [_make_time_major(policy, seq_lens, t) for t in tensor] + + if policy.is_recurrent(): + B = tf.shape(seq_lens)[0] + T = tf.shape(tensor)[0] // B + else: + # Important: chop the tensor into batches at known episode cut + # boundaries. + # TODO: (sven) this is kind of a hack and won't work for + # batch_mode=complete_episodes. + T = policy.config["rollout_fragment_length"] + B = tf.shape(tensor)[0] // T + rs = tf.reshape(tensor, tf.concat([[B, T], tf.shape(tensor)[1:]], axis=0)) + + # swap B and T axes + res = tf.transpose(rs, [1, 0] + list(range(2, 1 + int(tf.shape(tensor).shape[0])))) + + return res + + +class VTraceClipGradients: + """VTrace version of gradient computation logic.""" + + def __init__(self): + """No special initialization required.""" + pass + + def compute_gradients_fn( + self, optimizer: LocalOptimizer, loss: TensorType + ) -> ModelGradients: + # Supporting more than one loss/optimizer. 
+ trainable_variables = self.model.trainable_variables() + if self.config["_tf_policy_handles_more_than_one_loss"]: + optimizers = force_list(optimizer) + losses = force_list(loss) + assert len(optimizers) == len(losses) + clipped_grads_and_vars = [] + for optim, loss_ in zip(optimizers, losses): + grads_and_vars = optim.compute_gradients(loss_, trainable_variables) + clipped_g_and_v = [] + for g, v in grads_and_vars: + if g is not None: + clipped_g, _ = tf.clip_by_global_norm( + [g], self.config["grad_clip"] + ) + clipped_g_and_v.append((clipped_g[0], v)) + clipped_grads_and_vars.append(clipped_g_and_v) + + self.grads = [g for g_and_v in clipped_grads_and_vars for (g, v) in g_and_v] + # Only one optimizer and and loss term. + else: + grads_and_vars = optimizer.compute_gradients( + loss, self.model.trainable_variables() + ) + grads = [g for (g, v) in grads_and_vars] + self.grads, _ = tf.clip_by_global_norm(grads, self.config["grad_clip"]) + clipped_grads_and_vars = list(zip(self.grads, trainable_variables)) + + return clipped_grads_and_vars + + +class VTraceOptimizer: + """Optimizer function for VTrace policies.""" + + def __init__(self): + pass + + # TODO: maybe standardize this function, so the choice of optimizers are more + # predictable for common algorithms. + def optimizer( + self, + ) -> Union["tf.keras.optimizers.Optimizer", List["tf.keras.optimizers.Optimizer"]]: + config = self.config + if config["opt_type"] == "adam": + if config["framework"] == "tf2": + optim = tf.keras.optimizers.Adam(self.cur_lr) + if config["_separate_vf_optimizer"]: + return optim, tf.keras.optimizers.Adam(config["_lr_vf"]) + else: + optim = tf1.train.AdamOptimizer(self.cur_lr) + if config["_separate_vf_optimizer"]: + return optim, tf1.train.AdamOptimizer(config["_lr_vf"]) + else: + if config["_separate_vf_optimizer"]: + raise ValueError( + "RMSProp optimizer not supported for separate" + "vf- and policy losses yet! 
# We need this builder function because we want to share the same
# custom logics between TF1 dynamic and TF2 eager policies.
def get_impala_tf_policy(name: str, base: TFPolicyV2Type) -> TFPolicyV2Type:
    """Construct an ImpalaTFPolicy inheriting either dynamic or eager base policies.

    Args:
        name: The `__name__` / `__qualname__` to assign to the freshly built
            policy class (e.g. "ImpalaTF1Policy").
        base: Base class for this policy. DynamicTFPolicyV2 or EagerTFPolicyV2.

    Returns:
        A TF Policy to be used with Impala.
    """
    # VTrace mixins are placed in front of more general mixins to make sure
    # their functions like optimizer() overrides all the other implementations
    # (e.g., LearningRateSchedule.optimizer())
    class ImpalaTFPolicy(
        VTraceClipGradients,
        VTraceOptimizer,
        LearningRateSchedule,
        EntropyCoeffSchedule,
        GradStatsMixin,
        ValueNetworkMixin,
        base,
    ):
        def __init__(
            self,
            observation_space,
            action_space,
            config,
            existing_model=None,
            existing_inputs=None,
        ):
            # First thing first, enable eager execution if necessary.
            base.enable_eager_execution_if_necessary()

            # Initialize base class.
            base.__init__(
                self,
                observation_space,
                action_space,
                config,
                existing_inputs=existing_inputs,
                existing_model=existing_model,
            )
            ValueNetworkMixin.__init__(self, config)

            # If Learner API is used, we don't need any loss-specific mixins.
            # However, we also would like to avoid creating special Policy-subclasses
            # for this as the entire Policy concept will soon not be used anymore with
            # the new Learner- and RLModule APIs.
            # NOTE: Mixin init order matters: schedules must be set up before
            # `maybe_initialize_optimizer_and_loss()` below reads `self.cur_lr` /
            # `self.entropy_coeff`.
            GradStatsMixin.__init__(self)
            VTraceClipGradients.__init__(self)
            VTraceOptimizer.__init__(self)
            LearningRateSchedule.__init__(self, config["lr"], config["lr_schedule"])
            EntropyCoeffSchedule.__init__(
                self, config["entropy_coeff"], config["entropy_coeff_schedule"]
            )

            # Note: this is a bit ugly, but loss and optimizer initialization must
            # happen after all the MixIns are initialized.
            self.maybe_initialize_optimizer_and_loss()

        @override(base)
        def loss(
            self,
            model: Union[ModelV2, "tf.keras.Model"],
            dist_class: Type[TFActionDistribution],
            train_batch: SampleBatch,
        ) -> Union[TensorType, List[TensorType]]:
            """Builds the IMPALA V-trace loss for one train batch.

            Returns either the total loss, or a (policy_loss, vf_loss) tuple
            when a separate value-function optimizer is configured.
            """
            model_out, _ = model(train_batch)
            action_dist = dist_class(model_out, model)

            # Determine how the flat action-dist inputs split per sub-action.
            if isinstance(self.action_space, gym.spaces.Discrete):
                is_multidiscrete = False
                output_hidden_shape = [self.action_space.n]
            elif isinstance(self.action_space, gym.spaces.MultiDiscrete):
                is_multidiscrete = True
                output_hidden_shape = self.action_space.nvec.astype(np.int32)
            else:
                is_multidiscrete = False
                output_hidden_shape = 1

            def make_time_major(*args, **kw):
                # Fold flat [B*T] tensors into time-major [T, B] layout.
                return _make_time_major(
                    self, train_batch.get(SampleBatch.SEQ_LENS), *args, **kw
                )

            actions = train_batch[SampleBatch.ACTIONS]
            dones = train_batch[SampleBatch.TERMINATEDS]
            rewards = train_batch[SampleBatch.REWARDS]
            behaviour_action_logp = train_batch[SampleBatch.ACTION_LOGP]
            behaviour_logits = train_batch[SampleBatch.ACTION_DIST_INPUTS]
            unpacked_behaviour_logits = tf.split(
                behaviour_logits, output_hidden_shape, axis=1
            )
            unpacked_outputs = tf.split(model_out, output_hidden_shape, axis=1)
            values = model.value_function()
            values_time_major = make_time_major(values)
            bootstrap_values_time_major = make_time_major(
                train_batch[SampleBatch.VALUES_BOOTSTRAPPED]
            )
            # Last timestep's bootstrapped value estimate -> [B].
            bootstrap_value = bootstrap_values_time_major[-1]

            if self.is_recurrent():
                # Mask out padded timesteps of the RNN sequences.
                max_seq_len = tf.reduce_max(train_batch[SampleBatch.SEQ_LENS])
                mask = tf.sequence_mask(train_batch[SampleBatch.SEQ_LENS], max_seq_len)
                mask = tf.reshape(mask, [-1])
            else:
                mask = tf.ones_like(rewards)

            # Prepare actions for loss
            loss_actions = (
                actions if is_multidiscrete else tf.expand_dims(actions, axis=1)
            )

            # Inputs are reshaped from [B * T] => [(T|T-1), B] for V-trace calc.
            self.vtrace_loss = VTraceLoss(
                actions=make_time_major(loss_actions),
                actions_logp=make_time_major(action_dist.logp(actions)),
                actions_entropy=make_time_major(action_dist.multi_entropy()),
                dones=make_time_major(dones),
                behaviour_action_logp=make_time_major(behaviour_action_logp),
                behaviour_logits=make_time_major(unpacked_behaviour_logits),
                target_logits=make_time_major(unpacked_outputs),
                discount=self.config["gamma"],
                rewards=make_time_major(rewards),
                values=values_time_major,
                bootstrap_value=bootstrap_value,
                dist_class=Categorical if is_multidiscrete else dist_class,
                model=model,
                valid_mask=make_time_major(mask),
                config=self.config,
                vf_loss_coeff=self.config["vf_loss_coeff"],
                entropy_coeff=self.entropy_coeff,
                clip_rho_threshold=self.config["vtrace_clip_rho_threshold"],
                clip_pg_rho_threshold=self.config["vtrace_clip_pg_rho_threshold"],
            )

            if self.config.get("_separate_vf_optimizer"):
                return self.vtrace_loss.loss_wo_vf, self.vtrace_loss.vf_loss
            else:
                return self.vtrace_loss.total_loss

        @override(base)
        def stats_fn(self, train_batch: SampleBatch) -> Dict[str, TensorType]:
            """Returns learner stats computed from the last `loss()` call."""
            values_batched = _make_time_major(
                self,
                train_batch.get(SampleBatch.SEQ_LENS),
                self.model.value_function(),
            )

            return {
                "cur_lr": tf.cast(self.cur_lr, tf.float64),
                "policy_loss": self.vtrace_loss.mean_pi_loss,
                "entropy": self.vtrace_loss.mean_entropy,
                "entropy_coeff": tf.cast(self.entropy_coeff, tf.float64),
                "var_gnorm": tf.linalg.global_norm(self.model.trainable_variables()),
                "vf_loss": self.vtrace_loss.mean_vf_loss,
                "vf_explained_var": explained_variance(
                    tf.reshape(self.vtrace_loss.value_targets, [-1]),
                    tf.reshape(values_batched, [-1]),
                ),
            }

        @override(base)
        def postprocess_trajectory(
            self,
            sample_batch: SampleBatch,
            other_agent_batches: Optional[SampleBatch] = None,
            episode=None,
        ):
            """Adds bootstrapped value estimates needed by the V-trace loss."""
            # Call super's postprocess_trajectory first.
            # sample_batch = super().postprocess_trajectory(
            #     sample_batch, other_agent_batches, episode
            # )

            if self.config["vtrace"]:
                # Add the SampleBatch.VALUES_BOOTSTRAPPED column, which we'll need
                # inside the loss for vtrace calculations.
                sample_batch = compute_bootstrap_value(sample_batch, self)

            return sample_batch

        @override(base)
        def get_batch_divisibility_req(self) -> int:
            # Batches must be divisible by the fragment length so they can be
            # folded into [T, B] without remainder.
            return self.config["rollout_fragment_length"]

    ImpalaTFPolicy.__name__ = name
    ImpalaTFPolicy.__qualname__ = name

    return ImpalaTFPolicy


ImpalaTF1Policy = get_impala_tf_policy("ImpalaTF1Policy", DynamicTFPolicyV2)
ImpalaTF2Policy = get_impala_tf_policy("ImpalaTF2Policy", EagerTFPolicyV2)
class VTraceLoss:
    # Assembles the IMPALA V-trace actor-critic loss (policy-gradient, value,
    # and entropy terms) from time-major [T, B] inputs.
    def __init__(
        self,
        actions,
        actions_logp,
        actions_entropy,
        dones,
        behaviour_action_logp,
        behaviour_logits,
        target_logits,
        discount,
        rewards,
        values,
        bootstrap_value,
        dist_class,
        model,
        valid_mask,
        config,
        vf_loss_coeff=0.5,
        entropy_coeff=0.01,
        clip_rho_threshold=1.0,
        clip_pg_rho_threshold=1.0,
    ):
        """Policy gradient loss with vtrace importance weighting.

        VTraceLoss takes tensors of shape [T, B, ...], where `B` is the
        batch_size. The reason we need to know `B` is for V-trace to properly
        handle episode cut boundaries.

        Args:
            actions: An int|float32 tensor of shape [T, B, ACTION_SPACE].
            actions_logp: A float32 tensor of shape [T, B].
            actions_entropy: A float32 tensor of shape [T, B].
            dones: A bool tensor of shape [T, B].
            behaviour_action_logp: Tensor of shape [T, B].
            behaviour_logits: A list with length of ACTION_SPACE of float32
                tensors of shapes
                [T, B, ACTION_SPACE[0]],
                ...,
                [T, B, ACTION_SPACE[-1]]
            target_logits: A list with length of ACTION_SPACE of float32
                tensors of shapes
                [T, B, ACTION_SPACE[0]],
                ...,
                [T, B, ACTION_SPACE[-1]]
            discount: A float32 scalar.
            rewards: A float32 tensor of shape [T, B].
            values: A float32 tensor of shape [T, B].
            bootstrap_value: A float32 tensor of shape [B].
            dist_class: action distribution class for logits.
            valid_mask: A bool tensor of valid RNN input elements (#2992).
            config: Algorithm config dict.
        """
        # Imported lazily to avoid a circular import at module load time.
        import ray.rllib.algorithms.impala.vtrace_torch as vtrace

        if valid_mask is None:
            valid_mask = torch.ones_like(actions_logp)

        # Remember the incoming device so V-trace results can be moved back
        # onto it below (`vtrace.multi_from_logits` handles device placement
        # of its own intermediates).
        device = behaviour_action_logp[0].device
        self.vtrace_returns = vtrace.multi_from_logits(
            behaviour_action_log_probs=behaviour_action_logp,
            behaviour_policy_logits=behaviour_logits,
            target_policy_logits=target_logits,
            actions=torch.unbind(actions, dim=2),
            discounts=(1.0 - dones.float()) * discount,
            rewards=rewards,
            values=values,
            bootstrap_value=bootstrap_value,
            dist_class=dist_class,
            model=model,
            clip_rho_threshold=clip_rho_threshold,
            clip_pg_rho_threshold=clip_pg_rho_threshold,
        )
        # Move v-trace results back to GPU for actual loss computing.
        self.value_targets = self.vtrace_returns.vs.to(device)

        # The policy gradients loss.
        self.pi_loss = -torch.sum(
            actions_logp * self.vtrace_returns.pg_advantages.to(device) * valid_mask
        )

        # The baseline loss.
        delta = (values - self.value_targets) * valid_mask
        self.vf_loss = 0.5 * torch.sum(torch.pow(delta, 2.0))

        # The entropy loss.
        self.entropy = torch.sum(actions_entropy * valid_mask)
        self.mean_entropy = self.entropy / torch.sum(valid_mask)

        # The summed weighted loss.
        self.total_loss = self.pi_loss - self.entropy * entropy_coeff

        # Optional vf loss (or in a separate term due to separate
        # optimizers/networks). `loss_wo_vf` intentionally excludes the value
        # term so the caller can feed it to a separate optimizer.
        self.loss_wo_vf = self.total_loss
        if not config["_separate_vf_optimizer"]:
            self.total_loss += self.vf_loss * vf_loss_coeff
+ """ + if isinstance(tensor, (list, tuple)): + return [make_time_major(policy, seq_lens, t) for t in tensor] + + if policy.is_recurrent(): + B = seq_lens.shape[0] + T = tensor.shape[0] // B + else: + # Important: chop the tensor into batches at known episode cut + # boundaries. + # TODO: (sven) this is kind of a hack and won't work for + # batch_mode=complete_episodes. + T = policy.config["rollout_fragment_length"] + B = tensor.shape[0] // T + rs = torch.reshape(tensor, [B, T] + list(tensor.shape[1:])) + + # Swap B and T axes. + res = torch.transpose(rs, 1, 0) + + return res + + +class VTraceOptimizer: + """Optimizer function for VTrace torch policies.""" + + def __init__(self): + pass + + def optimizer( + self, + ) -> Union[List["torch.optim.Optimizer"], "torch.optim.Optimizer"]: + + if self.config["_separate_vf_optimizer"]: + # Figure out, which parameters of the model belong to the value + # function (and which to the policy net). + dummy_batch = self._lazy_tensor_dict( + self._get_dummy_batch_from_view_requirements() + ) + # Zero out all gradients (set to None) + for param in self.model.parameters(): + param.grad = None + # Perform a dummy forward pass (through the policy net, which should be + # separated from the value function in this particular user setup). + out = self.model(dummy_batch) + # Perform a (dummy) backward pass to be able to see, which params have + # gradients and are therefore used for the policy computations (vs vf + # computations). + torch.sum(out[0]).backward() # [0] -> Model returns out and state-outs. + # Collect policy vs value function params separately. 
class VTraceOptimizer:
    """Optimizer function for VTrace torch policies."""

    def __init__(self):
        pass

    def optimizer(
        self,
    ) -> Union[List["torch.optim.Optimizer"], "torch.optim.Optimizer"]:
        # Returns one optimizer for the whole model, or a
        # (policy_optimizer, value_optimizer) pair when
        # `_separate_vf_optimizer` is configured.

        if self.config["_separate_vf_optimizer"]:
            # Figure out, which parameters of the model belong to the value
            # function (and which to the policy net).
            dummy_batch = self._lazy_tensor_dict(
                self._get_dummy_batch_from_view_requirements()
            )
            # Zero out all gradients (set to None)
            for param in self.model.parameters():
                param.grad = None
            # Perform a dummy forward pass (through the policy net, which should be
            # separated from the value function in this particular user setup).
            out = self.model(dummy_batch)
            # Perform a (dummy) backward pass to be able to see, which params have
            # gradients and are therefore used for the policy computations (vs vf
            # computations).
            torch.sum(out[0]).backward()  # [0] -> Model returns out and state-outs.
            # Collect policy vs value function params separately: params that
            # received a gradient from the policy-output backward pass belong
            # to the policy net; the rest are assumed to be value-net params.
            policy_params = []
            value_params = []
            for param in self.model.parameters():
                if param.grad is None:
                    value_params.append(param)
                else:
                    policy_params.append(param)
            if self.config["opt_type"] == "adam":
                return (
                    torch.optim.Adam(params=policy_params, lr=self.cur_lr),
                    torch.optim.Adam(params=value_params, lr=self.cur_lr2),
                )
            else:
                # Only Adam supports the two-optimizer setup.
                raise NotImplementedError

        if self.config["opt_type"] == "adam":
            return torch.optim.Adam(params=self.model.parameters(), lr=self.cur_lr)
        else:
            return torch.optim.RMSprop(
                params=self.model.parameters(),
                lr=self.cur_lr,
                weight_decay=self.config["decay"],
                momentum=self.config["momentum"],
                eps=self.config["epsilon"],
            )


# VTrace mixins are placed in front of more general mixins to make sure
# their functions like optimizer() overrides all the other implementations
# (e.g., LearningRateSchedule.optimizer())
class ImpalaTorchPolicy(
    VTraceOptimizer,
    LearningRateSchedule,
    EntropyCoeffSchedule,
    ValueNetworkMixin,
    TorchPolicyV2,
):
    """PyTorch policy class used with IMPALA."""

    def __init__(self, observation_space, action_space, config):
        # Merge user config on top of the algorithm's defaults.
        config = dict(
            ray.rllib.algorithms.impala.impala.IMPALAConfig().to_dict(), **config
        )
        # This (old API stack) policy is incompatible with the new stack.
        config["enable_rl_module_and_learner"] = False
        config["enable_env_runner_and_connector_v2"] = False

        # If Learner API is used, we don't need any loss-specific mixins.
        # However, we also would like to avoid creating special Policy-subclasses
        # for this as the entire Policy concept will soon not be used anymore with
        # the new Learner- and RLModule APIs.
        VTraceOptimizer.__init__(self)
        # Need to initialize learning rate variable before calling
        # TorchPolicyV2.__init__.
        lr_schedule_additional_args = []
        if config.get("_separate_vf_optimizer"):
            # `_lr_vf` may be a schedule (list of [ts, lr] pairs) or a fixed
            # value; pass (initial_lr, schedule) accordingly.
            lr_schedule_additional_args = (
                [config["_lr_vf"][0][1], config["_lr_vf"]]
                if isinstance(config["_lr_vf"], (list, tuple))
                else [config["_lr_vf"], None]
            )
        LearningRateSchedule.__init__(
            self, config["lr"], config["lr_schedule"], *lr_schedule_additional_args
        )
        EntropyCoeffSchedule.__init__(
            self, config["entropy_coeff"], config["entropy_coeff_schedule"]
        )

        TorchPolicyV2.__init__(
            self,
            observation_space,
            action_space,
            config,
            max_seq_len=config["model"]["max_seq_len"],
        )

        ValueNetworkMixin.__init__(self, config)

        self._initialize_loss_from_dummy_batch()

    @override(TorchPolicyV2)
    def loss(
        self,
        model: ModelV2,
        dist_class: Type[ActionDistribution],
        train_batch: SampleBatch,
    ) -> Union[TensorType, List[TensorType]]:
        """Builds the IMPALA V-trace loss for one train batch."""
        model_out, _ = model(train_batch)
        action_dist = dist_class(model_out, model)

        # Determine how the flat action-dist inputs split per sub-action.
        if isinstance(self.action_space, gym.spaces.Discrete):
            is_multidiscrete = False
            output_hidden_shape = [self.action_space.n]
        elif isinstance(self.action_space, gym.spaces.MultiDiscrete):
            is_multidiscrete = True
            output_hidden_shape = self.action_space.nvec.astype(np.int32)
        else:
            is_multidiscrete = False
            output_hidden_shape = 1

        def _make_time_major(*args, **kw):
            # Fold flat [B*T] tensors into time-major [T, B] layout.
            return make_time_major(
                self, train_batch.get(SampleBatch.SEQ_LENS), *args, **kw
            )

        actions = train_batch[SampleBatch.ACTIONS]
        dones = train_batch[SampleBatch.TERMINATEDS]
        rewards = train_batch[SampleBatch.REWARDS]
        behaviour_action_logp = train_batch[SampleBatch.ACTION_LOGP]
        behaviour_logits = train_batch[SampleBatch.ACTION_DIST_INPUTS]
        if isinstance(output_hidden_shape, (list, tuple, np.ndarray)):
            unpacked_behaviour_logits = torch.split(
                behaviour_logits, list(output_hidden_shape), dim=1
            )
            unpacked_outputs = torch.split(model_out, list(output_hidden_shape), dim=1)
        else:
            unpacked_behaviour_logits = torch.chunk(
                behaviour_logits, output_hidden_shape, dim=1
            )
            unpacked_outputs = torch.chunk(model_out, output_hidden_shape, dim=1)
        values = model.value_function()
        values_time_major = _make_time_major(values)
        bootstrap_values_time_major = _make_time_major(
            train_batch[SampleBatch.VALUES_BOOTSTRAPPED]
        )
        # Last timestep's bootstrapped value estimate -> [B].
        bootstrap_value = bootstrap_values_time_major[-1]

        if self.is_recurrent():
            # Mask out padded timesteps of the RNN sequences.
            max_seq_len = torch.max(train_batch[SampleBatch.SEQ_LENS])
            mask_orig = sequence_mask(train_batch[SampleBatch.SEQ_LENS], max_seq_len)
            mask = torch.reshape(mask_orig, [-1])
        else:
            mask = torch.ones_like(rewards)

        # Prepare actions for loss.
        loss_actions = actions if is_multidiscrete else torch.unsqueeze(actions, dim=1)

        # Inputs are reshaped from [B * T] => [(T|T-1), B] for V-trace calc.
        loss = VTraceLoss(
            actions=_make_time_major(loss_actions),
            actions_logp=_make_time_major(action_dist.logp(actions)),
            actions_entropy=_make_time_major(action_dist.entropy()),
            dones=_make_time_major(dones),
            behaviour_action_logp=_make_time_major(behaviour_action_logp),
            behaviour_logits=_make_time_major(unpacked_behaviour_logits),
            target_logits=_make_time_major(unpacked_outputs),
            discount=self.config["gamma"],
            rewards=_make_time_major(rewards),
            values=values_time_major,
            bootstrap_value=bootstrap_value,
            dist_class=TorchCategorical if is_multidiscrete else dist_class,
            model=model,
            valid_mask=_make_time_major(mask),
            config=self.config,
            vf_loss_coeff=self.config["vf_loss_coeff"],
            entropy_coeff=self.entropy_coeff,
            clip_rho_threshold=self.config["vtrace_clip_rho_threshold"],
            clip_pg_rho_threshold=self.config["vtrace_clip_pg_rho_threshold"],
        )

        # Store values for stats function in model (tower), such that for
        # multi-GPU, we do not override them during the parallel loss phase.
        model.tower_stats["pi_loss"] = loss.pi_loss
        model.tower_stats["vf_loss"] = loss.vf_loss
        model.tower_stats["entropy"] = loss.entropy
        model.tower_stats["mean_entropy"] = loss.mean_entropy
        model.tower_stats["total_loss"] = loss.total_loss

        values_batched = make_time_major(
            self,
            train_batch.get(SampleBatch.SEQ_LENS),
            values,
        )
        model.tower_stats["vf_explained_var"] = explained_variance(
            torch.reshape(loss.value_targets, [-1]), torch.reshape(values_batched, [-1])
        )

        if self.config.get("_separate_vf_optimizer"):
            return loss.loss_wo_vf, loss.vf_loss
        else:
            return loss.total_loss

    @override(TorchPolicyV2)
    def stats_fn(self, train_batch: SampleBatch) -> Dict[str, TensorType]:
        """Returns learner stats averaged over all model towers."""
        return convert_to_numpy(
            {
                "cur_lr": self.cur_lr,
                "total_loss": torch.mean(
                    torch.stack(self.get_tower_stats("total_loss"))
                ),
                "policy_loss": torch.mean(torch.stack(self.get_tower_stats("pi_loss"))),
                "entropy": torch.mean(
                    torch.stack(self.get_tower_stats("mean_entropy"))
                ),
                "entropy_coeff": self.entropy_coeff,
                "var_gnorm": global_norm(self.model.trainable_variables()),
                "vf_loss": torch.mean(torch.stack(self.get_tower_stats("vf_loss"))),
                "vf_explained_var": torch.mean(
                    torch.stack(self.get_tower_stats("vf_explained_var"))
                ),
            }
        )

    @override(TorchPolicyV2)
    def postprocess_trajectory(
        self,
        sample_batch: SampleBatch,
        other_agent_batches: Optional[SampleBatch] = None,
        episode=None,
    ):
        """Adds bootstrapped value estimates needed by the V-trace loss."""
        # Call super's postprocess_trajectory first.
        # sample_batch = super().postprocess_trajectory(
        #     sample_batch, other_agent_batches, episode
        # )

        if self.config["vtrace"]:
            # Add the SampleBatch.VALUES_BOOTSTRAPPED column, which we'll need
            # inside the loss for vtrace calculations.
            sample_batch = compute_bootstrap_value(sample_batch, self)

        return sample_batch

    @override(TorchPolicyV2)
    def extra_grad_process(
        self, optimizer: "torch.optim.Optimizer", loss: TensorType
    ) -> Dict[str, TensorType]:
        # Apply (config-driven) gradient clipping before the optimizer step.
        return apply_grad_clipping(self, optimizer, loss)

    @override(TorchPolicyV2)
    def get_batch_divisibility_req(self) -> int:
        # Batches must be divisible by the fragment length so they can be
        # folded into [T, B] without remainder.
        return self.config["rollout_fragment_length"]
+ sample_batch = compute_bootstrap_value(sample_batch, self) + + return sample_batch + + @override(TorchPolicyV2) + def extra_grad_process( + self, optimizer: "torch.optim.Optimizer", loss: TensorType + ) -> Dict[str, TensorType]: + return apply_grad_clipping(self, optimizer, loss) + + @override(TorchPolicyV2) + def get_batch_divisibility_req(self) -> int: + return self.config["rollout_fragment_length"] diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/utils.py b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..f65b76a7f53de6fb3f436b8063341c057529f25b --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/utils.py @@ -0,0 +1,96 @@ +from collections import defaultdict, deque + +import numpy as np + + +class _SleepTimeController: + def __init__(self): + self.L = 0.0 + self.H = 0.4 + + self._recompute_candidates() + + # Defaultdict mapping. + self.results = defaultdict(lambda: deque(maxlen=3)) + + self.iteration = 0 + + def _recompute_candidates(self): + self.center = (self.L + self.H) / 2 + self.low = (self.L + self.center) / 2 + self.high = (self.H + self.center) / 2 + + # Expand a little if range becomes too narrow to avoid + # overoptimization. + if self.H - self.L < 0.00001: + self.L = max(self.center - 0.1, 0.0) + self.H = min(self.center + 0.1, 1.0) + self._recompute_candidates() + # Reduce results, just in case it has grown too much. 
+ c, l, h = ( + self.results[self.center], + self.results[self.low], + self.results[self.high], + ) + self.results = defaultdict(lambda: deque(maxlen=3)) + self.results[self.center] = c + self.results[self.low] = l + self.results[self.high] = h + + @property + def current(self): + if len(self.results[self.center]) < 3: + return self.center + elif len(self.results[self.low]) < 3: + return self.low + else: + return self.high + + def log_result(self, performance): + self.iteration += 1 + + # Skip first 2 iterations for ignoring warm-up effect. + if self.iteration < 2: + return + + self.results[self.current].append(performance) + + # If all candidates have at least 3 results logged, re-evaluate + # and compute new L and H. + center, low, high = self.center, self.low, self.high + if ( + len(self.results[center]) == 3 + and len(self.results[low]) == 3 + and len(self.results[high]) == 3 + ): + perf_center = np.mean(self.results[center]) + perf_low = np.mean(self.results[low]) + perf_high = np.mean(self.results[high]) + # Case: `center` is best. + if perf_center > perf_low and perf_center > perf_high: + self.L = low + self.H = high + # Erase low/high results: We'll not use these again. + self.results.pop(low, None) + self.results.pop(high, None) + # Case: `low` is best. + elif perf_low > perf_center and perf_low > perf_high: + self.H = center + # Erase center/high results: We'll not use these again. + self.results.pop(center, None) + self.results.pop(high, None) + # Case: `high` is best. + else: + self.L = center + # Erase center/low results: We'll not use these again. 
+ self.results.pop(center, None) + self.results.pop(low, None) + + self._recompute_candidates() + + +if __name__ == "__main__": + controller = _SleepTimeController() + for _ in range(1000): + performance = np.random.random() + controller.log_result(performance) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/vtrace_tf.py b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/vtrace_tf.py new file mode 100644 index 0000000000000000000000000000000000000000..61c6a7e366d52e20a0d2c225749461c81c51b04d --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/vtrace_tf.py @@ -0,0 +1,425 @@ +# Copyright 2018 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Functions to compute V-trace off-policy actor critic targets. + +For details and theory see: + +"IMPALA: Scalable Distributed Deep-RL with +Importance Weighted Actor-Learner Architectures" +by Espeholt, Soyer, Munos et al. + +See https://arxiv.org/abs/1802.01561 for the full paper. + +In addition to the original paper's code, changes have been made +to support MultiDiscrete action spaces. behaviour_policy_logits, +target_policy_logits and actions parameters in the entry point +multi_from_logits method accepts lists of tensors instead of just +tensors. 
+""" + +import collections + +from ray.rllib.models.tf.tf_action_dist import Categorical +from ray.rllib.utils.framework import try_import_tf + +tf1, tf, tfv = try_import_tf() + +VTraceFromLogitsReturns = collections.namedtuple( + "VTraceFromLogitsReturns", + [ + "vs", + "pg_advantages", + "log_rhos", + "behaviour_action_log_probs", + "target_action_log_probs", + ], +) + +VTraceReturns = collections.namedtuple("VTraceReturns", "vs pg_advantages") + + +def log_probs_from_logits_and_actions( + policy_logits, actions, dist_class=Categorical, model=None +): + return multi_log_probs_from_logits_and_actions( + [policy_logits], [actions], dist_class, model + )[0] + + +def multi_log_probs_from_logits_and_actions(policy_logits, actions, dist_class, model): + """Computes action log-probs from policy logits and actions. + + In the notation used throughout documentation and comments, T refers to the + time dimension ranging from 0 to T-1. B refers to the batch size and + ACTION_SPACE refers to the list of numbers each representing a number of + actions. + + Args: + policy_logits: A list with length of ACTION_SPACE of float32 + tensors of shapes [T, B, ACTION_SPACE[0]], ..., + [T, B, ACTION_SPACE[-1]] with un-normalized log-probabilities + parameterizing a softmax policy. + actions: A list with length of ACTION_SPACE of tensors of shapes + [T, B, ...], ..., [T, B, ...] + with actions. + dist_class: Python class of the action distribution. + + Returns: + A list with length of ACTION_SPACE of float32 tensors of shapes + [T, B], ..., [T, B] corresponding to the sampling log probability + of the chosen action w.r.t. the policy. 
+ """ + log_probs = [] + for i in range(len(policy_logits)): + p_shape = tf.shape(policy_logits[i]) + a_shape = tf.shape(actions[i]) + policy_logits_flat = tf.reshape( + policy_logits[i], tf.concat([[-1], p_shape[2:]], axis=0) + ) + actions_flat = tf.reshape(actions[i], tf.concat([[-1], a_shape[2:]], axis=0)) + log_probs.append( + tf.reshape( + dist_class(policy_logits_flat, model).logp(actions_flat), a_shape[:2] + ) + ) + + return log_probs + + +def from_logits( + behaviour_policy_logits, + target_policy_logits, + actions, + discounts, + rewards, + values, + bootstrap_value, + dist_class=Categorical, + model=None, + clip_rho_threshold=1.0, + clip_pg_rho_threshold=1.0, + name="vtrace_from_logits", +): + """multi_from_logits wrapper used only for tests""" + + res = multi_from_logits( + [behaviour_policy_logits], + [target_policy_logits], + [actions], + discounts, + rewards, + values, + bootstrap_value, + dist_class, + model, + clip_rho_threshold=clip_rho_threshold, + clip_pg_rho_threshold=clip_pg_rho_threshold, + name=name, + ) + + return VTraceFromLogitsReturns( + vs=res.vs, + pg_advantages=res.pg_advantages, + log_rhos=res.log_rhos, + behaviour_action_log_probs=tf.squeeze(res.behaviour_action_log_probs, axis=0), + target_action_log_probs=tf.squeeze(res.target_action_log_probs, axis=0), + ) + + +def multi_from_logits( + behaviour_policy_logits, + target_policy_logits, + actions, + discounts, + rewards, + values, + bootstrap_value, + dist_class, + model, + behaviour_action_log_probs=None, + clip_rho_threshold=1.0, + clip_pg_rho_threshold=1.0, + name="vtrace_from_logits", +): + r"""V-trace for softmax policies. + + Calculates V-trace actor critic targets for softmax polices as described in + + "IMPALA: Scalable Distributed Deep-RL with + Importance Weighted Actor-Learner Architectures" + by Espeholt, Soyer, Munos et al. 
+ + Target policy refers to the policy we are interested in improving and + behaviour policy refers to the policy that generated the given + rewards and actions. + + In the notation used throughout documentation and comments, T refers to the + time dimension ranging from 0 to T-1. B refers to the batch size and + ACTION_SPACE refers to the list of numbers each representing a number of + actions. + + Args: + behaviour_policy_logits: A list with length of ACTION_SPACE of float32 + tensors of shapes + [T, B, ACTION_SPACE[0]], + ..., + [T, B, ACTION_SPACE[-1]] + with un-normalized log-probabilities parameterizing the softmax behaviour + policy. + target_policy_logits: A list with length of ACTION_SPACE of float32 + tensors of shapes + [T, B, ACTION_SPACE[0]], + ..., + [T, B, ACTION_SPACE[-1]] + with un-normalized log-probabilities parameterizing the softmax target + policy. + actions: A list with length of ACTION_SPACE of + tensors of shapes + [T, B, ...], + ..., + [T, B, ...] + with actions sampled from the behaviour policy. + discounts: A float32 tensor of shape [T, B] with the discount encountered + when following the behaviour policy. + rewards: A float32 tensor of shape [T, B] with the rewards generated by + following the behaviour policy. + values: A float32 tensor of shape [T, B] with the value function estimates + wrt. the target policy. + bootstrap_value: A float32 of shape [B] with the value function estimate at + time T. + dist_class: action distribution class for the logits. + model: backing ModelV2 instance + behaviour_action_log_probs: precalculated values of the behaviour actions + clip_rho_threshold: A scalar float32 tensor with the clipping threshold for + importance weights (rho) when calculating the baseline targets (vs). + rho^bar in the paper. + clip_pg_rho_threshold: A scalar float32 tensor with the clipping threshold + on rho_s in \rho_s \delta log \pi(a|x) (r + \gamma v_{s+1} - V(x_s)). 
+ name: The name scope that all V-trace operations will be created in. + + Returns: + A `VTraceFromLogitsReturns` namedtuple with the following fields: + vs: A float32 tensor of shape [T, B]. Can be used as target to train a + baseline (V(x_t) - vs_t)^2. + pg_advantages: A float 32 tensor of shape [T, B]. Can be used as an + estimate of the advantage in the calculation of policy gradients. + log_rhos: A float32 tensor of shape [T, B] containing the log importance + sampling weights (log rhos). + behaviour_action_log_probs: A float32 tensor of shape [T, B] containing + behaviour policy action log probabilities (log \mu(a_t)). + target_action_log_probs: A float32 tensor of shape [T, B] containing + target policy action probabilities (log \pi(a_t)). + """ + + for i in range(len(behaviour_policy_logits)): + behaviour_policy_logits[i] = tf.convert_to_tensor( + behaviour_policy_logits[i], dtype=tf.float32 + ) + target_policy_logits[i] = tf.convert_to_tensor( + target_policy_logits[i], dtype=tf.float32 + ) + + # Make sure tensor ranks are as expected. + # The rest will be checked by from_action_log_probs. + behaviour_policy_logits[i].shape.assert_has_rank(3) + target_policy_logits[i].shape.assert_has_rank(3) + + with tf1.name_scope( + name, + values=[ + behaviour_policy_logits, + target_policy_logits, + actions, + discounts, + rewards, + values, + bootstrap_value, + ], + ): + target_action_log_probs = multi_log_probs_from_logits_and_actions( + target_policy_logits, actions, dist_class, model + ) + + if len(behaviour_policy_logits) > 1 or behaviour_action_log_probs is None: + # can't use precalculated values, recompute them. 
def from_importance_weights(
    log_rhos,
    discounts,
    rewards,
    values,
    bootstrap_value,
    clip_rho_threshold=1.0,
    clip_pg_rho_threshold=1.0,
    name="vtrace_from_importance_weights",
):
    r"""V-trace from log importance weights.

    Calculates V-trace actor critic targets as described in

    "IMPALA: Scalable Distributed Deep-RL with
    Importance Weighted Actor-Learner Architectures"
    by Espeholt, Soyer, Munos et al.

    In the notation used throughout documentation and comments, T refers to the
    time dimension ranging from 0 to T-1. B refers to the batch size. This code
    also supports the case where all tensors have the same number of additional
    dimensions, e.g., `rewards` is [T, B, C], `values` is [T, B, C],
    `bootstrap_value` is [B, C].

    Args:
        log_rhos: A float32 tensor of shape [T, B] representing the
            log importance sampling weights, i.e.
            log(target_policy(a) / behaviour_policy(a)). V-trace performs operations
            on rhos in log-space for numerical stability.
        discounts: A float32 tensor of shape [T, B] with discounts encountered when
            following the behaviour policy.
        rewards: A float32 tensor of shape [T, B] containing rewards generated by
            following the behaviour policy.
        values: A float32 tensor of shape [T, B] with the value function estimates
            wrt. the target policy.
        bootstrap_value: A float32 of shape [B] with the value function estimate at
            time T.
        clip_rho_threshold: A scalar float32 tensor with the clipping threshold for
            importance weights (rho) when calculating the baseline targets (vs).
            rho^bar in the paper. If None, no clipping is applied.
        clip_pg_rho_threshold: A scalar float32 tensor with the clipping threshold
            on rho_s in \rho_s \delta log \pi(a|x) (r + \gamma v_{s+1} - V(x_s)). If
            None, no clipping is applied.
        name: The name scope that all V-trace operations will be created in.

    Returns:
        A VTraceReturns namedtuple (vs, pg_advantages) where:
            vs: A float32 tensor of shape [T, B]. Can be used as target to
                train a baseline (V(x_t) - vs_t)^2.
            pg_advantages: A float32 tensor of shape [T, B]. Can be used as the
                advantage in the calculation of policy gradients.
    """
    log_rhos = tf.convert_to_tensor(log_rhos, dtype=tf.float32)
    discounts = tf.convert_to_tensor(discounts, dtype=tf.float32)
    rewards = tf.convert_to_tensor(rewards, dtype=tf.float32)
    values = tf.convert_to_tensor(values, dtype=tf.float32)
    bootstrap_value = tf.convert_to_tensor(bootstrap_value, dtype=tf.float32)
    if clip_rho_threshold is not None:
        clip_rho_threshold = tf.convert_to_tensor(clip_rho_threshold, dtype=tf.float32)
    if clip_pg_rho_threshold is not None:
        clip_pg_rho_threshold = tf.convert_to_tensor(
            clip_pg_rho_threshold, dtype=tf.float32
        )

    # Make sure tensor ranks are consistent.
    rho_rank = log_rhos.shape.ndims  # Usually 2.
    values.shape.assert_has_rank(rho_rank)
    bootstrap_value.shape.assert_has_rank(rho_rank - 1)
    discounts.shape.assert_has_rank(rho_rank)
    rewards.shape.assert_has_rank(rho_rank)
    if clip_rho_threshold is not None:
        clip_rho_threshold.shape.assert_has_rank(0)
    if clip_pg_rho_threshold is not None:
        clip_pg_rho_threshold.shape.assert_has_rank(0)

    with tf1.name_scope(
        name, values=[log_rhos, discounts, rewards, values, bootstrap_value]
    ):
        # rho_t = pi(a_t)/mu(a_t); truncated at rho^bar for the vs targets.
        rhos = tf.math.exp(log_rhos)
        if clip_rho_threshold is not None:
            clipped_rhos = tf.minimum(clip_rho_threshold, rhos, name="clipped_rhos")
        else:
            clipped_rhos = rhos

        # c_t ("trace cutting") coefficients, truncated at 1.
        cs = tf.minimum(1.0, rhos, name="cs")
        # Append bootstrapped value to get [v1, ..., v_t+1]
        values_t_plus_1 = tf.concat(
            [values[1:], tf.expand_dims(bootstrap_value, 0)], axis=0
        )
        # Temporal-difference terms delta_t weighted by the clipped rhos.
        deltas = clipped_rhos * (rewards + discounts * values_t_plus_1 - values)

        # All sequences are reversed, computation starts from the back.
        sequences = (
            tf.reverse(discounts, axis=[0]),
            tf.reverse(cs, axis=[0]),
            tf.reverse(deltas, axis=[0]),
        )

        # V-trace vs are calculated through a scan from the back to the
        # beginning of the given trajectory.
        def scanfunc(acc, sequence_item):
            discount_t, c_t, delta_t = sequence_item
            return delta_t + discount_t * c_t * acc

        initial_values = tf.zeros_like(bootstrap_value)
        vs_minus_v_xs = tf.nest.map_structure(
            tf.stop_gradient,
            tf.scan(
                fn=scanfunc,
                elems=sequences,
                initializer=initial_values,
                parallel_iterations=1,
                name="scan",
            ),
        )
        # Reverse the results back to original order.
        vs_minus_v_xs = tf.reverse(vs_minus_v_xs, [0], name="vs_minus_v_xs")

        # Add V(x_s) to get v_s.
        vs = tf.add(vs_minus_v_xs, values, name="vs")

        # Advantage for policy gradient.
        vs_t_plus_1 = tf.concat([vs[1:], tf.expand_dims(bootstrap_value, 0)], axis=0)
        if clip_pg_rho_threshold is not None:
            clipped_pg_rhos = tf.minimum(
                clip_pg_rho_threshold, rhos, name="clipped_pg_rhos"
            )
        else:
            clipped_pg_rhos = rhos
        pg_advantages = clipped_pg_rhos * (rewards + discounts * vs_t_plus_1 - values)

        # Make sure no gradients backpropagated through the returned values.
        return VTraceReturns(
            vs=tf.stop_gradient(vs), pg_advantages=tf.stop_gradient(pg_advantages)
        )


def get_log_rhos(target_action_log_probs, behaviour_action_log_probs):
    """With the selected log_probs for multi-discrete actions of behaviour
    and target policies we compute the log_rhos for calculating the vtrace."""
    t = tf.stack(target_action_log_probs)
    b = tf.stack(behaviour_action_log_probs)
    # Sum over the sub-action-space axis: log(pi/mu) = sum_i (log pi_i - log mu_i).
    log_rhos = tf.reduce_sum(t - b, axis=0)
    return log_rhos
+ +For details and theory see: + +"IMPALA: Scalable Distributed Deep-RL with +Importance Weighted Actor-Learner Architectures" +by Espeholt, Soyer, Munos et al. + +See https://arxiv.org/abs/1802.01561 for the full paper. + +In addition to the original paper's code, changes have been made +to support MultiDiscrete action spaces. behaviour_policy_logits, +target_policy_logits and actions parameters in the entry point +multi_from_logits method accepts lists of tensors instead of just +tensors. +""" + +from ray.rllib.algorithms.impala.vtrace_tf import VTraceFromLogitsReturns, VTraceReturns +from ray.rllib.models.torch.torch_action_dist import TorchCategorical +from ray.rllib.utils import force_list +from ray.rllib.utils.framework import try_import_torch +from ray.rllib.utils.torch_utils import convert_to_torch_tensor + +torch, nn = try_import_torch() + + +def log_probs_from_logits_and_actions( + policy_logits, actions, dist_class=TorchCategorical, model=None +): + return multi_log_probs_from_logits_and_actions( + [policy_logits], [actions], dist_class, model + )[0] + + +def multi_log_probs_from_logits_and_actions(policy_logits, actions, dist_class, model): + """Computes action log-probs from policy logits and actions. + + In the notation used throughout documentation and comments, T refers to the + time dimension ranging from 0 to T-1. B refers to the batch size and + ACTION_SPACE refers to the list of numbers each representing a number of + actions. + + Args: + policy_logits: A list with length of ACTION_SPACE of float32 + tensors of shapes [T, B, ACTION_SPACE[0]], ..., + [T, B, ACTION_SPACE[-1]] with un-normalized log-probabilities + parameterizing a softmax policy. + actions: A list with length of ACTION_SPACE of tensors of shapes + [T, B, ...], ..., [T, B, ...] + with actions. + dist_class: Python class of the action distribution. 
+ + Returns: + A list with length of ACTION_SPACE of float32 tensors of shapes + [T, B], ..., [T, B] corresponding to the sampling log probability + of the chosen action w.r.t. the policy. + """ + log_probs = [] + for i in range(len(policy_logits)): + p_shape = policy_logits[i].shape + a_shape = actions[i].shape + policy_logits_flat = torch.reshape(policy_logits[i], (-1,) + tuple(p_shape[2:])) + actions_flat = torch.reshape(actions[i], (-1,) + tuple(a_shape[2:])) + log_probs.append( + torch.reshape( + dist_class(policy_logits_flat, model).logp(actions_flat), a_shape[:2] + ) + ) + + return log_probs + + +def from_logits( + behaviour_policy_logits, + target_policy_logits, + actions, + discounts, + rewards, + values, + bootstrap_value, + dist_class=TorchCategorical, + model=None, + clip_rho_threshold=1.0, + clip_pg_rho_threshold=1.0, +): + """multi_from_logits wrapper used only for tests""" + + res = multi_from_logits( + [behaviour_policy_logits], + [target_policy_logits], + [actions], + discounts, + rewards, + values, + bootstrap_value, + dist_class, + model, + clip_rho_threshold=clip_rho_threshold, + clip_pg_rho_threshold=clip_pg_rho_threshold, + ) + + assert len(res.behaviour_action_log_probs) == 1 + assert len(res.target_action_log_probs) == 1 + return VTraceFromLogitsReturns( + vs=res.vs, + pg_advantages=res.pg_advantages, + log_rhos=res.log_rhos, + behaviour_action_log_probs=res.behaviour_action_log_probs[0], + target_action_log_probs=res.target_action_log_probs[0], + ) + + +def multi_from_logits( + behaviour_policy_logits, + target_policy_logits, + actions, + discounts, + rewards, + values, + bootstrap_value, + dist_class, + model, + behaviour_action_log_probs=None, + clip_rho_threshold=1.0, + clip_pg_rho_threshold=1.0, +): + r"""V-trace for softmax policies. 
+ + Calculates V-trace actor critic targets for softmax polices as described in + + "IMPALA: Scalable Distributed Deep-RL with + Importance Weighted Actor-Learner Architectures" + by Espeholt, Soyer, Munos et al. + + Target policy refers to the policy we are interested in improving and + behaviour policy refers to the policy that generated the given + rewards and actions. + + In the notation used throughout documentation and comments, T refers to the + time dimension ranging from 0 to T-1. B refers to the batch size and + ACTION_SPACE refers to the list of numbers each representing a number of + actions. + + Args: + behaviour_policy_logits: A list with length of ACTION_SPACE of float32 + tensors of shapes [T, B, ACTION_SPACE[0]], ..., + [T, B, ACTION_SPACE[-1]] with un-normalized log-probabilities + parameterizing the softmax behavior policy. + target_policy_logits: A list with length of ACTION_SPACE of float32 + tensors of shapes [T, B, ACTION_SPACE[0]], ..., + [T, B, ACTION_SPACE[-1]] with un-normalized log-probabilities + parameterizing the softmax target policy. + actions: A list with length of ACTION_SPACE of tensors of shapes + [T, B, ...], ..., [T, B, ...] + with actions sampled from the behavior policy. + discounts: A float32 tensor of shape [T, B] with the discount + encountered when following the behavior policy. + rewards: A float32 tensor of shape [T, B] with the rewards generated by + following the behavior policy. + values: A float32 tensor of shape [T, B] with the value function + estimates wrt. the target policy. + bootstrap_value: A float32 of shape [B] with the value function + estimate at time T. + dist_class: action distribution class for the logits. + model: backing ModelV2 instance + behaviour_action_log_probs: Precalculated values of the behavior + actions. + clip_rho_threshold: A scalar float32 tensor with the clipping threshold + for importance weights (rho) when calculating the baseline targets + (vs). rho^bar in the paper. 
+ clip_pg_rho_threshold: A scalar float32 tensor with the clipping + threshold on rho_s in: + \rho_s \delta log \pi(a|x) (r + \gamma v_{s+1} - V(x_s)). + + Returns: + A `VTraceFromLogitsReturns` namedtuple with the following fields: + vs: A float32 tensor of shape [T, B]. Can be used as target to train a + baseline (V(x_t) - vs_t)^2. + pg_advantages: A float 32 tensor of shape [T, B]. Can be used as an + estimate of the advantage in the calculation of policy gradients. + log_rhos: A float32 tensor of shape [T, B] containing the log + importance sampling weights (log rhos). + behaviour_action_log_probs: A float32 tensor of shape [T, B] containing + behaviour policy action log probabilities (log \mu(a_t)). + target_action_log_probs: A float32 tensor of shape [T, B] containing + target policy action probabilities (log \pi(a_t)). + """ + + behaviour_policy_logits = convert_to_torch_tensor( + behaviour_policy_logits, device="cpu" + ) + target_policy_logits = convert_to_torch_tensor(target_policy_logits, device="cpu") + actions = convert_to_torch_tensor(actions, device="cpu") + + # Make sure tensor ranks are as expected. + # The rest will be checked by from_action_log_probs. + for i in range(len(behaviour_policy_logits)): + assert len(behaviour_policy_logits[i].size()) == 3 + assert len(target_policy_logits[i].size()) == 3 + + target_action_log_probs = multi_log_probs_from_logits_and_actions( + target_policy_logits, actions, dist_class, model + ) + + if len(behaviour_policy_logits) > 1 or behaviour_action_log_probs is None: + # can't use precalculated values, recompute them. 
Note that + # recomputing won't work well for autoregressive action dists + # which may have variables not captured by 'logits' + behaviour_action_log_probs = multi_log_probs_from_logits_and_actions( + behaviour_policy_logits, actions, dist_class, model + ) + + behaviour_action_log_probs = convert_to_torch_tensor( + behaviour_action_log_probs, device="cpu" + ) + behaviour_action_log_probs = force_list(behaviour_action_log_probs) + # log_rhos = target_logp - behavior_logp + log_rhos = get_log_rhos(target_action_log_probs, behaviour_action_log_probs) + + vtrace_returns = from_importance_weights( + log_rhos=log_rhos, + discounts=discounts, + rewards=rewards, + values=values, + bootstrap_value=bootstrap_value, + clip_rho_threshold=clip_rho_threshold, + clip_pg_rho_threshold=clip_pg_rho_threshold, + ) + + return VTraceFromLogitsReturns( + log_rhos=log_rhos, + behaviour_action_log_probs=behaviour_action_log_probs, + target_action_log_probs=target_action_log_probs, + **vtrace_returns._asdict() + ) + + +def from_importance_weights( + log_rhos, + discounts, + rewards, + values, + bootstrap_value, + clip_rho_threshold=1.0, + clip_pg_rho_threshold=1.0, +): + r"""V-trace from log importance weights. + + Calculates V-trace actor critic targets as described in + + "IMPALA: Scalable Distributed Deep-RL with + Importance Weighted Actor-Learner Architectures" + by Espeholt, Soyer, Munos et al. + + In the notation used throughout documentation and comments, T refers to the + time dimension ranging from 0 to T-1. B refers to the batch size. This code + also supports the case where all tensors have the same number of additional + dimensions, e.g., `rewards` is [T, B, C], `values` is [T, B, C], + `bootstrap_value` is [B, C]. + + Args: + log_rhos: A float32 tensor of shape [T, B] representing the log + importance sampling weights, i.e. + log(target_policy(a) / behaviour_policy(a)). V-trace performs + operations on rhos in log-space for numerical stability. 
+ discounts: A float32 tensor of shape [T, B] with discounts encountered + when following the behaviour policy. + rewards: A float32 tensor of shape [T, B] containing rewards generated + by following the behaviour policy. + values: A float32 tensor of shape [T, B] with the value function + estimates wrt. the target policy. + bootstrap_value: A float32 of shape [B] with the value function + estimate at time T. + clip_rho_threshold: A scalar float32 tensor with the clipping threshold + for importance weights (rho) when calculating the baseline targets + (vs). rho^bar in the paper. If None, no clipping is applied. + clip_pg_rho_threshold: A scalar float32 tensor with the clipping + threshold on rho_s in + \rho_s \delta log \pi(a|x) (r + \gamma v_{s+1} - V(x_s)). + If None, no clipping is applied. + + Returns: + A VTraceReturns namedtuple (vs, pg_advantages) where: + vs: A float32 tensor of shape [T, B]. Can be used as target to + train a baseline (V(x_t) - vs_t)^2. + pg_advantages: A float32 tensor of shape [T, B]. Can be used as the + advantage in the calculation of policy gradients. + """ + log_rhos = convert_to_torch_tensor(log_rhos, device="cpu") + discounts = convert_to_torch_tensor(discounts, device="cpu") + rewards = convert_to_torch_tensor(rewards, device="cpu") + values = convert_to_torch_tensor(values, device="cpu") + bootstrap_value = convert_to_torch_tensor(bootstrap_value, device="cpu") + + # Make sure tensor ranks are consistent. + rho_rank = len(log_rhos.size()) # Usually 2. 
+ assert rho_rank == len(values.size()) + assert rho_rank - 1 == len(bootstrap_value.size()), "must have rank {}".format( + rho_rank - 1 + ) + assert rho_rank == len(discounts.size()) + assert rho_rank == len(rewards.size()) + + rhos = torch.exp(log_rhos) + if clip_rho_threshold is not None: + clipped_rhos = torch.clamp_max(rhos, clip_rho_threshold) + else: + clipped_rhos = rhos + + cs = torch.clamp_max(rhos, 1.0) + # Append bootstrapped value to get [v1, ..., v_t+1] + values_t_plus_1 = torch.cat( + [values[1:], torch.unsqueeze(bootstrap_value, 0)], dim=0 + ) + deltas = clipped_rhos * (rewards + discounts * values_t_plus_1 - values) + + vs_minus_v_xs = [torch.zeros_like(bootstrap_value)] + for i in reversed(range(len(discounts))): + discount_t, c_t, delta_t = discounts[i], cs[i], deltas[i] + vs_minus_v_xs.append(delta_t + discount_t * c_t * vs_minus_v_xs[-1]) + vs_minus_v_xs = torch.stack(vs_minus_v_xs[1:]) + # Reverse the results back to original order. + vs_minus_v_xs = torch.flip(vs_minus_v_xs, dims=[0]) + # Add V(x_s) to get v_s. + vs = vs_minus_v_xs + values + + # Advantage for policy gradient. + vs_t_plus_1 = torch.cat([vs[1:], torch.unsqueeze(bootstrap_value, 0)], dim=0) + if clip_pg_rho_threshold is not None: + clipped_pg_rhos = torch.clamp_max(rhos, clip_pg_rho_threshold) + else: + clipped_pg_rhos = rhos + pg_advantages = clipped_pg_rhos * (rewards + discounts * vs_t_plus_1 - values) + + # Make sure no gradients backpropagated through the returned values. + return VTraceReturns(vs=vs.detach(), pg_advantages=pg_advantages.detach()) + + +def get_log_rhos(target_action_log_probs, behaviour_action_log_probs): + """With the selected log_probs for multi-discrete actions of behavior + and target policies we compute the log_rhos for calculating the vtrace.""" + t = torch.stack(target_action_log_probs) + b = torch.stack(behaviour_action_log_probs) + log_rhos = torch.sum(t - b, dim=0) + return log_rhos