Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +1 -0
- .venv/lib/python3.11/site-packages/ray/core/src/ray/raylet/raylet +3 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/__init__.py +39 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/algorithm.py +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/bc/__init__.py +6 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/bc/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/bc/__pycache__/bc.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/bc/__pycache__/bc_catalog.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/bc/bc.py +120 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/bc/bc_catalog.py +112 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/bc/torch/__init__.py +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/bc/torch/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/bc/torch/default_bc_torch_rl_module.py +45 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/dqn/__init__.py +10 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/dqn/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/dqn/__pycache__/default_dqn_rl_module.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/dqn/__pycache__/distributional_q_tf_model.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/dqn/__pycache__/dqn.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/dqn/__pycache__/dqn_catalog.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/dqn/__pycache__/dqn_learner.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/dqn/__pycache__/dqn_tf_policy.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/dqn/__pycache__/dqn_torch_model.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/dqn/__pycache__/dqn_torch_policy.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/dqn/default_dqn_rl_module.py +206 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/dqn/distributional_q_tf_model.py +190 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/dqn/dqn.py +846 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/dqn/dqn_catalog.py +179 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/dqn/dqn_learner.py +120 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/dqn/dqn_tf_policy.py +511 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/dqn/dqn_torch_model.py +175 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/dqn/dqn_torch_policy.py +518 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/dqn/torch/__init__.py +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/dqn/torch/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/dqn/torch/__pycache__/default_dqn_torch_rl_module.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/dqn/torch/__pycache__/dqn_torch_learner.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/dqn/torch/default_dqn_torch_rl_module.py +327 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/dqn/torch/dqn_torch_learner.py +295 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/marwil/__init__.py +18 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/marwil/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/marwil/__pycache__/marwil.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/marwil/__pycache__/marwil_learner.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/marwil/__pycache__/marwil_tf_policy.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/marwil/__pycache__/marwil_torch_policy.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/marwil/marwil.py +540 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/marwil/marwil_learner.py +51 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/marwil/marwil_tf_policy.py +251 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/marwil/marwil_torch_policy.py +132 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/marwil/torch/__init__.py +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/marwil/torch/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/marwil/torch/__pycache__/marwil_torch_learner.cpython-311.pyc +0 -0
.gitattributes
CHANGED
|
@@ -102,3 +102,4 @@ tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/_
|
|
| 102 |
.venv/lib/python3.11/site-packages/pip/_vendor/distlib/w64.exe filter=lfs diff=lfs merge=lfs -text
|
| 103 |
.venv/lib/python3.11/site-packages/pip/_vendor/distlib/t64-arm.exe filter=lfs diff=lfs merge=lfs -text
|
| 104 |
.venv/lib/python3.11/site-packages/pip/_vendor/pyparsing/__pycache__/core.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 102 |
.venv/lib/python3.11/site-packages/pip/_vendor/distlib/w64.exe filter=lfs diff=lfs merge=lfs -text
|
| 103 |
.venv/lib/python3.11/site-packages/pip/_vendor/distlib/t64-arm.exe filter=lfs diff=lfs merge=lfs -text
|
| 104 |
.venv/lib/python3.11/site-packages/pip/_vendor/pyparsing/__pycache__/core.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
|
| 105 |
+
.venv/lib/python3.11/site-packages/ray/core/src/ray/raylet/raylet filter=lfs diff=lfs merge=lfs -text
|
.venv/lib/python3.11/site-packages/ray/core/src/ray/raylet/raylet
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:86e69ec6c72c9778ab73e0bb09c55fcf0c4eb711113ba808476e013c185754be
|
| 3 |
+
size 29047616
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/__init__.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from ray.rllib.algorithms.algorithm import Algorithm
|
| 2 |
+
from ray.rllib.algorithms.algorithm_config import AlgorithmConfig
|
| 3 |
+
from ray.rllib.algorithms.appo.appo import APPO, APPOConfig
|
| 4 |
+
from ray.rllib.algorithms.bc.bc import BC, BCConfig
|
| 5 |
+
from ray.rllib.algorithms.cql.cql import CQL, CQLConfig
|
| 6 |
+
from ray.rllib.algorithms.dqn.dqn import DQN, DQNConfig
|
| 7 |
+
from ray.rllib.algorithms.impala.impala import (
|
| 8 |
+
IMPALA,
|
| 9 |
+
IMPALAConfig,
|
| 10 |
+
Impala,
|
| 11 |
+
ImpalaConfig,
|
| 12 |
+
)
|
| 13 |
+
from ray.rllib.algorithms.marwil.marwil import MARWIL, MARWILConfig
|
| 14 |
+
from ray.rllib.algorithms.ppo.ppo import PPO, PPOConfig
|
| 15 |
+
from ray.rllib.algorithms.sac.sac import SAC, SACConfig
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
__all__ = [
|
| 19 |
+
"Algorithm",
|
| 20 |
+
"AlgorithmConfig",
|
| 21 |
+
"APPO",
|
| 22 |
+
"APPOConfig",
|
| 23 |
+
"BC",
|
| 24 |
+
"BCConfig",
|
| 25 |
+
"CQL",
|
| 26 |
+
"CQLConfig",
|
| 27 |
+
"DQN",
|
| 28 |
+
"DQNConfig",
|
| 29 |
+
"IMPALA",
|
| 30 |
+
"IMPALAConfig",
|
| 31 |
+
"Impala",
|
| 32 |
+
"ImpalaConfig",
|
| 33 |
+
"MARWIL",
|
| 34 |
+
"MARWILConfig",
|
| 35 |
+
"PPO",
|
| 36 |
+
"PPOConfig",
|
| 37 |
+
"SAC",
|
| 38 |
+
"SACConfig",
|
| 39 |
+
]
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/algorithm.py
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/bc/__init__.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from ray.rllib.algorithms.bc.bc import BCConfig, BC
|
| 2 |
+
|
| 3 |
+
__all__ = [
|
| 4 |
+
"BC",
|
| 5 |
+
"BCConfig",
|
| 6 |
+
]
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/bc/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (328 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/bc/__pycache__/bc.cpython-311.pyc
ADDED
|
Binary file (5.31 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/bc/__pycache__/bc_catalog.cpython-311.pyc
ADDED
|
Binary file (4.71 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/bc/bc.py
ADDED
|
@@ -0,0 +1,120 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from ray.rllib.algorithms.algorithm_config import AlgorithmConfig
|
| 2 |
+
from ray.rllib.algorithms.marwil.marwil import MARWIL, MARWILConfig
|
| 3 |
+
from ray.rllib.core.rl_module.rl_module import RLModuleSpec
|
| 4 |
+
from ray.rllib.utils.annotations import override
|
| 5 |
+
from ray.rllib.utils.typing import RLModuleSpecType
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class BCConfig(MARWILConfig):
|
| 9 |
+
"""Defines a configuration class from which a new BC Algorithm can be built
|
| 10 |
+
|
| 11 |
+
.. testcode::
|
| 12 |
+
:skipif: True
|
| 13 |
+
|
| 14 |
+
from ray.rllib.algorithms.bc import BCConfig
|
| 15 |
+
# Run this from the ray directory root.
|
| 16 |
+
config = BCConfig().training(lr=0.00001, gamma=0.99)
|
| 17 |
+
config = config.offline_data(
|
| 18 |
+
input_="./rllib/tests/data/cartpole/large.json")
|
| 19 |
+
|
| 20 |
+
# Build an Algorithm object from the config and run 1 training iteration.
|
| 21 |
+
algo = config.build()
|
| 22 |
+
algo.train()
|
| 23 |
+
|
| 24 |
+
.. testcode::
|
| 25 |
+
:skipif: True
|
| 26 |
+
|
| 27 |
+
from ray.rllib.algorithms.bc import BCConfig
|
| 28 |
+
from ray import tune
|
| 29 |
+
config = BCConfig()
|
| 30 |
+
# Print out some default values.
|
| 31 |
+
print(config.beta)
|
| 32 |
+
# Update the config object.
|
| 33 |
+
config.training(
|
| 34 |
+
lr=tune.grid_search([0.001, 0.0001]), beta=0.75
|
| 35 |
+
)
|
| 36 |
+
# Set the config object's data path.
|
| 37 |
+
# Run this from the ray directory root.
|
| 38 |
+
config.offline_data(
|
| 39 |
+
input_="./rllib/tests/data/cartpole/large.json"
|
| 40 |
+
)
|
| 41 |
+
# Set the config object's env, used for evaluation.
|
| 42 |
+
config.environment(env="CartPole-v1")
|
| 43 |
+
# Use to_dict() to get the old-style python config dict
|
| 44 |
+
# when running with tune.
|
| 45 |
+
tune.Tuner(
|
| 46 |
+
"BC",
|
| 47 |
+
param_space=config.to_dict(),
|
| 48 |
+
).fit()
|
| 49 |
+
"""
|
| 50 |
+
|
| 51 |
+
def __init__(self, algo_class=None):
|
| 52 |
+
super().__init__(algo_class=algo_class or BC)
|
| 53 |
+
|
| 54 |
+
# fmt: off
|
| 55 |
+
# __sphinx_doc_begin__
|
| 56 |
+
# No need to calculate advantages (or do anything else with the rewards).
|
| 57 |
+
self.beta = 0.0
|
| 58 |
+
# Advantages (calculated during postprocessing)
|
| 59 |
+
# not important for behavioral cloning.
|
| 60 |
+
self.postprocess_inputs = False
|
| 61 |
+
|
| 62 |
+
# Materialize only the mapped data. This is optimal as long
|
| 63 |
+
# as no connector in the connector pipeline holds a state.
|
| 64 |
+
self.materialize_data = False
|
| 65 |
+
self.materialize_mapped_data = True
|
| 66 |
+
# __sphinx_doc_end__
|
| 67 |
+
# fmt: on
|
| 68 |
+
|
| 69 |
+
@override(AlgorithmConfig)
|
| 70 |
+
def get_default_rl_module_spec(self) -> RLModuleSpecType:
|
| 71 |
+
if self.framework_str == "torch":
|
| 72 |
+
from ray.rllib.algorithms.bc.torch.default_bc_torch_rl_module import (
|
| 73 |
+
DefaultBCTorchRLModule,
|
| 74 |
+
)
|
| 75 |
+
|
| 76 |
+
return RLModuleSpec(module_class=DefaultBCTorchRLModule)
|
| 77 |
+
else:
|
| 78 |
+
raise ValueError(
|
| 79 |
+
f"The framework {self.framework_str} is not supported. "
|
| 80 |
+
"Use `torch` instead."
|
| 81 |
+
)
|
| 82 |
+
|
| 83 |
+
@override(AlgorithmConfig)
|
| 84 |
+
def build_learner_connector(
|
| 85 |
+
self,
|
| 86 |
+
input_observation_space,
|
| 87 |
+
input_action_space,
|
| 88 |
+
device=None,
|
| 89 |
+
):
|
| 90 |
+
pipeline = super().build_learner_connector(
|
| 91 |
+
input_observation_space=input_observation_space,
|
| 92 |
+
input_action_space=input_action_space,
|
| 93 |
+
device=device,
|
| 94 |
+
)
|
| 95 |
+
|
| 96 |
+
# Remove unneeded connectors from the MARWIL connector pipeline.
|
| 97 |
+
pipeline.remove("AddOneTsToEpisodesAndTruncate")
|
| 98 |
+
pipeline.remove("GeneralAdvantageEstimation")
|
| 99 |
+
|
| 100 |
+
return pipeline
|
| 101 |
+
|
| 102 |
+
@override(MARWILConfig)
|
| 103 |
+
def validate(self) -> None:
|
| 104 |
+
# Call super's validation method.
|
| 105 |
+
super().validate()
|
| 106 |
+
|
| 107 |
+
if self.beta != 0.0:
|
| 108 |
+
self._value_error("For behavioral cloning, `beta` parameter must be 0.0!")
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
class BC(MARWIL):
|
| 112 |
+
"""Behavioral Cloning (derived from MARWIL).
|
| 113 |
+
|
| 114 |
+
Uses MARWIL with beta force-set to 0.0.
|
| 115 |
+
"""
|
| 116 |
+
|
| 117 |
+
@classmethod
|
| 118 |
+
@override(MARWIL)
|
| 119 |
+
def get_default_config(cls) -> AlgorithmConfig:
|
| 120 |
+
return BCConfig()
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/bc/bc_catalog.py
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# __sphinx_doc_begin__
|
| 2 |
+
import gymnasium as gym
|
| 3 |
+
|
| 4 |
+
from ray.rllib.algorithms.ppo.ppo_catalog import _check_if_diag_gaussian
|
| 5 |
+
from ray.rllib.core.models.catalog import Catalog
|
| 6 |
+
from ray.rllib.core.models.configs import FreeLogStdMLPHeadConfig, MLPHeadConfig
|
| 7 |
+
from ray.rllib.core.models.base import Model
|
| 8 |
+
from ray.rllib.utils.annotations import OverrideToImplementCustomLogic
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class BCCatalog(Catalog):
|
| 12 |
+
"""The Catalog class used to build models for BC.
|
| 13 |
+
|
| 14 |
+
BCCatalog provides the following models:
|
| 15 |
+
- Encoder: The encoder used to encode the observations.
|
| 16 |
+
- Pi Head: The head used for the policy logits.
|
| 17 |
+
|
| 18 |
+
The default encoder is chosen by RLlib dependent on the observation space.
|
| 19 |
+
See `ray.rllib.core.models.encoders::Encoder` for details. To define the
|
| 20 |
+
network architecture use the `model_config_dict[fcnet_hiddens]` and
|
| 21 |
+
`model_config_dict[fcnet_activation]`.
|
| 22 |
+
|
| 23 |
+
To implement custom logic, override `BCCatalog.build_encoder()` or modify the
|
| 24 |
+
`EncoderConfig` at `BCCatalog.encoder_config`.
|
| 25 |
+
|
| 26 |
+
Any custom head can be built by overriding the `build_pi_head()` method.
|
| 27 |
+
Alternatively, the `PiHeadConfig` can be overridden to build a custom
|
| 28 |
+
policy head during runtime. To change solely the network architecture,
|
| 29 |
+
`model_config_dict["head_fcnet_hiddens"]` and
|
| 30 |
+
`model_config_dict["head_fcnet_activation"]` can be used.
|
| 31 |
+
"""
|
| 32 |
+
|
| 33 |
+
def __init__(
|
| 34 |
+
self,
|
| 35 |
+
observation_space: gym.Space,
|
| 36 |
+
action_space: gym.Space,
|
| 37 |
+
model_config_dict: dict,
|
| 38 |
+
):
|
| 39 |
+
"""Initializes the BCCatalog.
|
| 40 |
+
|
| 41 |
+
Args:
|
| 42 |
+
observation_space: The observation space if the Encoder.
|
| 43 |
+
action_space: The action space for the Pi Head.
|
| 44 |
+
model_cnfig_dict: The model config to use..
|
| 45 |
+
"""
|
| 46 |
+
super().__init__(
|
| 47 |
+
observation_space=observation_space,
|
| 48 |
+
action_space=action_space,
|
| 49 |
+
model_config_dict=model_config_dict,
|
| 50 |
+
)
|
| 51 |
+
|
| 52 |
+
self.pi_head_hiddens = self._model_config_dict["head_fcnet_hiddens"]
|
| 53 |
+
self.pi_head_activation = self._model_config_dict["head_fcnet_activation"]
|
| 54 |
+
|
| 55 |
+
# At this time we do not have the precise (framework-specific) action
|
| 56 |
+
# distribution class, i.e. we do not know the output dimension of the
|
| 57 |
+
# policy head. The config for the policy head is therefore build in the
|
| 58 |
+
# `self.build_pi_head()` method.
|
| 59 |
+
self.pi_head_config = None
|
| 60 |
+
|
| 61 |
+
@OverrideToImplementCustomLogic
|
| 62 |
+
def build_pi_head(self, framework: str) -> Model:
|
| 63 |
+
"""Builds the policy head.
|
| 64 |
+
|
| 65 |
+
The default behavior is to build the head from the pi_head_config.
|
| 66 |
+
This can be overridden to build a custom policy head as a means of configuring
|
| 67 |
+
the behavior of a BC specific RLModule implementation.
|
| 68 |
+
|
| 69 |
+
Args:
|
| 70 |
+
framework: The framework to use. Either "torch" or "tf2".
|
| 71 |
+
|
| 72 |
+
Returns:
|
| 73 |
+
The policy head.
|
| 74 |
+
"""
|
| 75 |
+
|
| 76 |
+
# Define the output dimension via the action distribution.
|
| 77 |
+
action_distribution_cls = self.get_action_dist_cls(framework=framework)
|
| 78 |
+
if self._model_config_dict["free_log_std"]:
|
| 79 |
+
_check_if_diag_gaussian(
|
| 80 |
+
action_distribution_cls=action_distribution_cls, framework=framework
|
| 81 |
+
)
|
| 82 |
+
is_diag_gaussian = True
|
| 83 |
+
else:
|
| 84 |
+
is_diag_gaussian = _check_if_diag_gaussian(
|
| 85 |
+
action_distribution_cls=action_distribution_cls,
|
| 86 |
+
framework=framework,
|
| 87 |
+
no_error=True,
|
| 88 |
+
)
|
| 89 |
+
required_output_dim = action_distribution_cls.required_input_dim(
|
| 90 |
+
space=self.action_space, model_config=self._model_config_dict
|
| 91 |
+
)
|
| 92 |
+
# With the action distribution class and the number of outputs defined,
|
| 93 |
+
# we can build the config for the policy head.
|
| 94 |
+
pi_head_config_cls = (
|
| 95 |
+
FreeLogStdMLPHeadConfig
|
| 96 |
+
if self._model_config_dict["free_log_std"]
|
| 97 |
+
else MLPHeadConfig
|
| 98 |
+
)
|
| 99 |
+
self.pi_head_config = pi_head_config_cls(
|
| 100 |
+
input_dims=self._latent_dims,
|
| 101 |
+
hidden_layer_dims=self.pi_head_hiddens,
|
| 102 |
+
hidden_layer_activation=self.pi_head_activation,
|
| 103 |
+
output_layer_dim=required_output_dim,
|
| 104 |
+
output_layer_activation="linear",
|
| 105 |
+
clip_log_std=is_diag_gaussian,
|
| 106 |
+
log_std_clip_param=self._model_config_dict.get("log_std_clip_param", 20),
|
| 107 |
+
)
|
| 108 |
+
|
| 109 |
+
return self.pi_head_config.build(framework=framework)
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
# __sphinx_doc_end__
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/bc/torch/__init__.py
ADDED
|
File without changes
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/bc/torch/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (202 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/bc/torch/default_bc_torch_rl_module.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import abc
|
| 2 |
+
from typing import Any, Dict
|
| 3 |
+
|
| 4 |
+
from ray.rllib.algorithms.bc.bc_catalog import BCCatalog
|
| 5 |
+
from ray.rllib.core.columns import Columns
|
| 6 |
+
from ray.rllib.core.models.base import ENCODER_OUT
|
| 7 |
+
from ray.rllib.core.rl_module.rl_module import RLModule
|
| 8 |
+
from ray.rllib.core.rl_module.torch.torch_rl_module import TorchRLModule
|
| 9 |
+
from ray.rllib.utils.annotations import override
|
| 10 |
+
from ray.util.annotations import DeveloperAPI
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
@DeveloperAPI
|
| 14 |
+
class DefaultBCTorchRLModule(TorchRLModule, abc.ABC):
|
| 15 |
+
"""The default TorchRLModule used, if no custom RLModule is provided.
|
| 16 |
+
|
| 17 |
+
Builds an encoder net based on the observation space.
|
| 18 |
+
Builds a pi head based on the action space.
|
| 19 |
+
|
| 20 |
+
Passes observations from the input batch through the encoder, then the pi head to
|
| 21 |
+
compute action logits.
|
| 22 |
+
"""
|
| 23 |
+
|
| 24 |
+
def __init__(self, *args, **kwargs):
|
| 25 |
+
catalog_class = kwargs.pop("catalog_class", None)
|
| 26 |
+
if catalog_class is None:
|
| 27 |
+
catalog_class = BCCatalog
|
| 28 |
+
super().__init__(*args, **kwargs, catalog_class=catalog_class)
|
| 29 |
+
|
| 30 |
+
@override(RLModule)
|
| 31 |
+
def setup(self):
|
| 32 |
+
# Build model components (encoder and pi head) from catalog.
|
| 33 |
+
super().setup()
|
| 34 |
+
self._encoder = self.catalog.build_encoder(framework=self.framework)
|
| 35 |
+
self._pi_head = self.catalog.build_pi_head(framework=self.framework)
|
| 36 |
+
|
| 37 |
+
@override(TorchRLModule)
|
| 38 |
+
def _forward(self, batch: Dict, **kwargs) -> Dict[str, Any]:
|
| 39 |
+
"""Generic BC forward pass (for all phases of training/evaluation)."""
|
| 40 |
+
# Encoder embeddings.
|
| 41 |
+
encoder_outs = self._encoder(batch)
|
| 42 |
+
# Action dist inputs.
|
| 43 |
+
return {
|
| 44 |
+
Columns.ACTION_DIST_INPUTS: self._pi_head(encoder_outs[ENCODER_OUT]),
|
| 45 |
+
}
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dqn/__init__.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from ray.rllib.algorithms.dqn.dqn import DQN, DQNConfig
|
| 2 |
+
from ray.rllib.algorithms.dqn.dqn_tf_policy import DQNTFPolicy
|
| 3 |
+
from ray.rllib.algorithms.dqn.dqn_torch_policy import DQNTorchPolicy
|
| 4 |
+
|
| 5 |
+
__all__ = [
|
| 6 |
+
"DQN",
|
| 7 |
+
"DQNConfig",
|
| 8 |
+
"DQNTFPolicy",
|
| 9 |
+
"DQNTorchPolicy",
|
| 10 |
+
]
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dqn/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (533 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dqn/__pycache__/default_dqn_rl_module.cpython-311.pyc
ADDED
|
Binary file (10.3 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dqn/__pycache__/distributional_q_tf_model.cpython-311.pyc
ADDED
|
Binary file (10.2 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dqn/__pycache__/dqn.cpython-311.pyc
ADDED
|
Binary file (36.1 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dqn/__pycache__/dqn_catalog.cpython-311.pyc
ADDED
|
Binary file (7.63 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dqn/__pycache__/dqn_learner.cpython-311.pyc
ADDED
|
Binary file (6.29 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dqn/__pycache__/dqn_tf_policy.cpython-311.pyc
ADDED
|
Binary file (21.8 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dqn/__pycache__/dqn_torch_model.cpython-311.pyc
ADDED
|
Binary file (8.19 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dqn/__pycache__/dqn_torch_policy.cpython-311.pyc
ADDED
|
Binary file (20.1 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dqn/default_dqn_rl_module.py
ADDED
|
@@ -0,0 +1,206 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import abc
|
| 2 |
+
from typing import Any, Dict, List, Tuple, Union
|
| 3 |
+
|
| 4 |
+
from ray.rllib.algorithms.sac.sac_learner import QF_PREDS
|
| 5 |
+
from ray.rllib.core.columns import Columns
|
| 6 |
+
from ray.rllib.core.learner.utils import make_target_network
|
| 7 |
+
from ray.rllib.core.models.base import Encoder, Model
|
| 8 |
+
from ray.rllib.core.models.specs.typing import SpecType
|
| 9 |
+
from ray.rllib.core.rl_module.apis import QNetAPI, InferenceOnlyAPI, TargetNetworkAPI
|
| 10 |
+
from ray.rllib.core.rl_module.rl_module import RLModule
|
| 11 |
+
from ray.rllib.utils.annotations import (
|
| 12 |
+
override,
|
| 13 |
+
OverrideToImplementCustomLogic,
|
| 14 |
+
)
|
| 15 |
+
from ray.rllib.utils.schedules.scheduler import Scheduler
|
| 16 |
+
from ray.rllib.utils.typing import NetworkType, TensorType
|
| 17 |
+
from ray.util.annotations import DeveloperAPI
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
ATOMS = "atoms"
|
| 21 |
+
QF_LOGITS = "qf_logits"
|
| 22 |
+
QF_NEXT_PREDS = "qf_next_preds"
|
| 23 |
+
QF_PROBS = "qf_probs"
|
| 24 |
+
QF_TARGET_NEXT_PREDS = "qf_target_next_preds"
|
| 25 |
+
QF_TARGET_NEXT_PROBS = "qf_target_next_probs"
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
@DeveloperAPI
|
| 29 |
+
class DefaultDQNRLModule(RLModule, InferenceOnlyAPI, TargetNetworkAPI, QNetAPI):
|
| 30 |
+
@override(RLModule)
|
| 31 |
+
def setup(self):
|
| 32 |
+
# If a dueling architecture is used.
|
| 33 |
+
self.uses_dueling: bool = self.model_config.get("dueling")
|
| 34 |
+
# If double Q learning is used.
|
| 35 |
+
self.uses_double_q: bool = self.model_config.get("double_q")
|
| 36 |
+
# The number of atoms for a distribution support.
|
| 37 |
+
self.num_atoms: int = self.model_config.get("num_atoms")
|
| 38 |
+
# If distributional learning is requested configure the support.
|
| 39 |
+
if self.num_atoms > 1:
|
| 40 |
+
self.v_min: float = self.model_config.get("v_min")
|
| 41 |
+
self.v_max: float = self.model_config.get("v_max")
|
| 42 |
+
# The epsilon scheduler for epsilon greedy exploration.
|
| 43 |
+
self.epsilon_schedule = Scheduler(
|
| 44 |
+
fixed_value_or_schedule=self.model_config["epsilon"],
|
| 45 |
+
framework=self.framework,
|
| 46 |
+
)
|
| 47 |
+
|
| 48 |
+
# Build the encoder for the advantage and value streams. Note,
|
| 49 |
+
# the same encoder is used.
|
| 50 |
+
# Note further, by using the base encoder the correct encoder
|
| 51 |
+
# is chosen for the observation space used.
|
| 52 |
+
self.encoder = self.catalog.build_encoder(framework=self.framework)
|
| 53 |
+
|
| 54 |
+
# Build heads.
|
| 55 |
+
self.af = self.catalog.build_af_head(framework=self.framework)
|
| 56 |
+
if self.uses_dueling:
|
| 57 |
+
# If in a dueling setting setup the value function head.
|
| 58 |
+
self.vf = self.catalog.build_vf_head(framework=self.framework)
|
| 59 |
+
|
| 60 |
+
@override(InferenceOnlyAPI)
|
| 61 |
+
def get_non_inference_attributes(self) -> List[str]:
|
| 62 |
+
return ["_target_encoder", "_target_af"] + (
|
| 63 |
+
["_target_vf"] if self.uses_dueling else []
|
| 64 |
+
)
|
| 65 |
+
|
| 66 |
+
@override(TargetNetworkAPI)
|
| 67 |
+
def make_target_networks(self) -> None:
|
| 68 |
+
self._target_encoder = make_target_network(self.encoder)
|
| 69 |
+
self._target_af = make_target_network(self.af)
|
| 70 |
+
if self.uses_dueling:
|
| 71 |
+
self._target_vf = make_target_network(self.vf)
|
| 72 |
+
|
| 73 |
+
@override(TargetNetworkAPI)
|
| 74 |
+
def get_target_network_pairs(self) -> List[Tuple[NetworkType, NetworkType]]:
|
| 75 |
+
return [(self.encoder, self._target_encoder), (self.af, self._target_af)] + (
|
| 76 |
+
# If we have a dueling architecture we need to update the value stream
|
| 77 |
+
# target, too.
|
| 78 |
+
[
|
| 79 |
+
(self.vf, self._target_vf),
|
| 80 |
+
]
|
| 81 |
+
if self.uses_dueling
|
| 82 |
+
else []
|
| 83 |
+
)
|
| 84 |
+
|
| 85 |
+
@override(TargetNetworkAPI)
|
| 86 |
+
def forward_target(self, batch: Dict[str, Any]) -> Dict[str, Any]:
|
| 87 |
+
"""Computes Q-values from the target network.
|
| 88 |
+
|
| 89 |
+
Note, these can be accompanied by logits and probabilities
|
| 90 |
+
in case of distributional Q-learning, i.e. `self.num_atoms > 1`.
|
| 91 |
+
|
| 92 |
+
Args:
|
| 93 |
+
batch: The batch received in the forward pass.
|
| 94 |
+
|
| 95 |
+
Results:
|
| 96 |
+
A dictionary containing the target Q-value predictions ("qf_preds")
|
| 97 |
+
and in case of distributional Q-learning in addition to the target
|
| 98 |
+
Q-value predictions ("qf_preds") the support atoms ("atoms"), the target
|
| 99 |
+
Q-logits ("qf_logits"), and the probabilities ("qf_probs").
|
| 100 |
+
"""
|
| 101 |
+
# If we have a dueling architecture we have to add the value stream.
|
| 102 |
+
return self._qf_forward_helper(
|
| 103 |
+
batch,
|
| 104 |
+
self._target_encoder,
|
| 105 |
+
(
|
| 106 |
+
{"af": self._target_af, "vf": self._target_vf}
|
| 107 |
+
if self.uses_dueling
|
| 108 |
+
else self._target_af
|
| 109 |
+
),
|
| 110 |
+
)
|
| 111 |
+
|
| 112 |
+
@override(QNetAPI)
|
| 113 |
+
def compute_q_values(self, batch: Dict[str, TensorType]) -> Dict[str, TensorType]:
|
| 114 |
+
"""Computes Q-values, given encoder, q-net and (optionally), advantage net.
|
| 115 |
+
|
| 116 |
+
Note, these can be accompanied by logits and probabilities
|
| 117 |
+
in case of distributional Q-learning, i.e. `self.num_atoms > 1`.
|
| 118 |
+
|
| 119 |
+
Args:
|
| 120 |
+
batch: The batch received in the forward pass.
|
| 121 |
+
|
| 122 |
+
Results:
|
| 123 |
+
A dictionary containing the Q-value predictions ("qf_preds")
|
| 124 |
+
and in case of distributional Q-learning - in addition to the Q-value
|
| 125 |
+
predictions ("qf_preds") - the support atoms ("atoms"), the Q-logits
|
| 126 |
+
("qf_logits"), and the probabilities ("qf_probs").
|
| 127 |
+
"""
|
| 128 |
+
# If we have a dueling architecture we have to add the value stream.
|
| 129 |
+
return self._qf_forward_helper(
|
| 130 |
+
batch,
|
| 131 |
+
self.encoder,
|
| 132 |
+
{"af": self.af, "vf": self.vf} if self.uses_dueling else self.af,
|
| 133 |
+
)
|
| 134 |
+
|
| 135 |
+
@override(RLModule)
|
| 136 |
+
def get_initial_state(self) -> dict:
|
| 137 |
+
if hasattr(self.encoder, "get_initial_state"):
|
| 138 |
+
return self.encoder.get_initial_state()
|
| 139 |
+
else:
|
| 140 |
+
return {}
|
| 141 |
+
|
| 142 |
+
@override(RLModule)
|
| 143 |
+
def input_specs_train(self) -> SpecType:
|
| 144 |
+
return [
|
| 145 |
+
Columns.OBS,
|
| 146 |
+
Columns.ACTIONS,
|
| 147 |
+
Columns.NEXT_OBS,
|
| 148 |
+
]
|
| 149 |
+
|
| 150 |
+
@override(RLModule)
|
| 151 |
+
def output_specs_exploration(self) -> SpecType:
|
| 152 |
+
return [Columns.ACTIONS]
|
| 153 |
+
|
| 154 |
+
@override(RLModule)
|
| 155 |
+
def output_specs_inference(self) -> SpecType:
|
| 156 |
+
return [Columns.ACTIONS]
|
| 157 |
+
|
| 158 |
+
@override(RLModule)
|
| 159 |
+
def output_specs_train(self) -> SpecType:
|
| 160 |
+
return [
|
| 161 |
+
QF_PREDS,
|
| 162 |
+
QF_TARGET_NEXT_PREDS,
|
| 163 |
+
# Add keys for double-Q setup.
|
| 164 |
+
*([QF_NEXT_PREDS] if self.uses_double_q else []),
|
| 165 |
+
# Add keys for distributional Q-learning.
|
| 166 |
+
*(
|
| 167 |
+
[
|
| 168 |
+
ATOMS,
|
| 169 |
+
QF_LOGITS,
|
| 170 |
+
QF_PROBS,
|
| 171 |
+
QF_TARGET_NEXT_PROBS,
|
| 172 |
+
]
|
| 173 |
+
# We add these keys only when learning a distribution.
|
| 174 |
+
if self.num_atoms > 1
|
| 175 |
+
else []
|
| 176 |
+
),
|
| 177 |
+
]
|
| 178 |
+
|
| 179 |
+
@abc.abstractmethod
|
| 180 |
+
@OverrideToImplementCustomLogic
|
| 181 |
+
def _qf_forward_helper(
|
| 182 |
+
self,
|
| 183 |
+
batch: Dict[str, TensorType],
|
| 184 |
+
encoder: Encoder,
|
| 185 |
+
head: Union[Model, Dict[str, Model]],
|
| 186 |
+
) -> Dict[str, TensorType]:
|
| 187 |
+
"""Computes Q-values.
|
| 188 |
+
|
| 189 |
+
This is a helper function that takes care of all different cases,
|
| 190 |
+
i.e. if we use a dueling architecture or not and if we use distributional
|
| 191 |
+
Q-learning or not.
|
| 192 |
+
|
| 193 |
+
Args:
|
| 194 |
+
batch: The batch received in the forward pass.
|
| 195 |
+
encoder: The encoder network to use. Here we have a single encoder
|
| 196 |
+
for all heads (Q or advantages and value in case of a dueling
|
| 197 |
+
architecture).
|
| 198 |
+
head: Either a head model or a dictionary of head model (dueling
|
| 199 |
+
architecture) containing advantage and value stream heads.
|
| 200 |
+
|
| 201 |
+
Returns:
|
| 202 |
+
In case of expectation learning the Q-value predictions ("qf_preds")
|
| 203 |
+
and in case of distributional Q-learning in addition to the predictions
|
| 204 |
+
the atoms ("atoms"), the Q-value predictions ("qf_preds"), the Q-logits
|
| 205 |
+
("qf_logits") and the probabilities for the support atoms ("qf_probs").
|
| 206 |
+
"""
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dqn/distributional_q_tf_model.py
ADDED
|
@@ -0,0 +1,190 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tensorflow model for DQN"""
|
| 2 |
+
|
| 3 |
+
from typing import List
|
| 4 |
+
|
| 5 |
+
import gymnasium as gym
|
| 6 |
+
from ray.rllib.models.tf.layers import NoisyLayer
|
| 7 |
+
from ray.rllib.models.tf.tf_modelv2 import TFModelV2
|
| 8 |
+
from ray.rllib.utils.annotations import OldAPIStack
|
| 9 |
+
from ray.rllib.utils.framework import try_import_tf
|
| 10 |
+
from ray.rllib.utils.typing import ModelConfigDict, TensorType
|
| 11 |
+
|
| 12 |
+
tf1, tf, tfv = try_import_tf()
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
@OldAPIStack
|
| 16 |
+
class DistributionalQTFModel(TFModelV2):
|
| 17 |
+
"""Extension of standard TFModel to provide distributional Q values.
|
| 18 |
+
|
| 19 |
+
It also supports options for noisy nets and parameter space noise.
|
| 20 |
+
|
| 21 |
+
Data flow:
|
| 22 |
+
obs -> forward() -> model_out
|
| 23 |
+
model_out -> get_q_value_distributions() -> Q(s, a) atoms
|
| 24 |
+
model_out -> get_state_value() -> V(s)
|
| 25 |
+
|
| 26 |
+
Note that this class by itself is not a valid model unless you
|
| 27 |
+
implement forward() in a subclass."""
|
| 28 |
+
|
| 29 |
+
def __init__(
|
| 30 |
+
self,
|
| 31 |
+
obs_space: gym.spaces.Space,
|
| 32 |
+
action_space: gym.spaces.Space,
|
| 33 |
+
num_outputs: int,
|
| 34 |
+
model_config: ModelConfigDict,
|
| 35 |
+
name: str,
|
| 36 |
+
q_hiddens=(256,),
|
| 37 |
+
dueling: bool = False,
|
| 38 |
+
num_atoms: int = 1,
|
| 39 |
+
use_noisy: bool = False,
|
| 40 |
+
v_min: float = -10.0,
|
| 41 |
+
v_max: float = 10.0,
|
| 42 |
+
sigma0: float = 0.5,
|
| 43 |
+
# TODO(sven): Move `add_layer_norm` into ModelCatalog as
|
| 44 |
+
# generic option, then error if we use ParameterNoise as
|
| 45 |
+
# Exploration type and do not have any LayerNorm layers in
|
| 46 |
+
# the net.
|
| 47 |
+
add_layer_norm: bool = False,
|
| 48 |
+
):
|
| 49 |
+
"""Initialize variables of this model.
|
| 50 |
+
|
| 51 |
+
Extra model kwargs:
|
| 52 |
+
q_hiddens (List[int]): List of layer-sizes after(!) the
|
| 53 |
+
Advantages(A)/Value(V)-split. Hence, each of the A- and V-
|
| 54 |
+
branches will have this structure of Dense layers. To define
|
| 55 |
+
the NN before this A/V-split, use - as always -
|
| 56 |
+
config["model"]["fcnet_hiddens"].
|
| 57 |
+
dueling: Whether to build the advantage(A)/value(V) heads
|
| 58 |
+
for DDQN. If True, Q-values are calculated as:
|
| 59 |
+
Q = (A - mean[A]) + V. If False, raw NN output is interpreted
|
| 60 |
+
as Q-values.
|
| 61 |
+
num_atoms: If >1, enables distributional DQN.
|
| 62 |
+
use_noisy: Use noisy nets.
|
| 63 |
+
v_min: Min value support for distributional DQN.
|
| 64 |
+
v_max: Max value support for distributional DQN.
|
| 65 |
+
sigma0 (float): Initial value of noisy layers.
|
| 66 |
+
add_layer_norm: Enable layer norm (for param noise).
|
| 67 |
+
|
| 68 |
+
Note that the core layers for forward() are not defined here, this
|
| 69 |
+
only defines the layers for the Q head. Those layers for forward()
|
| 70 |
+
should be defined in subclasses of DistributionalQModel.
|
| 71 |
+
"""
|
| 72 |
+
super(DistributionalQTFModel, self).__init__(
|
| 73 |
+
obs_space, action_space, num_outputs, model_config, name
|
| 74 |
+
)
|
| 75 |
+
|
| 76 |
+
# setup the Q head output (i.e., model for get_q_values)
|
| 77 |
+
self.model_out = tf.keras.layers.Input(shape=(num_outputs,), name="model_out")
|
| 78 |
+
|
| 79 |
+
def build_action_value(prefix: str, model_out: TensorType) -> List[TensorType]:
|
| 80 |
+
if q_hiddens:
|
| 81 |
+
action_out = model_out
|
| 82 |
+
for i in range(len(q_hiddens)):
|
| 83 |
+
if use_noisy:
|
| 84 |
+
action_out = NoisyLayer(
|
| 85 |
+
"{}hidden_{}".format(prefix, i), q_hiddens[i], sigma0
|
| 86 |
+
)(action_out)
|
| 87 |
+
elif add_layer_norm:
|
| 88 |
+
action_out = tf.keras.layers.Dense(
|
| 89 |
+
units=q_hiddens[i], activation=tf.nn.relu
|
| 90 |
+
)(action_out)
|
| 91 |
+
action_out = tf.keras.layers.LayerNormalization()(action_out)
|
| 92 |
+
else:
|
| 93 |
+
action_out = tf.keras.layers.Dense(
|
| 94 |
+
units=q_hiddens[i],
|
| 95 |
+
activation=tf.nn.relu,
|
| 96 |
+
name="hidden_%d" % i,
|
| 97 |
+
)(action_out)
|
| 98 |
+
else:
|
| 99 |
+
# Avoid postprocessing the outputs. This enables custom models
|
| 100 |
+
# to be used for parametric action DQN.
|
| 101 |
+
action_out = model_out
|
| 102 |
+
|
| 103 |
+
if use_noisy:
|
| 104 |
+
action_scores = NoisyLayer(
|
| 105 |
+
"{}output".format(prefix),
|
| 106 |
+
self.action_space.n * num_atoms,
|
| 107 |
+
sigma0,
|
| 108 |
+
activation=None,
|
| 109 |
+
)(action_out)
|
| 110 |
+
elif q_hiddens:
|
| 111 |
+
action_scores = tf.keras.layers.Dense(
|
| 112 |
+
units=self.action_space.n * num_atoms, activation=None
|
| 113 |
+
)(action_out)
|
| 114 |
+
else:
|
| 115 |
+
action_scores = model_out
|
| 116 |
+
|
| 117 |
+
if num_atoms > 1:
|
| 118 |
+
# Distributional Q-learning uses a discrete support z
|
| 119 |
+
# to represent the action value distribution
|
| 120 |
+
z = tf.range(num_atoms, dtype=tf.float32)
|
| 121 |
+
z = v_min + z * (v_max - v_min) / float(num_atoms - 1)
|
| 122 |
+
|
| 123 |
+
def _layer(x):
|
| 124 |
+
support_logits_per_action = tf.reshape(
|
| 125 |
+
tensor=x, shape=(-1, self.action_space.n, num_atoms)
|
| 126 |
+
)
|
| 127 |
+
support_prob_per_action = tf.nn.softmax(
|
| 128 |
+
logits=support_logits_per_action
|
| 129 |
+
)
|
| 130 |
+
x = tf.reduce_sum(input_tensor=z * support_prob_per_action, axis=-1)
|
| 131 |
+
logits = support_logits_per_action
|
| 132 |
+
dist = support_prob_per_action
|
| 133 |
+
return [x, z, support_logits_per_action, logits, dist]
|
| 134 |
+
|
| 135 |
+
return tf.keras.layers.Lambda(_layer)(action_scores)
|
| 136 |
+
else:
|
| 137 |
+
logits = tf.expand_dims(tf.ones_like(action_scores), -1)
|
| 138 |
+
dist = tf.expand_dims(tf.ones_like(action_scores), -1)
|
| 139 |
+
return [action_scores, logits, dist]
|
| 140 |
+
|
| 141 |
+
def build_state_score(prefix: str, model_out: TensorType) -> TensorType:
|
| 142 |
+
state_out = model_out
|
| 143 |
+
for i in range(len(q_hiddens)):
|
| 144 |
+
if use_noisy:
|
| 145 |
+
state_out = NoisyLayer(
|
| 146 |
+
"{}dueling_hidden_{}".format(prefix, i), q_hiddens[i], sigma0
|
| 147 |
+
)(state_out)
|
| 148 |
+
else:
|
| 149 |
+
state_out = tf.keras.layers.Dense(
|
| 150 |
+
units=q_hiddens[i], activation=tf.nn.relu
|
| 151 |
+
)(state_out)
|
| 152 |
+
if add_layer_norm:
|
| 153 |
+
state_out = tf.keras.layers.LayerNormalization()(state_out)
|
| 154 |
+
if use_noisy:
|
| 155 |
+
state_score = NoisyLayer(
|
| 156 |
+
"{}dueling_output".format(prefix),
|
| 157 |
+
num_atoms,
|
| 158 |
+
sigma0,
|
| 159 |
+
activation=None,
|
| 160 |
+
)(state_out)
|
| 161 |
+
else:
|
| 162 |
+
state_score = tf.keras.layers.Dense(units=num_atoms, activation=None)(
|
| 163 |
+
state_out
|
| 164 |
+
)
|
| 165 |
+
return state_score
|
| 166 |
+
|
| 167 |
+
q_out = build_action_value(name + "/action_value/", self.model_out)
|
| 168 |
+
self.q_value_head = tf.keras.Model(self.model_out, q_out)
|
| 169 |
+
|
| 170 |
+
if dueling:
|
| 171 |
+
state_out = build_state_score(name + "/state_value/", self.model_out)
|
| 172 |
+
self.state_value_head = tf.keras.Model(self.model_out, state_out)
|
| 173 |
+
|
| 174 |
+
def get_q_value_distributions(self, model_out: TensorType) -> List[TensorType]:
|
| 175 |
+
"""Returns distributional values for Q(s, a) given a state embedding.
|
| 176 |
+
|
| 177 |
+
Override this in your custom model to customize the Q output head.
|
| 178 |
+
|
| 179 |
+
Args:
|
| 180 |
+
model_out: embedding from the model layers
|
| 181 |
+
|
| 182 |
+
Returns:
|
| 183 |
+
(action_scores, logits, dist) if num_atoms == 1, otherwise
|
| 184 |
+
(action_scores, z, support_logits_per_action, logits, dist)
|
| 185 |
+
"""
|
| 186 |
+
return self.q_value_head(model_out)
|
| 187 |
+
|
| 188 |
+
def get_state_value(self, model_out: TensorType) -> TensorType:
|
| 189 |
+
"""Returns the state value prediction for the given state embedding."""
|
| 190 |
+
return self.state_value_head(model_out)
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dqn/dqn.py
ADDED
|
@@ -0,0 +1,846 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Deep Q-Networks (DQN, Rainbow, Parametric DQN)
|
| 3 |
+
==============================================
|
| 4 |
+
|
| 5 |
+
This file defines the distributed Algorithm class for the Deep Q-Networks
|
| 6 |
+
algorithm. See `dqn_[tf|torch]_policy.py` for the definition of the policies.
|
| 7 |
+
|
| 8 |
+
Detailed documentation:
|
| 9 |
+
https://docs.ray.io/en/master/rllib-algorithms.html#deep-q-networks-dqn-rainbow-parametric-dqn
|
| 10 |
+
""" # noqa: E501
|
| 11 |
+
|
| 12 |
+
from collections import defaultdict
|
| 13 |
+
import logging
|
| 14 |
+
from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union
|
| 15 |
+
import numpy as np
|
| 16 |
+
|
| 17 |
+
from ray.rllib.algorithms.algorithm import Algorithm
|
| 18 |
+
from ray.rllib.algorithms.algorithm_config import AlgorithmConfig, NotProvided
|
| 19 |
+
from ray.rllib.algorithms.dqn.dqn_tf_policy import DQNTFPolicy
|
| 20 |
+
from ray.rllib.algorithms.dqn.dqn_torch_policy import DQNTorchPolicy
|
| 21 |
+
from ray.rllib.core.learner import Learner
|
| 22 |
+
from ray.rllib.core.rl_module.rl_module import RLModuleSpec
|
| 23 |
+
from ray.rllib.execution.rollout_ops import (
|
| 24 |
+
synchronous_parallel_sample,
|
| 25 |
+
)
|
| 26 |
+
from ray.rllib.policy.sample_batch import MultiAgentBatch
|
| 27 |
+
from ray.rllib.execution.train_ops import (
|
| 28 |
+
train_one_step,
|
| 29 |
+
multi_gpu_train_one_step,
|
| 30 |
+
)
|
| 31 |
+
from ray.rllib.policy.policy import Policy
|
| 32 |
+
from ray.rllib.utils import deep_update
|
| 33 |
+
from ray.rllib.utils.annotations import override
|
| 34 |
+
from ray.rllib.utils.numpy import convert_to_numpy
|
| 35 |
+
from ray.rllib.utils.replay_buffers.utils import (
|
| 36 |
+
update_priorities_in_episode_replay_buffer,
|
| 37 |
+
update_priorities_in_replay_buffer,
|
| 38 |
+
validate_buffer_config,
|
| 39 |
+
)
|
| 40 |
+
from ray.rllib.utils.typing import ResultDict
|
| 41 |
+
from ray.rllib.utils.metrics import (
|
| 42 |
+
ALL_MODULES,
|
| 43 |
+
ENV_RUNNER_RESULTS,
|
| 44 |
+
ENV_RUNNER_SAMPLING_TIMER,
|
| 45 |
+
LAST_TARGET_UPDATE_TS,
|
| 46 |
+
LEARNER_RESULTS,
|
| 47 |
+
LEARNER_UPDATE_TIMER,
|
| 48 |
+
NUM_AGENT_STEPS_SAMPLED,
|
| 49 |
+
NUM_AGENT_STEPS_SAMPLED_LIFETIME,
|
| 50 |
+
NUM_ENV_STEPS_SAMPLED,
|
| 51 |
+
NUM_ENV_STEPS_SAMPLED_LIFETIME,
|
| 52 |
+
NUM_TARGET_UPDATES,
|
| 53 |
+
REPLAY_BUFFER_ADD_DATA_TIMER,
|
| 54 |
+
REPLAY_BUFFER_RESULTS,
|
| 55 |
+
REPLAY_BUFFER_SAMPLE_TIMER,
|
| 56 |
+
REPLAY_BUFFER_UPDATE_PRIOS_TIMER,
|
| 57 |
+
SAMPLE_TIMER,
|
| 58 |
+
SYNCH_WORKER_WEIGHTS_TIMER,
|
| 59 |
+
TD_ERROR_KEY,
|
| 60 |
+
TIMERS,
|
| 61 |
+
)
|
| 62 |
+
from ray.rllib.utils.deprecation import DEPRECATED_VALUE
|
| 63 |
+
from ray.rllib.utils.replay_buffers.utils import sample_min_n_steps_from_buffer
|
| 64 |
+
from ray.rllib.utils.typing import (
|
| 65 |
+
LearningRateOrSchedule,
|
| 66 |
+
RLModuleSpecType,
|
| 67 |
+
SampleBatchType,
|
| 68 |
+
)
|
| 69 |
+
|
| 70 |
+
logger = logging.getLogger(__name__)
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
class DQNConfig(AlgorithmConfig):
|
| 74 |
+
r"""Defines a configuration class from which a DQN Algorithm can be built.
|
| 75 |
+
|
| 76 |
+
.. testcode::
|
| 77 |
+
|
| 78 |
+
from ray.rllib.algorithms.dqn.dqn import DQNConfig
|
| 79 |
+
|
| 80 |
+
config = (
|
| 81 |
+
DQNConfig()
|
| 82 |
+
.environment("CartPole-v1")
|
| 83 |
+
.training(replay_buffer_config={
|
| 84 |
+
"type": "PrioritizedEpisodeReplayBuffer",
|
| 85 |
+
"capacity": 60000,
|
| 86 |
+
"alpha": 0.5,
|
| 87 |
+
"beta": 0.5,
|
| 88 |
+
})
|
| 89 |
+
.env_runners(num_env_runners=1)
|
| 90 |
+
)
|
| 91 |
+
algo = config.build()
|
| 92 |
+
algo.train()
|
| 93 |
+
algo.stop()
|
| 94 |
+
|
| 95 |
+
.. testcode::
|
| 96 |
+
|
| 97 |
+
from ray.rllib.algorithms.dqn.dqn import DQNConfig
|
| 98 |
+
from ray import air
|
| 99 |
+
from ray import tune
|
| 100 |
+
|
| 101 |
+
config = (
|
| 102 |
+
DQNConfig()
|
| 103 |
+
.environment("CartPole-v1")
|
| 104 |
+
.training(
|
| 105 |
+
num_atoms=tune.grid_search([1,])
|
| 106 |
+
)
|
| 107 |
+
)
|
| 108 |
+
tune.Tuner(
|
| 109 |
+
"DQN",
|
| 110 |
+
run_config=air.RunConfig(stop={"training_iteration":1}),
|
| 111 |
+
param_space=config,
|
| 112 |
+
).fit()
|
| 113 |
+
|
| 114 |
+
.. testoutput::
|
| 115 |
+
:hide:
|
| 116 |
+
|
| 117 |
+
...
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
"""
|
| 121 |
+
|
| 122 |
+
def __init__(self, algo_class=None):
|
| 123 |
+
"""Initializes a DQNConfig instance."""
|
| 124 |
+
self.exploration_config = {
|
| 125 |
+
"type": "EpsilonGreedy",
|
| 126 |
+
"initial_epsilon": 1.0,
|
| 127 |
+
"final_epsilon": 0.02,
|
| 128 |
+
"epsilon_timesteps": 10000,
|
| 129 |
+
}
|
| 130 |
+
|
| 131 |
+
super().__init__(algo_class=algo_class or DQN)
|
| 132 |
+
|
| 133 |
+
# Overrides of AlgorithmConfig defaults
|
| 134 |
+
# `env_runners()`
|
| 135 |
+
# Set to `self.n_step`, if 'auto'.
|
| 136 |
+
self.rollout_fragment_length: Union[int, str] = "auto"
|
| 137 |
+
# New stack uses `epsilon` as either a constant value or a scheduler
|
| 138 |
+
# defined like this.
|
| 139 |
+
# TODO (simon): Ensure that users can understand how to provide epsilon.
|
| 140 |
+
# (sven): Should we add this to `self.env_runners(epsilon=..)`?
|
| 141 |
+
self.epsilon = [(0, 1.0), (10000, 0.05)]
|
| 142 |
+
|
| 143 |
+
# `training()`
|
| 144 |
+
self.grad_clip = 40.0
|
| 145 |
+
# Note: Only when using enable_rl_module_and_learner=True can the clipping mode
|
| 146 |
+
# be configured by the user. On the old API stack, RLlib will always clip by
|
| 147 |
+
# global_norm, no matter the value of `grad_clip_by`.
|
| 148 |
+
self.grad_clip_by = "global_norm"
|
| 149 |
+
self.lr = 5e-4
|
| 150 |
+
self.train_batch_size = 32
|
| 151 |
+
|
| 152 |
+
# `evaluation()`
|
| 153 |
+
self.evaluation(evaluation_config=AlgorithmConfig.overrides(explore=False))
|
| 154 |
+
|
| 155 |
+
# `reporting()`
|
| 156 |
+
self.min_time_s_per_iteration = None
|
| 157 |
+
self.min_sample_timesteps_per_iteration = 1000
|
| 158 |
+
|
| 159 |
+
# DQN specific config settings.
|
| 160 |
+
# fmt: off
|
| 161 |
+
# __sphinx_doc_begin__
|
| 162 |
+
self.target_network_update_freq = 500
|
| 163 |
+
self.num_steps_sampled_before_learning_starts = 1000
|
| 164 |
+
self.store_buffer_in_checkpoints = False
|
| 165 |
+
self.adam_epsilon = 1e-8
|
| 166 |
+
|
| 167 |
+
self.tau = 1.0
|
| 168 |
+
|
| 169 |
+
self.num_atoms = 1
|
| 170 |
+
self.v_min = -10.0
|
| 171 |
+
self.v_max = 10.0
|
| 172 |
+
self.noisy = False
|
| 173 |
+
self.sigma0 = 0.5
|
| 174 |
+
self.dueling = True
|
| 175 |
+
self.hiddens = [256]
|
| 176 |
+
self.double_q = True
|
| 177 |
+
self.n_step = 1
|
| 178 |
+
self.before_learn_on_batch = None
|
| 179 |
+
self.training_intensity = None
|
| 180 |
+
self.td_error_loss_fn = "huber"
|
| 181 |
+
self.categorical_distribution_temperature = 1.0
|
| 182 |
+
# The burn-in for stateful `RLModule`s.
|
| 183 |
+
self.burn_in_len = 0
|
| 184 |
+
|
| 185 |
+
# Replay buffer configuration.
|
| 186 |
+
self.replay_buffer_config = {
|
| 187 |
+
"type": "PrioritizedEpisodeReplayBuffer",
|
| 188 |
+
# Size of the replay buffer. Note that if async_updates is set,
|
| 189 |
+
# then each worker will have a replay buffer of this size.
|
| 190 |
+
"capacity": 50000,
|
| 191 |
+
"alpha": 0.6,
|
| 192 |
+
# Beta parameter for sampling from prioritized replay buffer.
|
| 193 |
+
"beta": 0.4,
|
| 194 |
+
}
|
| 195 |
+
# fmt: on
|
| 196 |
+
# __sphinx_doc_end__
|
| 197 |
+
|
| 198 |
+
self.lr_schedule = None # @OldAPIStack
|
| 199 |
+
|
| 200 |
+
# Deprecated
|
| 201 |
+
self.buffer_size = DEPRECATED_VALUE
|
| 202 |
+
self.prioritized_replay = DEPRECATED_VALUE
|
| 203 |
+
self.learning_starts = DEPRECATED_VALUE
|
| 204 |
+
self.replay_batch_size = DEPRECATED_VALUE
|
| 205 |
+
# Can not use DEPRECATED_VALUE here because -1 is a common config value
|
| 206 |
+
self.replay_sequence_length = None
|
| 207 |
+
self.prioritized_replay_alpha = DEPRECATED_VALUE
|
| 208 |
+
self.prioritized_replay_beta = DEPRECATED_VALUE
|
| 209 |
+
self.prioritized_replay_eps = DEPRECATED_VALUE
|
| 210 |
+
|
| 211 |
+
@override(AlgorithmConfig)
|
| 212 |
+
def training(
|
| 213 |
+
self,
|
| 214 |
+
*,
|
| 215 |
+
target_network_update_freq: Optional[int] = NotProvided,
|
| 216 |
+
replay_buffer_config: Optional[dict] = NotProvided,
|
| 217 |
+
store_buffer_in_checkpoints: Optional[bool] = NotProvided,
|
| 218 |
+
lr_schedule: Optional[List[List[Union[int, float]]]] = NotProvided,
|
| 219 |
+
epsilon: Optional[LearningRateOrSchedule] = NotProvided,
|
| 220 |
+
adam_epsilon: Optional[float] = NotProvided,
|
| 221 |
+
grad_clip: Optional[int] = NotProvided,
|
| 222 |
+
num_steps_sampled_before_learning_starts: Optional[int] = NotProvided,
|
| 223 |
+
tau: Optional[float] = NotProvided,
|
| 224 |
+
num_atoms: Optional[int] = NotProvided,
|
| 225 |
+
v_min: Optional[float] = NotProvided,
|
| 226 |
+
v_max: Optional[float] = NotProvided,
|
| 227 |
+
noisy: Optional[bool] = NotProvided,
|
| 228 |
+
sigma0: Optional[float] = NotProvided,
|
| 229 |
+
dueling: Optional[bool] = NotProvided,
|
| 230 |
+
hiddens: Optional[int] = NotProvided,
|
| 231 |
+
double_q: Optional[bool] = NotProvided,
|
| 232 |
+
n_step: Optional[Union[int, Tuple[int, int]]] = NotProvided,
|
| 233 |
+
before_learn_on_batch: Callable[
|
| 234 |
+
[Type[MultiAgentBatch], List[Type[Policy]], Type[int]],
|
| 235 |
+
Type[MultiAgentBatch],
|
| 236 |
+
] = NotProvided,
|
| 237 |
+
training_intensity: Optional[float] = NotProvided,
|
| 238 |
+
td_error_loss_fn: Optional[str] = NotProvided,
|
| 239 |
+
categorical_distribution_temperature: Optional[float] = NotProvided,
|
| 240 |
+
burn_in_len: Optional[int] = NotProvided,
|
| 241 |
+
**kwargs,
|
| 242 |
+
) -> "DQNConfig":
|
| 243 |
+
"""Sets the training related configuration.
|
| 244 |
+
|
| 245 |
+
Args:
|
| 246 |
+
target_network_update_freq: Update the target network every
|
| 247 |
+
`target_network_update_freq` sample steps.
|
| 248 |
+
replay_buffer_config: Replay buffer config.
|
| 249 |
+
Examples:
|
| 250 |
+
{
|
| 251 |
+
"_enable_replay_buffer_api": True,
|
| 252 |
+
"type": "MultiAgentReplayBuffer",
|
| 253 |
+
"capacity": 50000,
|
| 254 |
+
"replay_sequence_length": 1,
|
| 255 |
+
}
|
| 256 |
+
- OR -
|
| 257 |
+
{
|
| 258 |
+
"_enable_replay_buffer_api": True,
|
| 259 |
+
"type": "MultiAgentPrioritizedReplayBuffer",
|
| 260 |
+
"capacity": 50000,
|
| 261 |
+
"prioritized_replay_alpha": 0.6,
|
| 262 |
+
"prioritized_replay_beta": 0.4,
|
| 263 |
+
"prioritized_replay_eps": 1e-6,
|
| 264 |
+
"replay_sequence_length": 1,
|
| 265 |
+
}
|
| 266 |
+
- Where -
|
| 267 |
+
prioritized_replay_alpha: Alpha parameter controls the degree of
|
| 268 |
+
prioritization in the buffer. In other words, when a buffer sample has
|
| 269 |
+
a higher temporal-difference error, with how much more probability
|
| 270 |
+
should it drawn to use to update the parametrized Q-network. 0.0
|
| 271 |
+
corresponds to uniform probability. Setting much above 1.0 may quickly
|
| 272 |
+
result as the sampling distribution could become heavily “pointy” with
|
| 273 |
+
low entropy.
|
| 274 |
+
prioritized_replay_beta: Beta parameter controls the degree of
|
| 275 |
+
importance sampling which suppresses the influence of gradient updates
|
| 276 |
+
from samples that have higher probability of being sampled via alpha
|
| 277 |
+
parameter and the temporal-difference error.
|
| 278 |
+
prioritized_replay_eps: Epsilon parameter sets the baseline probability
|
| 279 |
+
for sampling so that when the temporal-difference error of a sample is
|
| 280 |
+
zero, there is still a chance of drawing the sample.
|
| 281 |
+
store_buffer_in_checkpoints: Set this to True, if you want the contents of
|
| 282 |
+
your buffer(s) to be stored in any saved checkpoints as well.
|
| 283 |
+
Warnings will be created if:
|
| 284 |
+
- This is True AND restoring from a checkpoint that contains no buffer
|
| 285 |
+
data.
|
| 286 |
+
- This is False AND restoring from a checkpoint that does contain
|
| 287 |
+
buffer data.
|
| 288 |
+
epsilon: Epsilon exploration schedule. In the format of [[timestep, value],
|
| 289 |
+
[timestep, value], ...]. A schedule must start from
|
| 290 |
+
timestep 0.
|
| 291 |
+
adam_epsilon: Adam optimizer's epsilon hyper parameter.
|
| 292 |
+
grad_clip: If not None, clip gradients during optimization at this value.
|
| 293 |
+
num_steps_sampled_before_learning_starts: Number of timesteps to collect
|
| 294 |
+
from rollout workers before we start sampling from replay buffers for
|
| 295 |
+
learning. Whether we count this in agent steps or environment steps
|
| 296 |
+
depends on config.multi_agent(count_steps_by=..).
|
| 297 |
+
tau: Update the target by \tau * policy + (1-\tau) * target_policy.
|
| 298 |
+
num_atoms: Number of atoms for representing the distribution of return.
|
| 299 |
+
When this is greater than 1, distributional Q-learning is used.
|
| 300 |
+
v_min: Minimum value estimation
|
| 301 |
+
v_max: Maximum value estimation
|
| 302 |
+
noisy: Whether to use noisy network to aid exploration. This adds parametric
|
| 303 |
+
noise to the model weights.
|
| 304 |
+
sigma0: Control the initial parameter noise for noisy nets.
|
| 305 |
+
dueling: Whether to use dueling DQN.
|
| 306 |
+
hiddens: Dense-layer setup for each the advantage branch and the value
|
| 307 |
+
branch
|
| 308 |
+
double_q: Whether to use double DQN.
|
| 309 |
+
n_step: N-step target updates. If >1, sars' tuples in trajectories will be
|
| 310 |
+
postprocessed to become sa[discounted sum of R][s t+n] tuples. An
|
| 311 |
+
integer will be interpreted as a fixed n-step value. If a tuple of 2
|
| 312 |
+
ints is provided here, the n-step value will be drawn for each sample(!)
|
| 313 |
+
in the train batch from a uniform distribution over the closed interval
|
| 314 |
+
defined by `[n_step[0], n_step[1]]`.
|
| 315 |
+
before_learn_on_batch: Callback to run before learning on a multi-agent
|
| 316 |
+
batch of experiences.
|
| 317 |
+
training_intensity: The intensity with which to update the model (vs
|
| 318 |
+
collecting samples from the env).
|
| 319 |
+
If None, uses "natural" values of:
|
| 320 |
+
`train_batch_size` / (`rollout_fragment_length` x `num_env_runners` x
|
| 321 |
+
`num_envs_per_env_runner`).
|
| 322 |
+
If not None, will make sure that the ratio between timesteps inserted
|
| 323 |
+
into and sampled from the buffer matches the given values.
|
| 324 |
+
Example:
|
| 325 |
+
training_intensity=1000.0
|
| 326 |
+
train_batch_size=250
|
| 327 |
+
rollout_fragment_length=1
|
| 328 |
+
num_env_runners=1 (or 0)
|
| 329 |
+
num_envs_per_env_runner=1
|
| 330 |
+
-> natural value = 250 / 1 = 250.0
|
| 331 |
+
-> will make sure that replay+train op will be executed 4x as often as
|
| 332 |
+
rollout+insert op (4 * 250 = 1000).
|
| 333 |
+
See: rllib/algorithms/dqn/dqn.py::calculate_rr_weights for further
|
| 334 |
+
details.
|
| 335 |
+
td_error_loss_fn: "huber" or "mse". loss function for calculating TD error
|
| 336 |
+
when num_atoms is 1. Note that if num_atoms is > 1, this parameter
|
| 337 |
+
is simply ignored, and softmax cross entropy loss will be used.
|
| 338 |
+
categorical_distribution_temperature: Set the temperature parameter used
|
| 339 |
+
by Categorical action distribution. A valid temperature is in the range
|
| 340 |
+
of [0, 1]. Note that this mostly affects evaluation since TD error uses
|
| 341 |
+
argmax for return calculation.
|
| 342 |
+
burn_in_len: The burn-in period for a stateful RLModule. It allows the
|
| 343 |
+
Learner to utilize the initial `burn_in_len` steps in a replay sequence
|
| 344 |
+
solely for unrolling the network and establishing a typical starting
|
| 345 |
+
state. The network is then updated on the remaining steps of the
|
| 346 |
+
sequence. This process helps mitigate issues stemming from a poor
|
| 347 |
+
initial state - zero or an outdated recorded state. Consider setting
|
| 348 |
+
this parameter to a positive integer if your stateful RLModule faces
|
| 349 |
+
convergence challenges or exhibits signs of catastrophic forgetting.
|
| 350 |
+
|
| 351 |
+
Returns:
|
| 352 |
+
This updated AlgorithmConfig object.
|
| 353 |
+
"""
|
| 354 |
+
# Pass kwargs onto super's `training()` method.
|
| 355 |
+
super().training(**kwargs)
|
| 356 |
+
|
| 357 |
+
if target_network_update_freq is not NotProvided:
|
| 358 |
+
self.target_network_update_freq = target_network_update_freq
|
| 359 |
+
if replay_buffer_config is not NotProvided:
|
| 360 |
+
# Override entire `replay_buffer_config` if `type` key changes.
|
| 361 |
+
# Update, if `type` key remains the same or is not specified.
|
| 362 |
+
new_replay_buffer_config = deep_update(
|
| 363 |
+
{"replay_buffer_config": self.replay_buffer_config},
|
| 364 |
+
{"replay_buffer_config": replay_buffer_config},
|
| 365 |
+
False,
|
| 366 |
+
["replay_buffer_config"],
|
| 367 |
+
["replay_buffer_config"],
|
| 368 |
+
)
|
| 369 |
+
self.replay_buffer_config = new_replay_buffer_config["replay_buffer_config"]
|
| 370 |
+
if store_buffer_in_checkpoints is not NotProvided:
|
| 371 |
+
self.store_buffer_in_checkpoints = store_buffer_in_checkpoints
|
| 372 |
+
if lr_schedule is not NotProvided:
|
| 373 |
+
self.lr_schedule = lr_schedule
|
| 374 |
+
if epsilon is not NotProvided:
|
| 375 |
+
self.epsilon = epsilon
|
| 376 |
+
if adam_epsilon is not NotProvided:
|
| 377 |
+
self.adam_epsilon = adam_epsilon
|
| 378 |
+
if grad_clip is not NotProvided:
|
| 379 |
+
self.grad_clip = grad_clip
|
| 380 |
+
if num_steps_sampled_before_learning_starts is not NotProvided:
|
| 381 |
+
self.num_steps_sampled_before_learning_starts = (
|
| 382 |
+
num_steps_sampled_before_learning_starts
|
| 383 |
+
)
|
| 384 |
+
if tau is not NotProvided:
|
| 385 |
+
self.tau = tau
|
| 386 |
+
if num_atoms is not NotProvided:
|
| 387 |
+
self.num_atoms = num_atoms
|
| 388 |
+
if v_min is not NotProvided:
|
| 389 |
+
self.v_min = v_min
|
| 390 |
+
if v_max is not NotProvided:
|
| 391 |
+
self.v_max = v_max
|
| 392 |
+
if noisy is not NotProvided:
|
| 393 |
+
self.noisy = noisy
|
| 394 |
+
if sigma0 is not NotProvided:
|
| 395 |
+
self.sigma0 = sigma0
|
| 396 |
+
if dueling is not NotProvided:
|
| 397 |
+
self.dueling = dueling
|
| 398 |
+
if hiddens is not NotProvided:
|
| 399 |
+
self.hiddens = hiddens
|
| 400 |
+
if double_q is not NotProvided:
|
| 401 |
+
self.double_q = double_q
|
| 402 |
+
if n_step is not NotProvided:
|
| 403 |
+
self.n_step = n_step
|
| 404 |
+
if before_learn_on_batch is not NotProvided:
|
| 405 |
+
self.before_learn_on_batch = before_learn_on_batch
|
| 406 |
+
if training_intensity is not NotProvided:
|
| 407 |
+
self.training_intensity = training_intensity
|
| 408 |
+
if td_error_loss_fn is not NotProvided:
|
| 409 |
+
self.td_error_loss_fn = td_error_loss_fn
|
| 410 |
+
if categorical_distribution_temperature is not NotProvided:
|
| 411 |
+
self.categorical_distribution_temperature = (
|
| 412 |
+
categorical_distribution_temperature
|
| 413 |
+
)
|
| 414 |
+
if burn_in_len is not NotProvided:
|
| 415 |
+
self.burn_in_len = burn_in_len
|
| 416 |
+
|
| 417 |
+
return self
|
| 418 |
+
|
| 419 |
+
@override(AlgorithmConfig)
|
| 420 |
+
def validate(self) -> None:
|
| 421 |
+
# Call super's validation method.
|
| 422 |
+
super().validate()
|
| 423 |
+
|
| 424 |
+
if self.enable_rl_module_and_learner:
|
| 425 |
+
# `lr_schedule` checking.
|
| 426 |
+
if self.lr_schedule is not None:
|
| 427 |
+
self._value_error(
|
| 428 |
+
"`lr_schedule` is deprecated and must be None! Use the "
|
| 429 |
+
"`lr` setting to setup a schedule."
|
| 430 |
+
)
|
| 431 |
+
else:
|
| 432 |
+
if not self.in_evaluation:
|
| 433 |
+
validate_buffer_config(self)
|
| 434 |
+
|
| 435 |
+
# TODO (simon): Find a clean solution to deal with configuration configs
|
| 436 |
+
# when using the new API stack.
|
| 437 |
+
if self.exploration_config["type"] == "ParameterNoise":
|
| 438 |
+
if self.batch_mode != "complete_episodes":
|
| 439 |
+
self._value_error(
|
| 440 |
+
"ParameterNoise Exploration requires `batch_mode` to be "
|
| 441 |
+
"'complete_episodes'. Try setting `config.env_runners("
|
| 442 |
+
"batch_mode='complete_episodes')`."
|
| 443 |
+
)
|
| 444 |
+
if self.noisy:
|
| 445 |
+
self._value_error(
|
| 446 |
+
"ParameterNoise Exploration and `noisy` network cannot be"
|
| 447 |
+
" used at the same time!"
|
| 448 |
+
)
|
| 449 |
+
|
| 450 |
+
if self.td_error_loss_fn not in ["huber", "mse"]:
|
| 451 |
+
self._value_error("`td_error_loss_fn` must be 'huber' or 'mse'!")
|
| 452 |
+
|
| 453 |
+
# Check rollout_fragment_length to be compatible with n_step.
|
| 454 |
+
if (
|
| 455 |
+
not self.in_evaluation
|
| 456 |
+
and self.rollout_fragment_length != "auto"
|
| 457 |
+
and self.rollout_fragment_length < self.n_step
|
| 458 |
+
):
|
| 459 |
+
self._value_error(
|
| 460 |
+
f"Your `rollout_fragment_length` ({self.rollout_fragment_length}) is "
|
| 461 |
+
f"smaller than `n_step` ({self.n_step})! "
|
| 462 |
+
"Try setting config.env_runners(rollout_fragment_length="
|
| 463 |
+
f"{self.n_step})."
|
| 464 |
+
)
|
| 465 |
+
|
| 466 |
+
# Check, if the `max_seq_len` is longer then the burn-in.
|
| 467 |
+
if (
|
| 468 |
+
"max_seq_len" in self.model_config
|
| 469 |
+
and 0 < self.model_config["max_seq_len"] <= self.burn_in_len
|
| 470 |
+
):
|
| 471 |
+
raise ValueError(
|
| 472 |
+
f"Your defined `burn_in_len`={self.burn_in_len} is larger or equal "
|
| 473 |
+
f"`max_seq_len`={self.model_config['max_seq_len']}! Either decrease "
|
| 474 |
+
"the `burn_in_len` or increase your `max_seq_len`."
|
| 475 |
+
)
|
| 476 |
+
|
| 477 |
+
# Validate that we use the corresponding `EpisodeReplayBuffer` when using
|
| 478 |
+
# episodes.
|
| 479 |
+
# TODO (sven, simon): Implement the multi-agent case for replay buffers.
|
| 480 |
+
from ray.rllib.utils.replay_buffers.episode_replay_buffer import (
|
| 481 |
+
EpisodeReplayBuffer,
|
| 482 |
+
)
|
| 483 |
+
|
| 484 |
+
if (
|
| 485 |
+
self.enable_env_runner_and_connector_v2
|
| 486 |
+
and not isinstance(self.replay_buffer_config["type"], str)
|
| 487 |
+
and not issubclass(self.replay_buffer_config["type"], EpisodeReplayBuffer)
|
| 488 |
+
):
|
| 489 |
+
self._value_error(
|
| 490 |
+
"When using the new `EnvRunner API` the replay buffer must be of type "
|
| 491 |
+
"`EpisodeReplayBuffer`."
|
| 492 |
+
)
|
| 493 |
+
elif not self.enable_env_runner_and_connector_v2 and (
|
| 494 |
+
(
|
| 495 |
+
isinstance(self.replay_buffer_config["type"], str)
|
| 496 |
+
and "Episode" in self.replay_buffer_config["type"]
|
| 497 |
+
)
|
| 498 |
+
or issubclass(self.replay_buffer_config["type"], EpisodeReplayBuffer)
|
| 499 |
+
):
|
| 500 |
+
self._value_error(
|
| 501 |
+
"When using the old API stack the replay buffer must not be of type "
|
| 502 |
+
"`EpisodeReplayBuffer`! We suggest you use the following config to run "
|
| 503 |
+
"DQN on the old API stack: `config.training(replay_buffer_config={"
|
| 504 |
+
"'type': 'MultiAgentPrioritizedReplayBuffer', "
|
| 505 |
+
"'prioritized_replay_alpha': [alpha], "
|
| 506 |
+
"'prioritized_replay_beta': [beta], "
|
| 507 |
+
"'prioritized_replay_eps': [eps], "
|
| 508 |
+
"})`."
|
| 509 |
+
)
|
| 510 |
+
|
| 511 |
+
@override(AlgorithmConfig)
|
| 512 |
+
def get_rollout_fragment_length(self, worker_index: int = 0) -> int:
|
| 513 |
+
if self.rollout_fragment_length == "auto":
|
| 514 |
+
return (
|
| 515 |
+
self.n_step[1]
|
| 516 |
+
if isinstance(self.n_step, (tuple, list))
|
| 517 |
+
else self.n_step
|
| 518 |
+
)
|
| 519 |
+
else:
|
| 520 |
+
return self.rollout_fragment_length
|
| 521 |
+
|
| 522 |
+
@override(AlgorithmConfig)
|
| 523 |
+
def get_default_rl_module_spec(self) -> RLModuleSpecType:
|
| 524 |
+
if self.framework_str == "torch":
|
| 525 |
+
from ray.rllib.algorithms.dqn.torch.default_dqn_torch_rl_module import (
|
| 526 |
+
DefaultDQNTorchRLModule,
|
| 527 |
+
)
|
| 528 |
+
|
| 529 |
+
return RLModuleSpec(
|
| 530 |
+
module_class=DefaultDQNTorchRLModule,
|
| 531 |
+
model_config=self.model_config,
|
| 532 |
+
)
|
| 533 |
+
else:
|
| 534 |
+
raise ValueError(
|
| 535 |
+
f"The framework {self.framework_str} is not supported! "
|
| 536 |
+
"Use `config.framework('torch')` instead."
|
| 537 |
+
)
|
| 538 |
+
|
| 539 |
+
@property
|
| 540 |
+
@override(AlgorithmConfig)
|
| 541 |
+
def _model_config_auto_includes(self) -> Dict[str, Any]:
|
| 542 |
+
return super()._model_config_auto_includes | {
|
| 543 |
+
"double_q": self.double_q,
|
| 544 |
+
"dueling": self.dueling,
|
| 545 |
+
"epsilon": self.epsilon,
|
| 546 |
+
"num_atoms": self.num_atoms,
|
| 547 |
+
"std_init": self.sigma0,
|
| 548 |
+
"v_max": self.v_max,
|
| 549 |
+
"v_min": self.v_min,
|
| 550 |
+
}
|
| 551 |
+
|
| 552 |
+
@override(AlgorithmConfig)
|
| 553 |
+
def get_default_learner_class(self) -> Union[Type["Learner"], str]:
|
| 554 |
+
if self.framework_str == "torch":
|
| 555 |
+
from ray.rllib.algorithms.dqn.torch.dqn_torch_learner import (
|
| 556 |
+
DQNTorchLearner,
|
| 557 |
+
)
|
| 558 |
+
|
| 559 |
+
return DQNTorchLearner
|
| 560 |
+
else:
|
| 561 |
+
raise ValueError(
|
| 562 |
+
f"The framework {self.framework_str} is not supported! "
|
| 563 |
+
"Use `config.framework('torch')` instead."
|
| 564 |
+
)
|
| 565 |
+
|
| 566 |
+
|
| 567 |
+
def calculate_rr_weights(config: AlgorithmConfig) -> List[float]:
    """Calculate the round robin weights for the rollout and train steps"""
    # No training intensity configured -> alternate 1:1 between
    # sampling and training.
    if not config.training_intensity:
        return [1, 1]

    # "Native ratio" = [train-batch-size] / [env steps collected per
    # sampling round]. This relates freshly rollout-collected data to the
    # amount of (possibly old) data pulled from the replay buffer.
    # The "+ 1" accounts for the local worker, which usually collects
    # experiences as well; the max(.., 1) avoids division by zero.
    worker_count = max(config.num_env_runners + 1, 1)
    steps_per_round = (
        config.get_rollout_fragment_length()
        * config.num_envs_per_env_runner
        * worker_count
    )
    native_ratio = config.total_train_batch_size / steps_per_round

    # `training_intensity` is expressed as (steps_replayed / steps_sampled),
    # so normalize it by the native ratio.
    replay_to_sample = config.training_intensity / native_ratio
    if replay_to_sample < 1:
        # Sample more rounds per train round.
        return [int(np.round(1 / replay_to_sample)), 1]
    # Train more rounds per sample round.
    return [1, int(np.round(replay_to_sample))]
|
| 592 |
+
|
| 593 |
+
|
| 594 |
+
class DQN(Algorithm):
    """The DQN (Deep Q-Network) Algorithm.

    Dispatches between the new API stack (RLModule/Learner/EnvRunner) and
    the old API stack (Policy/RolloutWorker) inside `training_step`.
    """

    @classmethod
    @override(Algorithm)
    def get_default_config(cls) -> AlgorithmConfig:
        # Fresh default config object for this algorithm.
        return DQNConfig()

    @classmethod
    @override(Algorithm)
    def get_default_policy_class(
        cls, config: AlgorithmConfig
    ) -> Optional[Type[Policy]]:
        # Old API stack only: choose the Policy class by framework.
        if config["framework"] == "torch":
            return DQNTorchPolicy
        else:
            return DQNTFPolicy

    @override(Algorithm)
    def training_step(self) -> None:
        """DQN training iteration function.

        Each training iteration, we:
        - Sample (MultiAgentBatch) from workers.
        - Store new samples in replay buffer.
        - Sample training batch (MultiAgentBatch) from replay buffer.
        - Learn on training batch.
        - Update remote workers' new policy weights.
        - Update target network every `target_network_update_freq` sample steps.
        - Return all collected metrics for the iteration.

        Returns:
            The results dict from executing the training iteration.
        """
        # Old API stack (Policy, RolloutWorker, Connector).
        if not self.config.enable_env_runner_and_connector_v2:
            return self._training_step_old_api_stack()

        # New API stack (RLModule, Learner, EnvRunner, ConnectorV2).
        return self._training_step_new_api_stack()

    def _training_step_new_api_stack(self):
        # One iteration on the new API stack: alternate between
        # (sample + store into buffer) and (sample from buffer + update),
        # with round-robin weights derived from `training_intensity`.
        # Alternate between storing and sampling and training.
        store_weight, sample_and_train_weight = calculate_rr_weights(self.config)

        # Run multiple sampling + storing to buffer iterations.
        for _ in range(store_weight):
            with self.metrics.log_time((TIMERS, ENV_RUNNER_SAMPLING_TIMER)):
                # Sample in parallel from workers.
                episodes, env_runner_results = synchronous_parallel_sample(
                    worker_set=self.env_runner_group,
                    concat=True,
                    sample_timeout_s=self.config.sample_timeout_s,
                    _uses_new_env_runners=True,
                    _return_metrics=True,
                )
            # Reduce EnvRunner metrics over the n EnvRunners.
            self.metrics.merge_and_log_n_dicts(
                env_runner_results, key=ENV_RUNNER_RESULTS
            )

            # Add the sampled experiences to the replay buffer.
            with self.metrics.log_time((TIMERS, REPLAY_BUFFER_ADD_DATA_TIMER)):
                self.local_replay_buffer.add(episodes)

        # Determine how many steps have been sampled so far (lifetime), in
        # either agent steps (summed over agents) or env steps.
        if self.config.count_steps_by == "agent_steps":
            current_ts = sum(
                self.metrics.peek(
                    (ENV_RUNNER_RESULTS, NUM_AGENT_STEPS_SAMPLED_LIFETIME), default={}
                ).values()
            )
        else:
            current_ts = self.metrics.peek(
                (ENV_RUNNER_RESULTS, NUM_ENV_STEPS_SAMPLED_LIFETIME), default=0
            )

        # If enough experiences have been sampled start training.
        if current_ts >= self.config.num_steps_sampled_before_learning_starts:
            # Run multiple sample-from-buffer and update iterations.
            for _ in range(sample_and_train_weight):
                # Sample a list of episodes used for learning from the replay buffer.
                with self.metrics.log_time((TIMERS, REPLAY_BUFFER_SAMPLE_TIMER)):

                    episodes = self.local_replay_buffer.sample(
                        num_items=self.config.total_train_batch_size,
                        n_step=self.config.n_step,
                        # In case an `EpisodeReplayBuffer` is used we need to provide
                        # the sequence length.
                        batch_length_T=self.env_runner.module.is_stateful()
                        * self.config.model_config.get("max_seq_len", 0),
                        lookback=int(self.env_runner.module.is_stateful()),
                        # TODO (simon): Implement `burn_in_len` in SAC and remove this
                        # if-else clause.
                        min_batch_length_T=self.config.burn_in_len
                        if hasattr(self.config, "burn_in_len")
                        else 0,
                        gamma=self.config.gamma,
                        beta=self.config.replay_buffer_config.get("beta"),
                        sample_episodes=True,
                    )

                # Get the replay buffer metrics.
                replay_buffer_results = self.local_replay_buffer.get_metrics()
                self.metrics.merge_and_log_n_dicts(
                    [replay_buffer_results], key=REPLAY_BUFFER_RESULTS
                )

                # Perform an update on the buffer-sampled train batch.
                with self.metrics.log_time((TIMERS, LEARNER_UPDATE_TIMER)):
                    learner_results = self.learner_group.update_from_episodes(
                        episodes=episodes,
                        timesteps={
                            NUM_ENV_STEPS_SAMPLED_LIFETIME: (
                                self.metrics.peek(
                                    (ENV_RUNNER_RESULTS, NUM_ENV_STEPS_SAMPLED_LIFETIME)
                                )
                            ),
                            NUM_AGENT_STEPS_SAMPLED_LIFETIME: (
                                self.metrics.peek(
                                    (
                                        ENV_RUNNER_RESULTS,
                                        NUM_AGENT_STEPS_SAMPLED_LIFETIME,
                                    )
                                )
                            ),
                        },
                    )
                    # Isolate TD-errors from result dicts (we should not log these to
                    # disk or WandB, they might be very large).
                    td_errors = defaultdict(list)
                    for res in learner_results:
                        for module_id, module_results in res.items():
                            if TD_ERROR_KEY in module_results:
                                td_errors[module_id].extend(
                                    convert_to_numpy(
                                        module_results.pop(TD_ERROR_KEY).peek()
                                    )
                                )
                    # Concatenate per-module TD-error chunks into one array each.
                    td_errors = {
                        module_id: {TD_ERROR_KEY: np.concatenate(s, axis=0)}
                        for module_id, s in td_errors.items()
                    }
                    self.metrics.merge_and_log_n_dicts(
                        learner_results, key=LEARNER_RESULTS
                    )

                # Update replay buffer priorities.
                with self.metrics.log_time((TIMERS, REPLAY_BUFFER_UPDATE_PRIOS_TIMER)):
                    update_priorities_in_episode_replay_buffer(
                        replay_buffer=self.local_replay_buffer,
                        td_errors=td_errors,
                    )

            # Update weights and global_vars - after learning on the local worker -
            # on all remote workers.
            with self.metrics.log_time((TIMERS, SYNCH_WORKER_WEIGHTS_TIMER)):
                modules_to_update = set(learner_results[0].keys()) - {ALL_MODULES}
                # NOTE: the new API stack does not use global vars.
                self.env_runner_group.sync_weights(
                    from_worker_or_learner_group=self.learner_group,
                    policies=modules_to_update,
                    global_vars=None,
                    inference_only=True,
                )

    def _training_step_old_api_stack(self) -> ResultDict:
        """Training step for the old API stack.

        More specifically this training step relies on `RolloutWorker`.
        """
        train_results = {}

        # We alternate between storing new samples and sampling and training
        store_weight, sample_and_train_weight = calculate_rr_weights(self.config)

        for _ in range(store_weight):
            # Sample (MultiAgentBatch) from workers.
            with self._timers[SAMPLE_TIMER]:
                new_sample_batch: SampleBatchType = synchronous_parallel_sample(
                    worker_set=self.env_runner_group,
                    concat=True,
                    sample_timeout_s=self.config.sample_timeout_s,
                )

            # Return early if all our workers failed.
            if not new_sample_batch:
                return {}

            # Update counters
            self._counters[NUM_AGENT_STEPS_SAMPLED] += new_sample_batch.agent_steps()
            self._counters[NUM_ENV_STEPS_SAMPLED] += new_sample_batch.env_steps()

            # Store new samples in replay buffer.
            self.local_replay_buffer.add(new_sample_batch)

        global_vars = {
            "timestep": self._counters[NUM_ENV_STEPS_SAMPLED],
        }

        # Update target network every `target_network_update_freq` sample steps.
        cur_ts = self._counters[
            (
                NUM_AGENT_STEPS_SAMPLED
                if self.config.count_steps_by == "agent_steps"
                else NUM_ENV_STEPS_SAMPLED
            )
        ]

        if cur_ts > self.config.num_steps_sampled_before_learning_starts:
            for _ in range(sample_and_train_weight):
                # Sample training batch (MultiAgentBatch) from replay buffer.
                train_batch = sample_min_n_steps_from_buffer(
                    self.local_replay_buffer,
                    self.config.total_train_batch_size,
                    count_by_agent_steps=self.config.count_steps_by == "agent_steps",
                )

                # Postprocess batch before we learn on it
                post_fn = self.config.get("before_learn_on_batch") or (lambda b, *a: b)
                train_batch = post_fn(train_batch, self.env_runner_group, self.config)

                # Learn on training batch.
                # Use simple optimizer (only for multi-agent or tf-eager; all other
                # cases should use the multi-GPU optimizer, even if only using 1 GPU)
                if self.config.get("simple_optimizer") is True:
                    train_results = train_one_step(self, train_batch)
                else:
                    train_results = multi_gpu_train_one_step(self, train_batch)

                # Update replay buffer priorities.
                update_priorities_in_replay_buffer(
                    self.local_replay_buffer,
                    self.config,
                    train_batch,
                    train_results,
                )

                # Target update: only when enough steps have passed since the
                # last update.
                last_update = self._counters[LAST_TARGET_UPDATE_TS]
                if cur_ts - last_update >= self.config.target_network_update_freq:
                    to_update = self.env_runner.get_policies_to_train()
                    self.env_runner.foreach_policy_to_train(
                        lambda p, pid, to_update=to_update: (
                            pid in to_update and p.update_target()
                        )
                    )
                    self._counters[NUM_TARGET_UPDATES] += 1
                    self._counters[LAST_TARGET_UPDATE_TS] = cur_ts

                # Update weights and global_vars - after learning on the local worker -
                # on all remote workers.
                with self._timers[SYNCH_WORKER_WEIGHTS_TIMER]:
                    self.env_runner_group.sync_weights(global_vars=global_vars)

        # Return all collected metrics for the iteration.
        return train_results
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dqn/dqn_catalog.py
ADDED
|
@@ -0,0 +1,179 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gymnasium as gym
|
| 2 |
+
|
| 3 |
+
from ray.rllib.core.models.catalog import Catalog
|
| 4 |
+
from ray.rllib.core.models.base import Model
|
| 5 |
+
from ray.rllib.core.models.configs import MLPHeadConfig
|
| 6 |
+
from ray.rllib.models.torch.torch_distributions import TorchCategorical
|
| 7 |
+
from ray.rllib.utils.annotations import (
|
| 8 |
+
ExperimentalAPI,
|
| 9 |
+
override,
|
| 10 |
+
OverrideToImplementCustomLogic,
|
| 11 |
+
)
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
@ExperimentalAPI
|
| 15 |
+
class DQNCatalog(Catalog):
|
| 16 |
+
"""The catalog class used to build models for DQN Rainbow.
|
| 17 |
+
|
| 18 |
+
`DQNCatalog` provides the following models:
|
| 19 |
+
- Encoder: The encoder used to encode the observations.
|
| 20 |
+
- Target_Encoder: The encoder used to encode the observations
|
| 21 |
+
for the target network.
|
| 22 |
+
- Af Head: Either the head of the advantage stream, if a dueling
|
| 23 |
+
architecture is used or the head of the Q-function. This is
|
| 24 |
+
a multi-node head with `action_space.n` many nodes in case
|
| 25 |
+
of expectation learning and `action_space.n` times the number
|
| 26 |
+
of atoms (`num_atoms`) in case of distributional Q-learning.
|
| 27 |
+
- Vf Head (optional): The head of the value function in case a
|
| 28 |
+
dueling architecture is chosen. This is a single node head.
|
| 29 |
+
If no dueling architecture is used, this head does not exist.
|
| 30 |
+
|
| 31 |
+
Any custom head can be built by overriding the `build_af_head()` and
|
| 32 |
+
`build_vf_head()`. Alternatively, the `AfHeadConfig` or `VfHeadConfig`
|
| 33 |
+
can be overridden to build custom logic during `RLModule` runtime.
|
| 34 |
+
|
| 35 |
+
All heads can optionally use distributional learning. In this case the
|
| 36 |
+
number of output neurons corresponds to the number of actions times the
|
| 37 |
+
number of support atoms of the discrete distribution.
|
| 38 |
+
|
| 39 |
+
Any module built for exploration or inference is built with the flag
|
| 40 |
+
`inference_only=True` and does not contain any target networks. This flag can
|
| 41 |
+
be set in a `SingleAgentModuleSpec` through the `inference_only` boolean flag.
|
| 42 |
+
"""
|
| 43 |
+
|
| 44 |
+
    @override(Catalog)
    def __init__(
        self,
        observation_space: gym.Space,
        action_space: gym.Space,
        model_config_dict: dict,
        view_requirements: dict = None,
    ):
        """Initializes the DQNCatalog.

        Args:
            observation_space: The observation space of the Encoder.
            action_space: The action space for the Af Head.
            model_config_dict: The model config to use.
            view_requirements: Deprecated; must be None. Use the new
                ConnectorV2 API instead of view requirements.
        """
        assert view_requirements is None, (
            "Instead, use the new ConnectorV2 API to pick whatever information "
            "you need from the running episodes"
        )

        super().__init__(
            observation_space=observation_space,
            action_space=action_space,
            model_config_dict=model_config_dict,
        )

        # The number of atoms to be used for distributional Q-learning.
        # NOTE(review): annotation corrected from `bool` to `int` - this holds
        # the atom count from the model config.
        self.num_atoms: int = self._model_config_dict["num_atoms"]

        # Advantage and value streams have MLP heads. Note, the advantage
        # stream will have an output dimension that is the product of the
        # action space dimension and the number of atoms to approximate the
        # return distribution in distributional reinforcement learning.
        self.af_head_config = self._get_head_config(
            output_layer_dim=int(self.action_space.n * self.num_atoms)
        )
        # The value head emits a single scalar (used only with dueling nets).
        self.vf_head_config = self._get_head_config(output_layer_dim=1)
|
| 81 |
+
|
| 82 |
+
@OverrideToImplementCustomLogic
|
| 83 |
+
def build_af_head(self, framework: str) -> Model:
|
| 84 |
+
"""Build the A/Q-function head.
|
| 85 |
+
|
| 86 |
+
Note, if no dueling architecture is chosen, this will
|
| 87 |
+
be the Q-function head.
|
| 88 |
+
|
| 89 |
+
The default behavior is to build the head from the `af_head_config`.
|
| 90 |
+
This can be overridden to build a custom policy head as a means to
|
| 91 |
+
configure the behavior of a `DQNRLModule` implementation.
|
| 92 |
+
|
| 93 |
+
Args:
|
| 94 |
+
framework: The framework to use. Either "torch" or "tf2".
|
| 95 |
+
|
| 96 |
+
Returns:
|
| 97 |
+
The advantage head in case a dueling architecutre is chosen or
|
| 98 |
+
the Q-function head in the other case.
|
| 99 |
+
"""
|
| 100 |
+
return self.af_head_config.build(framework=framework)
|
| 101 |
+
|
| 102 |
+
@OverrideToImplementCustomLogic
|
| 103 |
+
def build_vf_head(self, framework: str) -> Model:
|
| 104 |
+
"""Build the value function head.
|
| 105 |
+
|
| 106 |
+
Note, this function is only called in case of a dueling architecture.
|
| 107 |
+
|
| 108 |
+
The default behavior is to build the head from the `vf_head_config`.
|
| 109 |
+
This can be overridden to build a custom policy head as a means to
|
| 110 |
+
configure the behavior of a `DQNRLModule` implementation.
|
| 111 |
+
|
| 112 |
+
Args:
|
| 113 |
+
framework: The framework to use. Either "torch" or "tf2".
|
| 114 |
+
|
| 115 |
+
Returns:
|
| 116 |
+
The value function head.
|
| 117 |
+
"""
|
| 118 |
+
|
| 119 |
+
return self.vf_head_config.build(framework=framework)
|
| 120 |
+
|
| 121 |
+
@override(Catalog)
|
| 122 |
+
def get_action_dist_cls(self, framework: str) -> "TorchCategorical":
|
| 123 |
+
# We only implement DQN Rainbow for Torch.
|
| 124 |
+
if framework != "torch":
|
| 125 |
+
raise ValueError("DQN Rainbow is only supported for framework `torch`.")
|
| 126 |
+
else:
|
| 127 |
+
return TorchCategorical
|
| 128 |
+
|
| 129 |
+
def _get_head_config(self, output_layer_dim: int):
|
| 130 |
+
"""Returns a head config.
|
| 131 |
+
|
| 132 |
+
Args:
|
| 133 |
+
output_layer_dim: Integer defining the output layer dimension.
|
| 134 |
+
This is 1 for the Vf-head and `action_space.n * num_atoms`
|
| 135 |
+
for the Af(Qf)-head.
|
| 136 |
+
|
| 137 |
+
Returns:
|
| 138 |
+
A `MLPHeadConfig`.
|
| 139 |
+
"""
|
| 140 |
+
# Return the appropriate config.
|
| 141 |
+
return MLPHeadConfig(
|
| 142 |
+
input_dims=self.latent_dims,
|
| 143 |
+
hidden_layer_dims=self._model_config_dict["head_fcnet_hiddens"],
|
| 144 |
+
# Note, `"post_fcnet_activation"` is `"relu"` by definition.
|
| 145 |
+
hidden_layer_activation=self._model_config_dict["head_fcnet_activation"],
|
| 146 |
+
# TODO (simon): Not yet available.
|
| 147 |
+
# hidden_layer_use_layernorm=self._model_config_dict[
|
| 148 |
+
# "hidden_layer_use_layernorm"
|
| 149 |
+
# ],
|
| 150 |
+
# hidden_layer_use_bias=self._model_config_dict["hidden_layer_use_bias"],
|
| 151 |
+
hidden_layer_weights_initializer=self._model_config_dict[
|
| 152 |
+
"head_fcnet_kernel_initializer"
|
| 153 |
+
],
|
| 154 |
+
hidden_layer_weights_initializer_config=self._model_config_dict[
|
| 155 |
+
"head_fcnet_kernel_initializer_kwargs"
|
| 156 |
+
],
|
| 157 |
+
hidden_layer_bias_initializer=self._model_config_dict[
|
| 158 |
+
"head_fcnet_bias_initializer"
|
| 159 |
+
],
|
| 160 |
+
hidden_layer_bias_initializer_config=self._model_config_dict[
|
| 161 |
+
"head_fcnet_bias_initializer_kwargs"
|
| 162 |
+
],
|
| 163 |
+
output_layer_activation="linear",
|
| 164 |
+
output_layer_dim=output_layer_dim,
|
| 165 |
+
# TODO (simon): Not yet available.
|
| 166 |
+
# output_layer_use_bias=self._model_config_dict["output_layer_use_bias"],
|
| 167 |
+
output_layer_weights_initializer=self._model_config_dict[
|
| 168 |
+
"head_fcnet_kernel_initializer"
|
| 169 |
+
],
|
| 170 |
+
output_layer_weights_initializer_config=self._model_config_dict[
|
| 171 |
+
"head_fcnet_kernel_initializer_kwargs"
|
| 172 |
+
],
|
| 173 |
+
output_layer_bias_initializer=self._model_config_dict[
|
| 174 |
+
"head_fcnet_bias_initializer"
|
| 175 |
+
],
|
| 176 |
+
output_layer_bias_initializer_config=self._model_config_dict[
|
| 177 |
+
"head_fcnet_bias_initializer_kwargs"
|
| 178 |
+
],
|
| 179 |
+
)
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dqn/dqn_learner.py
ADDED
|
@@ -0,0 +1,120 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Any, Dict, Optional
|
| 2 |
+
|
| 3 |
+
from ray.rllib.connectors.common.add_observations_from_episodes_to_batch import (
|
| 4 |
+
AddObservationsFromEpisodesToBatch,
|
| 5 |
+
)
|
| 6 |
+
from ray.rllib.connectors.learner.add_next_observations_from_episodes_to_train_batch import ( # noqa
|
| 7 |
+
AddNextObservationsFromEpisodesToTrainBatch,
|
| 8 |
+
)
|
| 9 |
+
from ray.rllib.core.learner.learner import Learner
|
| 10 |
+
from ray.rllib.core.learner.utils import update_target_network
|
| 11 |
+
from ray.rllib.core.rl_module.apis import QNetAPI, TargetNetworkAPI
|
| 12 |
+
from ray.rllib.core.rl_module.multi_rl_module import MultiRLModuleSpec
|
| 13 |
+
from ray.rllib.core.rl_module.rl_module import RLModuleSpec
|
| 14 |
+
from ray.rllib.utils.annotations import (
|
| 15 |
+
override,
|
| 16 |
+
OverrideToImplementCustomLogic_CallToSuperRecommended,
|
| 17 |
+
)
|
| 18 |
+
from ray.rllib.utils.metrics import (
|
| 19 |
+
LAST_TARGET_UPDATE_TS,
|
| 20 |
+
NUM_ENV_STEPS_SAMPLED_LIFETIME,
|
| 21 |
+
NUM_TARGET_UPDATES,
|
| 22 |
+
)
|
| 23 |
+
from ray.rllib.utils.typing import ModuleID, ShouldModuleBeUpdatedFn
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
# Now, this is double defined: In `SACRLModule` and here. I would keep it here
|
| 27 |
+
# or push it into the `Learner` as these are recurring keys in RL.
|
| 28 |
+
ATOMS = "atoms"
|
| 29 |
+
QF_LOSS_KEY = "qf_loss"
|
| 30 |
+
QF_LOGITS = "qf_logits"
|
| 31 |
+
QF_MEAN_KEY = "qf_mean"
|
| 32 |
+
QF_MAX_KEY = "qf_max"
|
| 33 |
+
QF_MIN_KEY = "qf_min"
|
| 34 |
+
QF_NEXT_PREDS = "qf_next_preds"
|
| 35 |
+
QF_TARGET_NEXT_PREDS = "qf_target_next_preds"
|
| 36 |
+
QF_TARGET_NEXT_PROBS = "qf_target_next_probs"
|
| 37 |
+
QF_PREDS = "qf_preds"
|
| 38 |
+
QF_PROBS = "qf_probs"
|
| 39 |
+
TD_ERROR_MEAN_KEY = "td_error_mean"
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
class DQNLearner(Learner):
|
| 43 |
+
@OverrideToImplementCustomLogic_CallToSuperRecommended
|
| 44 |
+
@override(Learner)
|
| 45 |
+
def build(self) -> None:
|
| 46 |
+
super().build()
|
| 47 |
+
|
| 48 |
+
# Make target networks.
|
| 49 |
+
self.module.foreach_module(
|
| 50 |
+
lambda mid, mod: (
|
| 51 |
+
mod.make_target_networks()
|
| 52 |
+
if isinstance(mod, TargetNetworkAPI)
|
| 53 |
+
else None
|
| 54 |
+
)
|
| 55 |
+
)
|
| 56 |
+
|
| 57 |
+
# Prepend the "add-NEXT_OBS-from-episodes-to-train-batch" connector piece (right
|
| 58 |
+
# after the corresponding "add-OBS-..." default piece).
|
| 59 |
+
self._learner_connector.insert_after(
|
| 60 |
+
AddObservationsFromEpisodesToBatch,
|
| 61 |
+
AddNextObservationsFromEpisodesToTrainBatch(),
|
| 62 |
+
)
|
| 63 |
+
|
| 64 |
+
@override(Learner)
|
| 65 |
+
def add_module(
|
| 66 |
+
self,
|
| 67 |
+
*,
|
| 68 |
+
module_id: ModuleID,
|
| 69 |
+
module_spec: RLModuleSpec,
|
| 70 |
+
config_overrides: Optional[Dict] = None,
|
| 71 |
+
new_should_module_be_updated: Optional[ShouldModuleBeUpdatedFn] = None,
|
| 72 |
+
) -> MultiRLModuleSpec:
|
| 73 |
+
marl_spec = super().add_module(
|
| 74 |
+
module_id=module_id,
|
| 75 |
+
module_spec=module_spec,
|
| 76 |
+
config_overrides=config_overrides,
|
| 77 |
+
new_should_module_be_updated=new_should_module_be_updated,
|
| 78 |
+
)
|
| 79 |
+
# Create target networks for added Module, if applicable.
|
| 80 |
+
if isinstance(self.module[module_id].unwrapped(), TargetNetworkAPI):
|
| 81 |
+
self.module[module_id].unwrapped().make_target_networks()
|
| 82 |
+
return marl_spec
|
| 83 |
+
|
| 84 |
+
@override(Learner)
|
| 85 |
+
def after_gradient_based_update(self, *, timesteps: Dict[str, Any]) -> None:
|
| 86 |
+
"""Updates the target Q Networks."""
|
| 87 |
+
super().after_gradient_based_update(timesteps=timesteps)
|
| 88 |
+
|
| 89 |
+
timestep = timesteps.get(NUM_ENV_STEPS_SAMPLED_LIFETIME, 0)
|
| 90 |
+
|
| 91 |
+
# TODO (sven): Maybe we should have a `after_gradient_based_update`
|
| 92 |
+
# method per module?
|
| 93 |
+
for module_id, module in self.module._rl_modules.items():
|
| 94 |
+
config = self.config.get_config_for_module(module_id)
|
| 95 |
+
last_update_ts_key = (module_id, LAST_TARGET_UPDATE_TS)
|
| 96 |
+
if timestep - self.metrics.peek(
|
| 97 |
+
last_update_ts_key, default=0
|
| 98 |
+
) >= config.target_network_update_freq and isinstance(
|
| 99 |
+
module.unwrapped(), TargetNetworkAPI
|
| 100 |
+
):
|
| 101 |
+
for (
|
| 102 |
+
main_net,
|
| 103 |
+
target_net,
|
| 104 |
+
) in module.unwrapped().get_target_network_pairs():
|
| 105 |
+
update_target_network(
|
| 106 |
+
main_net=main_net,
|
| 107 |
+
target_net=target_net,
|
| 108 |
+
tau=config.tau,
|
| 109 |
+
)
|
| 110 |
+
# Increase lifetime target network update counter by one.
|
| 111 |
+
self.metrics.log_value((module_id, NUM_TARGET_UPDATES), 1, reduce="sum")
|
| 112 |
+
# Update the (single-value -> window=1) last updated timestep metric.
|
| 113 |
+
self.metrics.log_value(last_update_ts_key, timestep, window=1)
|
| 114 |
+
|
| 115 |
+
@classmethod
|
| 116 |
+
@override(Learner)
|
| 117 |
+
def rl_module_required_apis(cls) -> list[type]:
|
| 118 |
+
# In order for a PPOLearner to update an RLModule, it must implement the
|
| 119 |
+
# following APIs:
|
| 120 |
+
return [QNetAPI, TargetNetworkAPI]
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dqn/dqn_tf_policy.py
ADDED
|
@@ -0,0 +1,511 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Dict
|
| 2 |
+
|
| 3 |
+
import gymnasium as gym
|
| 4 |
+
import numpy as np
|
| 5 |
+
|
| 6 |
+
import ray
|
| 7 |
+
from ray.rllib.algorithms.dqn.distributional_q_tf_model import DistributionalQTFModel
|
| 8 |
+
from ray.rllib.evaluation.postprocessing import adjust_nstep
|
| 9 |
+
from ray.rllib.models import ModelCatalog
|
| 10 |
+
from ray.rllib.models.modelv2 import ModelV2
|
| 11 |
+
from ray.rllib.models.tf.tf_action_dist import get_categorical_class_with_temperature
|
| 12 |
+
from ray.rllib.policy.policy import Policy
|
| 13 |
+
from ray.rllib.policy.sample_batch import SampleBatch
|
| 14 |
+
from ray.rllib.policy.tf_mixins import LearningRateSchedule, TargetNetworkMixin
|
| 15 |
+
from ray.rllib.policy.tf_policy_template import build_tf_policy
|
| 16 |
+
from ray.rllib.utils.annotations import OldAPIStack
|
| 17 |
+
from ray.rllib.utils.error import UnsupportedSpaceException
|
| 18 |
+
from ray.rllib.utils.exploration import ParameterNoise
|
| 19 |
+
from ray.rllib.utils.framework import try_import_tf
|
| 20 |
+
from ray.rllib.utils.numpy import convert_to_numpy
|
| 21 |
+
from ray.rllib.utils.tf_utils import (
|
| 22 |
+
huber_loss,
|
| 23 |
+
l2_loss,
|
| 24 |
+
make_tf_callable,
|
| 25 |
+
minimize_and_clip,
|
| 26 |
+
reduce_mean_ignore_inf,
|
| 27 |
+
)
|
| 28 |
+
from ray.rllib.utils.typing import AlgorithmConfigDict, ModelGradients, TensorType
|
| 29 |
+
|
| 30 |
+
tf1, tf, tfv = try_import_tf()
|
| 31 |
+
|
| 32 |
+
# Importance sampling weights for prioritized replay
|
| 33 |
+
PRIO_WEIGHTS = "weights"
|
| 34 |
+
Q_SCOPE = "q_func"
|
| 35 |
+
Q_TARGET_SCOPE = "target_q_func"
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
@OldAPIStack
|
| 39 |
+
class QLoss:
|
| 40 |
+
def __init__(
|
| 41 |
+
self,
|
| 42 |
+
q_t_selected: TensorType,
|
| 43 |
+
q_logits_t_selected: TensorType,
|
| 44 |
+
q_tp1_best: TensorType,
|
| 45 |
+
q_dist_tp1_best: TensorType,
|
| 46 |
+
importance_weights: TensorType,
|
| 47 |
+
rewards: TensorType,
|
| 48 |
+
done_mask: TensorType,
|
| 49 |
+
gamma: float = 0.99,
|
| 50 |
+
n_step: int = 1,
|
| 51 |
+
num_atoms: int = 1,
|
| 52 |
+
v_min: float = -10.0,
|
| 53 |
+
v_max: float = 10.0,
|
| 54 |
+
loss_fn=huber_loss,
|
| 55 |
+
):
|
| 56 |
+
|
| 57 |
+
if num_atoms > 1:
|
| 58 |
+
# Distributional Q-learning which corresponds to an entropy loss
|
| 59 |
+
|
| 60 |
+
z = tf.range(num_atoms, dtype=tf.float32)
|
| 61 |
+
z = v_min + z * (v_max - v_min) / float(num_atoms - 1)
|
| 62 |
+
|
| 63 |
+
# (batch_size, 1) * (1, num_atoms) = (batch_size, num_atoms)
|
| 64 |
+
r_tau = tf.expand_dims(rewards, -1) + gamma**n_step * tf.expand_dims(
|
| 65 |
+
1.0 - done_mask, -1
|
| 66 |
+
) * tf.expand_dims(z, 0)
|
| 67 |
+
r_tau = tf.clip_by_value(r_tau, v_min, v_max)
|
| 68 |
+
b = (r_tau - v_min) / ((v_max - v_min) / float(num_atoms - 1))
|
| 69 |
+
lb = tf.floor(b)
|
| 70 |
+
ub = tf.math.ceil(b)
|
| 71 |
+
# indispensable judgement which is missed in most implementations
|
| 72 |
+
# when b happens to be an integer, lb == ub, so pr_j(s', a*) will
|
| 73 |
+
# be discarded because (ub-b) == (b-lb) == 0
|
| 74 |
+
floor_equal_ceil = tf.cast(tf.less(ub - lb, 0.5), tf.float32)
|
| 75 |
+
|
| 76 |
+
l_project = tf.one_hot(
|
| 77 |
+
tf.cast(lb, dtype=tf.int32), num_atoms
|
| 78 |
+
) # (batch_size, num_atoms, num_atoms)
|
| 79 |
+
u_project = tf.one_hot(
|
| 80 |
+
tf.cast(ub, dtype=tf.int32), num_atoms
|
| 81 |
+
) # (batch_size, num_atoms, num_atoms)
|
| 82 |
+
ml_delta = q_dist_tp1_best * (ub - b + floor_equal_ceil)
|
| 83 |
+
mu_delta = q_dist_tp1_best * (b - lb)
|
| 84 |
+
ml_delta = tf.reduce_sum(l_project * tf.expand_dims(ml_delta, -1), axis=1)
|
| 85 |
+
mu_delta = tf.reduce_sum(u_project * tf.expand_dims(mu_delta, -1), axis=1)
|
| 86 |
+
m = ml_delta + mu_delta
|
| 87 |
+
|
| 88 |
+
# Rainbow paper claims that using this cross entropy loss for
|
| 89 |
+
# priority is robust and insensitive to `prioritized_replay_alpha`
|
| 90 |
+
self.td_error = tf.nn.softmax_cross_entropy_with_logits(
|
| 91 |
+
labels=m, logits=q_logits_t_selected
|
| 92 |
+
)
|
| 93 |
+
self.loss = tf.reduce_mean(
|
| 94 |
+
self.td_error * tf.cast(importance_weights, tf.float32)
|
| 95 |
+
)
|
| 96 |
+
self.stats = {
|
| 97 |
+
# TODO: better Q stats for dist dqn
|
| 98 |
+
"mean_td_error": tf.reduce_mean(self.td_error),
|
| 99 |
+
}
|
| 100 |
+
else:
|
| 101 |
+
q_tp1_best_masked = (1.0 - done_mask) * q_tp1_best
|
| 102 |
+
|
| 103 |
+
# compute RHS of bellman equation
|
| 104 |
+
q_t_selected_target = rewards + gamma**n_step * q_tp1_best_masked
|
| 105 |
+
|
| 106 |
+
# compute the error (potentially clipped)
|
| 107 |
+
self.td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
|
| 108 |
+
self.loss = tf.reduce_mean(
|
| 109 |
+
tf.cast(importance_weights, tf.float32) * loss_fn(self.td_error)
|
| 110 |
+
)
|
| 111 |
+
self.stats = {
|
| 112 |
+
"mean_q": tf.reduce_mean(q_t_selected),
|
| 113 |
+
"min_q": tf.reduce_min(q_t_selected),
|
| 114 |
+
"max_q": tf.reduce_max(q_t_selected),
|
| 115 |
+
"mean_td_error": tf.reduce_mean(self.td_error),
|
| 116 |
+
}
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
@OldAPIStack
|
| 120 |
+
class ComputeTDErrorMixin:
|
| 121 |
+
"""Assign the `compute_td_error` method to the DQNTFPolicy
|
| 122 |
+
|
| 123 |
+
This allows us to prioritize on the worker side.
|
| 124 |
+
"""
|
| 125 |
+
|
| 126 |
+
def __init__(self):
|
| 127 |
+
@make_tf_callable(self.get_session(), dynamic_shape=True)
|
| 128 |
+
def compute_td_error(
|
| 129 |
+
obs_t, act_t, rew_t, obs_tp1, terminateds_mask, importance_weights
|
| 130 |
+
):
|
| 131 |
+
# Do forward pass on loss to update td error attribute
|
| 132 |
+
build_q_losses(
|
| 133 |
+
self,
|
| 134 |
+
self.model,
|
| 135 |
+
None,
|
| 136 |
+
{
|
| 137 |
+
SampleBatch.CUR_OBS: tf.convert_to_tensor(obs_t),
|
| 138 |
+
SampleBatch.ACTIONS: tf.convert_to_tensor(act_t),
|
| 139 |
+
SampleBatch.REWARDS: tf.convert_to_tensor(rew_t),
|
| 140 |
+
SampleBatch.NEXT_OBS: tf.convert_to_tensor(obs_tp1),
|
| 141 |
+
SampleBatch.TERMINATEDS: tf.convert_to_tensor(terminateds_mask),
|
| 142 |
+
PRIO_WEIGHTS: tf.convert_to_tensor(importance_weights),
|
| 143 |
+
},
|
| 144 |
+
)
|
| 145 |
+
|
| 146 |
+
return self.q_loss.td_error
|
| 147 |
+
|
| 148 |
+
self.compute_td_error = compute_td_error
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
@OldAPIStack
|
| 152 |
+
def build_q_model(
|
| 153 |
+
policy: Policy,
|
| 154 |
+
obs_space: gym.spaces.Space,
|
| 155 |
+
action_space: gym.spaces.Space,
|
| 156 |
+
config: AlgorithmConfigDict,
|
| 157 |
+
) -> ModelV2:
|
| 158 |
+
"""Build q_model and target_model for DQN
|
| 159 |
+
|
| 160 |
+
Args:
|
| 161 |
+
policy: The Policy, which will use the model for optimization.
|
| 162 |
+
obs_space (gym.spaces.Space): The policy's observation space.
|
| 163 |
+
action_space (gym.spaces.Space): The policy's action space.
|
| 164 |
+
config (AlgorithmConfigDict):
|
| 165 |
+
|
| 166 |
+
Returns:
|
| 167 |
+
ModelV2: The Model for the Policy to use.
|
| 168 |
+
Note: The target q model will not be returned, just assigned to
|
| 169 |
+
`policy.target_model`.
|
| 170 |
+
"""
|
| 171 |
+
if not isinstance(action_space, gym.spaces.Discrete):
|
| 172 |
+
raise UnsupportedSpaceException(
|
| 173 |
+
"Action space {} is not supported for DQN.".format(action_space)
|
| 174 |
+
)
|
| 175 |
+
|
| 176 |
+
if config["hiddens"]:
|
| 177 |
+
# try to infer the last layer size, otherwise fall back to 256
|
| 178 |
+
num_outputs = ([256] + list(config["model"]["fcnet_hiddens"]))[-1]
|
| 179 |
+
config["model"]["no_final_linear"] = True
|
| 180 |
+
else:
|
| 181 |
+
num_outputs = action_space.n
|
| 182 |
+
|
| 183 |
+
q_model = ModelCatalog.get_model_v2(
|
| 184 |
+
obs_space=obs_space,
|
| 185 |
+
action_space=action_space,
|
| 186 |
+
num_outputs=num_outputs,
|
| 187 |
+
model_config=config["model"],
|
| 188 |
+
framework="tf",
|
| 189 |
+
model_interface=DistributionalQTFModel,
|
| 190 |
+
name=Q_SCOPE,
|
| 191 |
+
num_atoms=config["num_atoms"],
|
| 192 |
+
dueling=config["dueling"],
|
| 193 |
+
q_hiddens=config["hiddens"],
|
| 194 |
+
use_noisy=config["noisy"],
|
| 195 |
+
v_min=config["v_min"],
|
| 196 |
+
v_max=config["v_max"],
|
| 197 |
+
sigma0=config["sigma0"],
|
| 198 |
+
# TODO(sven): Move option to add LayerNorm after each Dense
|
| 199 |
+
# generically into ModelCatalog.
|
| 200 |
+
add_layer_norm=isinstance(getattr(policy, "exploration", None), ParameterNoise)
|
| 201 |
+
or config["exploration_config"]["type"] == "ParameterNoise",
|
| 202 |
+
)
|
| 203 |
+
|
| 204 |
+
policy.target_model = ModelCatalog.get_model_v2(
|
| 205 |
+
obs_space=obs_space,
|
| 206 |
+
action_space=action_space,
|
| 207 |
+
num_outputs=num_outputs,
|
| 208 |
+
model_config=config["model"],
|
| 209 |
+
framework="tf",
|
| 210 |
+
model_interface=DistributionalQTFModel,
|
| 211 |
+
name=Q_TARGET_SCOPE,
|
| 212 |
+
num_atoms=config["num_atoms"],
|
| 213 |
+
dueling=config["dueling"],
|
| 214 |
+
q_hiddens=config["hiddens"],
|
| 215 |
+
use_noisy=config["noisy"],
|
| 216 |
+
v_min=config["v_min"],
|
| 217 |
+
v_max=config["v_max"],
|
| 218 |
+
sigma0=config["sigma0"],
|
| 219 |
+
# TODO(sven): Move option to add LayerNorm after each Dense
|
| 220 |
+
# generically into ModelCatalog.
|
| 221 |
+
add_layer_norm=isinstance(getattr(policy, "exploration", None), ParameterNoise)
|
| 222 |
+
or config["exploration_config"]["type"] == "ParameterNoise",
|
| 223 |
+
)
|
| 224 |
+
|
| 225 |
+
return q_model
|
| 226 |
+
|
| 227 |
+
|
| 228 |
+
@OldAPIStack
|
| 229 |
+
def get_distribution_inputs_and_class(
|
| 230 |
+
policy: Policy, model: ModelV2, input_dict: SampleBatch, *, explore=True, **kwargs
|
| 231 |
+
):
|
| 232 |
+
q_vals = compute_q_values(
|
| 233 |
+
policy, model, input_dict, state_batches=None, explore=explore
|
| 234 |
+
)
|
| 235 |
+
q_vals = q_vals[0] if isinstance(q_vals, tuple) else q_vals
|
| 236 |
+
|
| 237 |
+
policy.q_values = q_vals
|
| 238 |
+
|
| 239 |
+
# Return a Torch TorchCategorical distribution where the temperature
|
| 240 |
+
# parameter is partially binded to the configured value.
|
| 241 |
+
temperature = policy.config["categorical_distribution_temperature"]
|
| 242 |
+
|
| 243 |
+
return (
|
| 244 |
+
policy.q_values,
|
| 245 |
+
get_categorical_class_with_temperature(temperature),
|
| 246 |
+
[],
|
| 247 |
+
) # state-out
|
| 248 |
+
|
| 249 |
+
|
| 250 |
+
@OldAPIStack
|
| 251 |
+
def build_q_losses(policy: Policy, model, _, train_batch: SampleBatch) -> TensorType:
|
| 252 |
+
"""Constructs the loss for DQNTFPolicy.
|
| 253 |
+
|
| 254 |
+
Args:
|
| 255 |
+
policy: The Policy to calculate the loss for.
|
| 256 |
+
model (ModelV2): The Model to calculate the loss for.
|
| 257 |
+
train_batch: The training data.
|
| 258 |
+
|
| 259 |
+
Returns:
|
| 260 |
+
TensorType: A single loss tensor.
|
| 261 |
+
"""
|
| 262 |
+
config = policy.config
|
| 263 |
+
# q network evaluation
|
| 264 |
+
q_t, q_logits_t, q_dist_t, _ = compute_q_values(
|
| 265 |
+
policy,
|
| 266 |
+
model,
|
| 267 |
+
SampleBatch({"obs": train_batch[SampleBatch.CUR_OBS]}),
|
| 268 |
+
state_batches=None,
|
| 269 |
+
explore=False,
|
| 270 |
+
)
|
| 271 |
+
|
| 272 |
+
# target q network evalution
|
| 273 |
+
q_tp1, q_logits_tp1, q_dist_tp1, _ = compute_q_values(
|
| 274 |
+
policy,
|
| 275 |
+
policy.target_model,
|
| 276 |
+
SampleBatch({"obs": train_batch[SampleBatch.NEXT_OBS]}),
|
| 277 |
+
state_batches=None,
|
| 278 |
+
explore=False,
|
| 279 |
+
)
|
| 280 |
+
if not hasattr(policy, "target_q_func_vars"):
|
| 281 |
+
policy.target_q_func_vars = policy.target_model.variables()
|
| 282 |
+
|
| 283 |
+
# q scores for actions which we know were selected in the given state.
|
| 284 |
+
one_hot_selection = tf.one_hot(
|
| 285 |
+
tf.cast(train_batch[SampleBatch.ACTIONS], tf.int32), policy.action_space.n
|
| 286 |
+
)
|
| 287 |
+
q_t_selected = tf.reduce_sum(q_t * one_hot_selection, 1)
|
| 288 |
+
q_logits_t_selected = tf.reduce_sum(
|
| 289 |
+
q_logits_t * tf.expand_dims(one_hot_selection, -1), 1
|
| 290 |
+
)
|
| 291 |
+
|
| 292 |
+
# compute estimate of best possible value starting from state at t + 1
|
| 293 |
+
if config["double_q"]:
|
| 294 |
+
(
|
| 295 |
+
q_tp1_using_online_net,
|
| 296 |
+
q_logits_tp1_using_online_net,
|
| 297 |
+
q_dist_tp1_using_online_net,
|
| 298 |
+
_,
|
| 299 |
+
) = compute_q_values(
|
| 300 |
+
policy,
|
| 301 |
+
model,
|
| 302 |
+
SampleBatch({"obs": train_batch[SampleBatch.NEXT_OBS]}),
|
| 303 |
+
state_batches=None,
|
| 304 |
+
explore=False,
|
| 305 |
+
)
|
| 306 |
+
q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1)
|
| 307 |
+
q_tp1_best_one_hot_selection = tf.one_hot(
|
| 308 |
+
q_tp1_best_using_online_net, policy.action_space.n
|
| 309 |
+
)
|
| 310 |
+
q_tp1_best = tf.reduce_sum(q_tp1 * q_tp1_best_one_hot_selection, 1)
|
| 311 |
+
q_dist_tp1_best = tf.reduce_sum(
|
| 312 |
+
q_dist_tp1 * tf.expand_dims(q_tp1_best_one_hot_selection, -1), 1
|
| 313 |
+
)
|
| 314 |
+
else:
|
| 315 |
+
q_tp1_best_one_hot_selection = tf.one_hot(
|
| 316 |
+
tf.argmax(q_tp1, 1), policy.action_space.n
|
| 317 |
+
)
|
| 318 |
+
q_tp1_best = tf.reduce_sum(q_tp1 * q_tp1_best_one_hot_selection, 1)
|
| 319 |
+
q_dist_tp1_best = tf.reduce_sum(
|
| 320 |
+
q_dist_tp1 * tf.expand_dims(q_tp1_best_one_hot_selection, -1), 1
|
| 321 |
+
)
|
| 322 |
+
|
| 323 |
+
loss_fn = huber_loss if policy.config["td_error_loss_fn"] == "huber" else l2_loss
|
| 324 |
+
|
| 325 |
+
policy.q_loss = QLoss(
|
| 326 |
+
q_t_selected,
|
| 327 |
+
q_logits_t_selected,
|
| 328 |
+
q_tp1_best,
|
| 329 |
+
q_dist_tp1_best,
|
| 330 |
+
train_batch[PRIO_WEIGHTS],
|
| 331 |
+
tf.cast(train_batch[SampleBatch.REWARDS], tf.float32),
|
| 332 |
+
tf.cast(train_batch[SampleBatch.TERMINATEDS], tf.float32),
|
| 333 |
+
config["gamma"],
|
| 334 |
+
config["n_step"],
|
| 335 |
+
config["num_atoms"],
|
| 336 |
+
config["v_min"],
|
| 337 |
+
config["v_max"],
|
| 338 |
+
loss_fn,
|
| 339 |
+
)
|
| 340 |
+
|
| 341 |
+
return policy.q_loss.loss
|
| 342 |
+
|
| 343 |
+
|
| 344 |
+
@OldAPIStack
|
| 345 |
+
def adam_optimizer(
|
| 346 |
+
policy: Policy, config: AlgorithmConfigDict
|
| 347 |
+
) -> "tf.keras.optimizers.Optimizer":
|
| 348 |
+
if policy.config["framework"] == "tf2":
|
| 349 |
+
return tf.keras.optimizers.Adam(
|
| 350 |
+
learning_rate=policy.cur_lr, epsilon=config["adam_epsilon"]
|
| 351 |
+
)
|
| 352 |
+
else:
|
| 353 |
+
return tf1.train.AdamOptimizer(
|
| 354 |
+
learning_rate=policy.cur_lr, epsilon=config["adam_epsilon"]
|
| 355 |
+
)
|
| 356 |
+
|
| 357 |
+
|
| 358 |
+
@OldAPIStack
|
| 359 |
+
def clip_gradients(
|
| 360 |
+
policy: Policy, optimizer: "tf.keras.optimizers.Optimizer", loss: TensorType
|
| 361 |
+
) -> ModelGradients:
|
| 362 |
+
if not hasattr(policy, "q_func_vars"):
|
| 363 |
+
policy.q_func_vars = policy.model.variables()
|
| 364 |
+
|
| 365 |
+
return minimize_and_clip(
|
| 366 |
+
optimizer,
|
| 367 |
+
loss,
|
| 368 |
+
var_list=policy.q_func_vars,
|
| 369 |
+
clip_val=policy.config["grad_clip"],
|
| 370 |
+
)
|
| 371 |
+
|
| 372 |
+
|
| 373 |
+
@OldAPIStack
|
| 374 |
+
def build_q_stats(policy: Policy, batch) -> Dict[str, TensorType]:
|
| 375 |
+
return dict(
|
| 376 |
+
{
|
| 377 |
+
"cur_lr": tf.cast(policy.cur_lr, tf.float64),
|
| 378 |
+
},
|
| 379 |
+
**policy.q_loss.stats
|
| 380 |
+
)
|
| 381 |
+
|
| 382 |
+
|
| 383 |
+
@OldAPIStack
|
| 384 |
+
def setup_mid_mixins(policy: Policy, obs_space, action_space, config) -> None:
|
| 385 |
+
LearningRateSchedule.__init__(policy, config["lr"], config["lr_schedule"])
|
| 386 |
+
ComputeTDErrorMixin.__init__(policy)
|
| 387 |
+
|
| 388 |
+
|
| 389 |
+
@OldAPIStack
|
| 390 |
+
def setup_late_mixins(
|
| 391 |
+
policy: Policy,
|
| 392 |
+
obs_space: gym.spaces.Space,
|
| 393 |
+
action_space: gym.spaces.Space,
|
| 394 |
+
config: AlgorithmConfigDict,
|
| 395 |
+
) -> None:
|
| 396 |
+
TargetNetworkMixin.__init__(policy)
|
| 397 |
+
|
| 398 |
+
|
| 399 |
+
@OldAPIStack
|
| 400 |
+
def compute_q_values(
|
| 401 |
+
policy: Policy,
|
| 402 |
+
model: ModelV2,
|
| 403 |
+
input_batch: SampleBatch,
|
| 404 |
+
state_batches=None,
|
| 405 |
+
seq_lens=None,
|
| 406 |
+
explore=None,
|
| 407 |
+
is_training: bool = False,
|
| 408 |
+
):
|
| 409 |
+
|
| 410 |
+
config = policy.config
|
| 411 |
+
|
| 412 |
+
model_out, state = model(input_batch, state_batches or [], seq_lens)
|
| 413 |
+
|
| 414 |
+
if config["num_atoms"] > 1:
|
| 415 |
+
(
|
| 416 |
+
action_scores,
|
| 417 |
+
z,
|
| 418 |
+
support_logits_per_action,
|
| 419 |
+
logits,
|
| 420 |
+
dist,
|
| 421 |
+
) = model.get_q_value_distributions(model_out)
|
| 422 |
+
else:
|
| 423 |
+
(action_scores, logits, dist) = model.get_q_value_distributions(model_out)
|
| 424 |
+
|
| 425 |
+
if config["dueling"]:
|
| 426 |
+
state_score = model.get_state_value(model_out)
|
| 427 |
+
if config["num_atoms"] > 1:
|
| 428 |
+
support_logits_per_action_mean = tf.reduce_mean(
|
| 429 |
+
support_logits_per_action, 1
|
| 430 |
+
)
|
| 431 |
+
support_logits_per_action_centered = (
|
| 432 |
+
support_logits_per_action
|
| 433 |
+
- tf.expand_dims(support_logits_per_action_mean, 1)
|
| 434 |
+
)
|
| 435 |
+
support_logits_per_action = (
|
| 436 |
+
tf.expand_dims(state_score, 1) + support_logits_per_action_centered
|
| 437 |
+
)
|
| 438 |
+
support_prob_per_action = tf.nn.softmax(logits=support_logits_per_action)
|
| 439 |
+
value = tf.reduce_sum(input_tensor=z * support_prob_per_action, axis=-1)
|
| 440 |
+
logits = support_logits_per_action
|
| 441 |
+
dist = support_prob_per_action
|
| 442 |
+
else:
|
| 443 |
+
action_scores_mean = reduce_mean_ignore_inf(action_scores, 1)
|
| 444 |
+
action_scores_centered = action_scores - tf.expand_dims(
|
| 445 |
+
action_scores_mean, 1
|
| 446 |
+
)
|
| 447 |
+
value = state_score + action_scores_centered
|
| 448 |
+
else:
|
| 449 |
+
value = action_scores
|
| 450 |
+
|
| 451 |
+
return value, logits, dist, state
|
| 452 |
+
|
| 453 |
+
|
| 454 |
+
@OldAPIStack
|
| 455 |
+
def postprocess_nstep_and_prio(
|
| 456 |
+
policy: Policy, batch: SampleBatch, other_agent=None, episode=None
|
| 457 |
+
) -> SampleBatch:
|
| 458 |
+
# N-step Q adjustments.
|
| 459 |
+
if policy.config["n_step"] > 1:
|
| 460 |
+
adjust_nstep(policy.config["n_step"], policy.config["gamma"], batch)
|
| 461 |
+
|
| 462 |
+
# Create dummy prio-weights (1.0) in case we don't have any in
|
| 463 |
+
# the batch.
|
| 464 |
+
if PRIO_WEIGHTS not in batch:
|
| 465 |
+
batch[PRIO_WEIGHTS] = np.ones_like(batch[SampleBatch.REWARDS])
|
| 466 |
+
|
| 467 |
+
# Prioritize on the worker side.
|
| 468 |
+
if batch.count > 0 and policy.config["replay_buffer_config"].get(
|
| 469 |
+
"worker_side_prioritization", False
|
| 470 |
+
):
|
| 471 |
+
td_errors = policy.compute_td_error(
|
| 472 |
+
batch[SampleBatch.OBS],
|
| 473 |
+
batch[SampleBatch.ACTIONS],
|
| 474 |
+
batch[SampleBatch.REWARDS],
|
| 475 |
+
batch[SampleBatch.NEXT_OBS],
|
| 476 |
+
batch[SampleBatch.TERMINATEDS],
|
| 477 |
+
batch[PRIO_WEIGHTS],
|
| 478 |
+
)
|
| 479 |
+
# Retain compatibility with old-style Replay args
|
| 480 |
+
epsilon = policy.config.get("replay_buffer_config", {}).get(
|
| 481 |
+
"prioritized_replay_eps"
|
| 482 |
+
) or policy.config.get("prioritized_replay_eps")
|
| 483 |
+
if epsilon is None:
|
| 484 |
+
raise ValueError("prioritized_replay_eps not defined in config.")
|
| 485 |
+
|
| 486 |
+
new_priorities = np.abs(convert_to_numpy(td_errors)) + epsilon
|
| 487 |
+
batch[PRIO_WEIGHTS] = new_priorities
|
| 488 |
+
|
| 489 |
+
return batch
|
| 490 |
+
|
| 491 |
+
|
| 492 |
+
DQNTFPolicy = build_tf_policy(
|
| 493 |
+
name="DQNTFPolicy",
|
| 494 |
+
get_default_config=lambda: ray.rllib.algorithms.dqn.dqn.DQNConfig(),
|
| 495 |
+
make_model=build_q_model,
|
| 496 |
+
action_distribution_fn=get_distribution_inputs_and_class,
|
| 497 |
+
loss_fn=build_q_losses,
|
| 498 |
+
stats_fn=build_q_stats,
|
| 499 |
+
postprocess_fn=postprocess_nstep_and_prio,
|
| 500 |
+
optimizer_fn=adam_optimizer,
|
| 501 |
+
compute_gradients_fn=clip_gradients,
|
| 502 |
+
extra_action_out_fn=lambda policy: {"q_values": policy.q_values},
|
| 503 |
+
extra_learn_fetches_fn=lambda policy: {"td_error": policy.q_loss.td_error},
|
| 504 |
+
before_loss_init=setup_mid_mixins,
|
| 505 |
+
after_init=setup_late_mixins,
|
| 506 |
+
mixins=[
|
| 507 |
+
TargetNetworkMixin,
|
| 508 |
+
ComputeTDErrorMixin,
|
| 509 |
+
LearningRateSchedule,
|
| 510 |
+
],
|
| 511 |
+
)
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dqn/dqn_torch_model.py
ADDED
|
@@ -0,0 +1,175 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""PyTorch model for DQN"""
|
| 2 |
+
|
| 3 |
+
from typing import Sequence
|
| 4 |
+
import gymnasium as gym
|
| 5 |
+
from ray.rllib.models.torch.misc import SlimFC
|
| 6 |
+
from ray.rllib.models.torch.modules.noisy_layer import NoisyLayer
|
| 7 |
+
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
|
| 8 |
+
from ray.rllib.utils.annotations import OldAPIStack
|
| 9 |
+
from ray.rllib.utils.framework import try_import_torch
|
| 10 |
+
from ray.rllib.utils.typing import ModelConfigDict
|
| 11 |
+
|
| 12 |
+
torch, nn = try_import_torch()
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
@OldAPIStack
class DQNTorchModel(TorchModelV2, nn.Module):
    """Extension of standard TorchModelV2 to provide dueling-Q functionality."""

    def __init__(
        self,
        obs_space: gym.spaces.Space,
        action_space: gym.spaces.Space,
        num_outputs: int,
        model_config: ModelConfigDict,
        name: str,
        *,
        q_hiddens: Sequence[int] = (256,),
        dueling: bool = False,
        dueling_activation: str = "relu",
        num_atoms: int = 1,
        use_noisy: bool = False,
        v_min: float = -10.0,
        v_max: float = 10.0,
        sigma0: float = 0.5,
        # TODO(sven): Move `add_layer_norm` into ModelCatalog as
        # generic option, then error if we use ParameterNoise as
        # Exploration type and do not have any LayerNorm layers in
        # the net.
        add_layer_norm: bool = False
    ):
        """Initialize variables of this model.

        Extra model kwargs:
            q_hiddens (Sequence[int]): List of layer-sizes after(!) the
                Advantages(A)/Value(V)-split. Hence, each of the A- and V-
                branches will have this structure of Dense layers. To define
                the NN before this A/V-split, use - as always -
                config["model"]["fcnet_hiddens"].
            dueling: Whether to build the advantage(A)/value(V) heads
                for DDQN. If True, Q-values are calculated as:
                Q = (A - mean[A]) + V. If False, raw NN output is interpreted
                as Q-values.
            dueling_activation: The activation to use for all dueling
                layers (A- and V-branch). One of "relu", "tanh", "linear".
            num_atoms: If >1, enables distributional DQN.
            use_noisy: Use noisy layers.
            v_min: Min value support for distributional DQN.
            v_max: Max value support for distributional DQN.
            sigma0 (float): Initial value of noisy layers.
            add_layer_norm: Enable layer norm (for param noise).
        """
        nn.Module.__init__(self)
        super(DQNTorchModel, self).__init__(
            obs_space, action_space, num_outputs, model_config, name
        )

        self.dueling = dueling
        self.num_atoms = num_atoms
        self.v_min = v_min
        self.v_max = v_max
        self.sigma0 = sigma0
        # `ins` tracks the input width of the next layer to add; it starts
        # at the width of the embedding produced by the base model.
        ins = num_outputs

        advantage_module = nn.Sequential()
        value_module = nn.Sequential()

        # Dueling case: Build the shared (advantages and value) fc-network.
        # Both branches get the same `q_hiddens` layer structure.
        for i, n in enumerate(q_hiddens):
            if use_noisy:
                advantage_module.add_module(
                    "dueling_A_{}".format(i),
                    NoisyLayer(
                        ins, n, sigma0=self.sigma0, activation=dueling_activation
                    ),
                )
                value_module.add_module(
                    "dueling_V_{}".format(i),
                    NoisyLayer(
                        ins, n, sigma0=self.sigma0, activation=dueling_activation
                    ),
                )
            else:
                advantage_module.add_module(
                    "dueling_A_{}".format(i),
                    SlimFC(ins, n, activation_fn=dueling_activation),
                )
                value_module.add_module(
                    "dueling_V_{}".format(i),
                    SlimFC(ins, n, activation_fn=dueling_activation),
                )
                # Add LayerNorm after each Dense.
                if add_layer_norm:
                    advantage_module.add_module(
                        "LayerNorm_A_{}".format(i), nn.LayerNorm(n)
                    )
                    value_module.add_module("LayerNorm_V_{}".format(i), nn.LayerNorm(n))
            ins = n

        # Actual Advantages layer (nodes=num-actions).
        # Output width is num-actions * num-atoms (one logit vector per
        # action when num_atoms > 1).
        if use_noisy:
            advantage_module.add_module(
                "A",
                NoisyLayer(
                    ins, self.action_space.n * self.num_atoms, sigma0, activation=None
                ),
            )
        elif q_hiddens:
            advantage_module.add_module(
                "A", SlimFC(ins, action_space.n * self.num_atoms, activation_fn=None)
            )

        self.advantage_module = advantage_module

        # Value layer (nodes=1).
        # Only built when dueling is enabled; otherwise `value_module`
        # remains an empty nn.Sequential.
        if self.dueling:
            if use_noisy:
                value_module.add_module(
                    "V", NoisyLayer(ins, self.num_atoms, sigma0, activation=None)
                )
            elif q_hiddens:
                value_module.add_module(
                    "V", SlimFC(ins, self.num_atoms, activation_fn=None)
                )
        self.value_module = value_module

    def get_q_value_distributions(self, model_out):
        """Returns distributional values for Q(s, a) given a state embedding.

        Override this in your custom model to customize the Q output head.

        Args:
            model_out: Embedding from the model layers.

        Returns:
            (action_scores, logits, dist) if num_atoms == 1, otherwise
            (action_scores, z, support_logits_per_action, logits, dist)
        """
        action_scores = self.advantage_module(model_out)

        if self.num_atoms > 1:
            # Distributional Q-learning uses a discrete support z
            # to represent the action value distribution
            z = torch.arange(0.0, self.num_atoms, dtype=torch.float32).to(
                action_scores.device
            )
            # Scale the [0, num_atoms) indices into the [v_min, v_max] range.
            z = self.v_min + z * (self.v_max - self.v_min) / float(self.num_atoms - 1)

            # Reshape flat logits into (batch, num-actions, num-atoms).
            support_logits_per_action = torch.reshape(
                action_scores, shape=(-1, self.action_space.n, self.num_atoms)
            )
            support_prob_per_action = nn.functional.softmax(
                support_logits_per_action, dim=-1
            )
            # Expected Q-value per action = sum over atoms of z * p(z).
            action_scores = torch.sum(z * support_prob_per_action, dim=-1)
            logits = support_logits_per_action
            probs = support_prob_per_action
            return action_scores, z, support_logits_per_action, logits, probs
        else:
            # Non-distributional case: dummy all-ones logits keep the return
            # structure uniform for downstream code.
            logits = torch.unsqueeze(torch.ones_like(action_scores), -1)
            return action_scores, logits, logits

    def get_state_value(self, model_out):
        """Returns the state value prediction for the given state embedding."""

        return self.value_module(model_out)
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dqn/dqn_torch_policy.py
ADDED
|
@@ -0,0 +1,518 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""PyTorch policy class used for DQN"""
|
| 2 |
+
|
| 3 |
+
from typing import Dict, List, Tuple
|
| 4 |
+
|
| 5 |
+
import gymnasium as gym
|
| 6 |
+
import ray
|
| 7 |
+
from ray.rllib.algorithms.dqn.dqn_tf_policy import (
|
| 8 |
+
PRIO_WEIGHTS,
|
| 9 |
+
Q_SCOPE,
|
| 10 |
+
Q_TARGET_SCOPE,
|
| 11 |
+
postprocess_nstep_and_prio,
|
| 12 |
+
)
|
| 13 |
+
from ray.rllib.algorithms.dqn.dqn_torch_model import DQNTorchModel
|
| 14 |
+
from ray.rllib.models.catalog import ModelCatalog
|
| 15 |
+
from ray.rllib.models.modelv2 import ModelV2
|
| 16 |
+
from ray.rllib.models.torch.torch_action_dist import (
|
| 17 |
+
get_torch_categorical_class_with_temperature,
|
| 18 |
+
TorchDistributionWrapper,
|
| 19 |
+
)
|
| 20 |
+
from ray.rllib.policy.policy import Policy
|
| 21 |
+
from ray.rllib.policy.policy_template import build_policy_class
|
| 22 |
+
from ray.rllib.policy.sample_batch import SampleBatch
|
| 23 |
+
from ray.rllib.policy.torch_mixins import (
|
| 24 |
+
LearningRateSchedule,
|
| 25 |
+
TargetNetworkMixin,
|
| 26 |
+
)
|
| 27 |
+
from ray.rllib.utils.annotations import OldAPIStack
|
| 28 |
+
from ray.rllib.utils.error import UnsupportedSpaceException
|
| 29 |
+
from ray.rllib.utils.exploration.parameter_noise import ParameterNoise
|
| 30 |
+
from ray.rllib.utils.framework import try_import_torch
|
| 31 |
+
from ray.rllib.utils.torch_utils import (
|
| 32 |
+
apply_grad_clipping,
|
| 33 |
+
concat_multi_gpu_td_errors,
|
| 34 |
+
FLOAT_MIN,
|
| 35 |
+
huber_loss,
|
| 36 |
+
l2_loss,
|
| 37 |
+
reduce_mean_ignore_inf,
|
| 38 |
+
softmax_cross_entropy_with_logits,
|
| 39 |
+
)
|
| 40 |
+
from ray.rllib.utils.typing import TensorType, AlgorithmConfigDict
|
| 41 |
+
|
| 42 |
+
torch, nn = try_import_torch()
|
| 43 |
+
F = None
|
| 44 |
+
if nn:
|
| 45 |
+
F = nn.functional
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
@OldAPIStack
class QLoss:
    """Computes the DQN loss (scalar + per-item TD-errors) from Q-tensors.

    Handles both the standard 1-atom case (Huber/L2 on the Bellman error)
    and the distributional case (cross-entropy against the projected
    target distribution, as in the C51/Rainbow papers).
    """

    def __init__(
        self,
        q_t_selected: TensorType,
        q_logits_t_selected: TensorType,
        q_tp1_best: TensorType,
        q_probs_tp1_best: TensorType,
        importance_weights: TensorType,
        rewards: TensorType,
        done_mask: TensorType,
        gamma=0.99,
        n_step=1,
        num_atoms=1,
        v_min=-10.0,
        v_max=10.0,
        loss_fn=huber_loss,
    ):

        if num_atoms > 1:
            # Distributional Q-learning which corresponds to an entropy loss
            z = torch.arange(0.0, num_atoms, dtype=torch.float32).to(rewards.device)
            z = v_min + z * (v_max - v_min) / float(num_atoms - 1)

            # (batch_size, 1) * (1, num_atoms) = (batch_size, num_atoms)
            r_tau = torch.unsqueeze(rewards, -1) + gamma**n_step * torch.unsqueeze(
                1.0 - done_mask, -1
            ) * torch.unsqueeze(z, 0)
            r_tau = torch.clamp(r_tau, v_min, v_max)
            # Project the shifted support back onto fractional atom indices b,
            # then distribute mass onto the neighboring integer atoms lb/ub.
            b = (r_tau - v_min) / ((v_max - v_min) / float(num_atoms - 1))
            lb = torch.floor(b)
            ub = torch.ceil(b)

            # Indispensable judgement which is missed in most implementations
            # when b happens to be an integer, lb == ub, so pr_j(s', a*) will
            # be discarded because (ub-b) == (b-lb) == 0.
            floor_equal_ceil = ((ub - lb) < 0.5).float()

            # (batch_size, num_atoms, num_atoms)
            l_project = F.one_hot(lb.long(), num_atoms)
            # (batch_size, num_atoms, num_atoms)
            u_project = F.one_hot(ub.long(), num_atoms)
            ml_delta = q_probs_tp1_best * (ub - b + floor_equal_ceil)
            mu_delta = q_probs_tp1_best * (b - lb)
            ml_delta = torch.sum(l_project * torch.unsqueeze(ml_delta, -1), dim=1)
            mu_delta = torch.sum(u_project * torch.unsqueeze(mu_delta, -1), dim=1)
            # m is the projected target distribution for the cross-entropy.
            m = ml_delta + mu_delta

            # Rainbow paper claims that using this cross entropy loss for
            # priority is robust and insensitive to `prioritized_replay_alpha`
            self.td_error = softmax_cross_entropy_with_logits(
                logits=q_logits_t_selected, labels=m.detach()
            )
            self.loss = torch.mean(self.td_error * importance_weights)
            self.stats = {
                # TODO: better Q stats for dist dqn
            }
        else:
            # Zero-out target Q-values at terminal states.
            q_tp1_best_masked = (1.0 - done_mask) * q_tp1_best

            # compute RHS of bellman equation
            q_t_selected_target = rewards + gamma**n_step * q_tp1_best_masked

            # compute the error (potentially clipped)
            self.td_error = q_t_selected - q_t_selected_target.detach()
            self.loss = torch.mean(importance_weights.float() * loss_fn(self.td_error))
            self.stats = {
                "mean_q": torch.mean(q_t_selected),
                "min_q": torch.min(q_t_selected),
                "max_q": torch.max(q_t_selected),
            }
|
| 119 |
+
|
| 120 |
+
|
| 121 |
+
@OldAPIStack
class ComputeTDErrorMixin:
    """Assign the `compute_td_error` method to the DQNTorchPolicy

    This allows us to prioritize on the worker side.
    """

    def __init__(self):
        # Define the closure here so it captures `self` (the Policy) and can
        # be called with raw numpy/tensor batch columns from the worker.
        def compute_td_error(
            obs_t, act_t, rew_t, obs_tp1, terminateds_mask, importance_weights
        ):
            # Assemble a SampleBatch-like input dict from the raw columns.
            input_dict = self._lazy_tensor_dict({SampleBatch.CUR_OBS: obs_t})
            input_dict[SampleBatch.ACTIONS] = act_t
            input_dict[SampleBatch.REWARDS] = rew_t
            input_dict[SampleBatch.NEXT_OBS] = obs_tp1
            input_dict[SampleBatch.TERMINATEDS] = terminateds_mask
            input_dict[PRIO_WEIGHTS] = importance_weights

            # Do forward pass on loss to update td error attribute
            build_q_losses(self, self.model, None, input_dict)

            # The loss pass above stored a QLoss object in tower_stats; its
            # `td_error` holds the per-item errors used for prioritization.
            return self.model.tower_stats["q_loss"].td_error

        self.compute_td_error = compute_td_error
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
@OldAPIStack
def build_q_model_and_distribution(
    policy: Policy,
    obs_space: gym.spaces.Space,
    action_space: gym.spaces.Space,
    config: AlgorithmConfigDict,
) -> Tuple[ModelV2, TorchDistributionWrapper]:
    """Build q_model and target_model for DQN

    Args:
        policy: The policy, which will use the model for optimization.
        obs_space (gym.spaces.Space): The policy's observation space.
        action_space (gym.spaces.Space): The policy's action space.
        config (AlgorithmConfigDict):

    Returns:
        (q_model, TorchCategorical)
        Note: The target q model will not be returned, just assigned to
        `policy.target_model`.
    """
    # DQN only supports discrete action spaces.
    if not isinstance(action_space, gym.spaces.Discrete):
        raise UnsupportedSpaceException(
            "Action space {} is not supported for DQN.".format(action_space)
        )

    if config["hiddens"]:
        # try to infer the last layer size, otherwise fall back to 256
        num_outputs = ([256] + list(config["model"]["fcnet_hiddens"]))[-1]
        # NOTE: this mutates the shared model config so the base network
        # emits an embedding (no final linear layer) for the Q head(s).
        config["model"]["no_final_linear"] = True
    else:
        num_outputs = action_space.n

    # TODO(sven): Move option to add LayerNorm after each Dense
    # generically into ModelCatalog.
    add_layer_norm = (
        isinstance(getattr(policy, "exploration", None), ParameterNoise)
        or config["exploration_config"]["type"] == "ParameterNoise"
    )

    model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=num_outputs,
        model_config=config["model"],
        framework="torch",
        model_interface=DQNTorchModel,
        name=Q_SCOPE,
        q_hiddens=config["hiddens"],
        dueling=config["dueling"],
        num_atoms=config["num_atoms"],
        use_noisy=config["noisy"],
        v_min=config["v_min"],
        v_max=config["v_max"],
        sigma0=config["sigma0"],
        # TODO(sven): Move option to add LayerNorm after each Dense
        # generically into ModelCatalog.
        add_layer_norm=add_layer_norm,
    )

    # Build the target network with identical settings; it is only assigned
    # to the policy, not returned.
    policy.target_model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=num_outputs,
        model_config=config["model"],
        framework="torch",
        model_interface=DQNTorchModel,
        name=Q_TARGET_SCOPE,
        q_hiddens=config["hiddens"],
        dueling=config["dueling"],
        num_atoms=config["num_atoms"],
        use_noisy=config["noisy"],
        v_min=config["v_min"],
        v_max=config["v_max"],
        sigma0=config["sigma0"],
        # TODO(sven): Move option to add LayerNorm after each Dense
        # generically into ModelCatalog.
        add_layer_norm=add_layer_norm,
    )

    # Return a Torch TorchCategorical distribution where the temperature
    # parameter is partially bound to the configured value.
    temperature = config["categorical_distribution_temperature"]

    return model, get_torch_categorical_class_with_temperature(temperature)
|
| 231 |
+
|
| 232 |
+
|
| 233 |
+
@OldAPIStack
def get_distribution_inputs_and_class(
    policy: Policy,
    model: ModelV2,
    input_dict: SampleBatch,
    *,
    explore: bool = True,
    is_training: bool = False,
    **kwargs
) -> Tuple[TensorType, type, List[TensorType]]:
    """Compute Q-values for `input_dict` and select the action-dist class.

    Returns:
        Tuple of (distribution inputs (Q-values), distribution class,
        state-outs (always empty for DQN)).
    """
    out = compute_q_values(
        policy, model, input_dict, explore=explore, is_training=is_training
    )
    # `compute_q_values` may return extra items (logits, probs, state);
    # only the Q-values themselves are the distribution inputs.
    if isinstance(out, tuple):
        q_vals = out[0]
    else:
        q_vals = out

    # Stash the Q-values per tower so `extra_action_out_fn` can expose them.
    model.tower_stats["q_values"] = q_vals

    # TorchCategorical class with the temperature parameter partially bound
    # to the configured value.
    dist_class = get_torch_categorical_class_with_temperature(
        policy.config["categorical_distribution_temperature"]
    )

    return q_vals, dist_class, []  # no state-outs
|
| 259 |
+
|
| 260 |
+
|
| 261 |
+
@OldAPIStack
def build_q_losses(policy: Policy, model, _, train_batch: SampleBatch) -> TensorType:
    """Constructs the loss for DQNTorchPolicy.

    Args:
        policy: The Policy to calculate the loss for.
        model (ModelV2): The Model to calculate the loss for.
        train_batch: The training data.

    Returns:
        TensorType: A single loss tensor.
    """

    config = policy.config
    # Q-network evaluation.
    q_t, q_logits_t, q_probs_t, _ = compute_q_values(
        policy,
        model,
        {"obs": train_batch[SampleBatch.CUR_OBS]},
        explore=False,
        is_training=True,
    )

    # Target Q-network evaluation.
    q_tp1, q_logits_tp1, q_probs_tp1, _ = compute_q_values(
        policy,
        policy.target_models[model],
        {"obs": train_batch[SampleBatch.NEXT_OBS]},
        explore=False,
        is_training=True,
    )

    # Q scores for actions which we know were selected in the given state.
    one_hot_selection = F.one_hot(
        train_batch[SampleBatch.ACTIONS].long(), policy.action_space.n
    )
    # Masked (invalid-action) Q-values can be FLOAT_MIN; zero them before
    # summing so they don't poison the selected value.
    q_t_selected = torch.sum(
        torch.where(q_t > FLOAT_MIN, q_t, torch.tensor(0.0, device=q_t.device))
        * one_hot_selection,
        1,
    )
    q_logits_t_selected = torch.sum(
        q_logits_t * torch.unsqueeze(one_hot_selection, -1), 1
    )

    # compute estimate of best possible value starting from state at t + 1
    if config["double_q"]:
        # Double-Q: select argmax action via the ONLINE net, evaluate it
        # with the TARGET net.
        (
            q_tp1_using_online_net,
            q_logits_tp1_using_online_net,
            q_dist_tp1_using_online_net,
            _,
        ) = compute_q_values(
            policy,
            model,
            {"obs": train_batch[SampleBatch.NEXT_OBS]},
            explore=False,
            is_training=True,
        )
        q_tp1_best_using_online_net = torch.argmax(q_tp1_using_online_net, 1)
        q_tp1_best_one_hot_selection = F.one_hot(
            q_tp1_best_using_online_net, policy.action_space.n
        )
        q_tp1_best = torch.sum(
            torch.where(
                q_tp1 > FLOAT_MIN, q_tp1, torch.tensor(0.0, device=q_tp1.device)
            )
            * q_tp1_best_one_hot_selection,
            1,
        )
        q_probs_tp1_best = torch.sum(
            q_probs_tp1 * torch.unsqueeze(q_tp1_best_one_hot_selection, -1), 1
        )
    else:
        # Vanilla DQN: both selection and evaluation use the target net.
        q_tp1_best_one_hot_selection = F.one_hot(
            torch.argmax(q_tp1, 1), policy.action_space.n
        )
        q_tp1_best = torch.sum(
            torch.where(
                q_tp1 > FLOAT_MIN, q_tp1, torch.tensor(0.0, device=q_tp1.device)
            )
            * q_tp1_best_one_hot_selection,
            1,
        )
        q_probs_tp1_best = torch.sum(
            q_probs_tp1 * torch.unsqueeze(q_tp1_best_one_hot_selection, -1), 1
        )

    loss_fn = huber_loss if policy.config["td_error_loss_fn"] == "huber" else l2_loss

    q_loss = QLoss(
        q_t_selected,
        q_logits_t_selected,
        q_tp1_best,
        q_probs_tp1_best,
        train_batch[PRIO_WEIGHTS],
        train_batch[SampleBatch.REWARDS],
        train_batch[SampleBatch.TERMINATEDS].float(),
        config["gamma"],
        config["n_step"],
        config["num_atoms"],
        config["v_min"],
        config["v_max"],
        loss_fn,
    )

    # Store values for stats function in model (tower), such that for
    # multi-GPU, we do not override them during the parallel loss phase.
    model.tower_stats["td_error"] = q_loss.td_error
    # TD-error tensor in final stats
    # will be concatenated and retrieved for each individual batch item.
    model.tower_stats["q_loss"] = q_loss

    return q_loss.loss
|
| 375 |
+
|
| 376 |
+
|
| 377 |
+
@OldAPIStack
def adam_optimizer(
    policy: Policy, config: AlgorithmConfigDict
) -> "torch.optim.Optimizer":
    """Create the Adam optimizer over the Q-network's variables.

    By this time, the models have been moved to the GPU - if any - so the
    optimizer is defined using the correct CUDA variables.
    """
    # Cache the variable list on the policy on first use.
    try:
        params = policy.q_func_vars
    except AttributeError:
        params = policy.q_func_vars = policy.model.variables()

    return torch.optim.Adam(params, lr=policy.cur_lr, eps=config["adam_epsilon"])
|
| 390 |
+
|
| 391 |
+
|
| 392 |
+
@OldAPIStack
def build_q_stats(policy: Policy, batch) -> Dict[str, TensorType]:
    """Average the per-tower QLoss stats across all (multi-GPU) towers."""
    reference_stats = policy.model_gpu_towers[0].tower_stats["q_loss"].stats
    stats = {
        stats_key: torch.mean(
            torch.stack(
                [
                    tower.tower_stats["q_loss"].stats[stats_key].to(policy.device)
                    for tower in policy.model_gpu_towers
                    if "q_loss" in tower.tower_stats
                ]
            )
        )
        for stats_key in reference_stats.keys()
    }
    stats["cur_lr"] = policy.cur_lr
    return stats
|
| 407 |
+
|
| 408 |
+
|
| 409 |
+
@OldAPIStack
def setup_early_mixins(
    policy: Policy, obs_space, action_space, config: AlgorithmConfigDict
) -> None:
    """Initialize mixins needed before the policy's loss/optimizer exist.

    Only the LR schedule must be set up this early, so that `policy.cur_lr`
    is available when `adam_optimizer` is called.
    """
    lr, lr_schedule = config["lr"], config["lr_schedule"]
    LearningRateSchedule.__init__(policy, lr, lr_schedule)
|
| 414 |
+
|
| 415 |
+
|
| 416 |
+
@OldAPIStack
def before_loss_init(
    policy: Policy,
    obs_space: gym.spaces.Space,
    action_space: gym.spaces.Space,
    config: AlgorithmConfigDict,
) -> None:
    """Attach TD-error computation and target-network handling to the policy."""
    # Order matters only insofar as both must run before loss construction.
    for mixin_cls in (ComputeTDErrorMixin, TargetNetworkMixin):
        mixin_cls.__init__(policy)
|
| 426 |
+
|
| 427 |
+
@OldAPIStack
def compute_q_values(
    policy: Policy,
    model: ModelV2,
    input_dict,
    state_batches=None,
    seq_lens=None,
    explore=None,
    is_training: bool = False,
):
    """Run a forward pass and return (Q-values, logits, probs/logits, state).

    Combines the model's advantage head with the (optional) dueling value
    head and, for distributional DQN (num_atoms > 1), reduces the per-atom
    distribution to expected Q-values.
    """
    config = policy.config

    # Base model forward pass -> state embedding.
    model_out, state = model(input_dict, state_batches or [], seq_lens)

    if config["num_atoms"] > 1:
        (
            action_scores,
            z,
            support_logits_per_action,
            logits,
            probs_or_logits,
        ) = model.get_q_value_distributions(model_out)
    else:
        (action_scores, logits, probs_or_logits) = model.get_q_value_distributions(
            model_out
        )

    if config["dueling"]:
        state_score = model.get_state_value(model_out)
        if policy.config["num_atoms"] > 1:
            # Dueling + distributional: center the per-action support logits
            # around their mean, then add the state value (Q = V + A - mean[A]
            # applied per atom).
            support_logits_per_action_mean = torch.mean(
                support_logits_per_action, dim=1
            )
            support_logits_per_action_centered = (
                support_logits_per_action
                - torch.unsqueeze(support_logits_per_action_mean, dim=1)
            )
            support_logits_per_action = (
                torch.unsqueeze(state_score, dim=1) + support_logits_per_action_centered
            )
            support_prob_per_action = nn.functional.softmax(
                support_logits_per_action, dim=-1
            )
            # Expected Q-values: sum over atoms of z * p(z).
            value = torch.sum(z * support_prob_per_action, dim=-1)
            logits = support_logits_per_action
            probs_or_logits = support_prob_per_action
        else:
            # Plain dueling: Q = V + (A - mean[A]). The mean ignores -inf
            # entries (masked/invalid actions).
            advantages_mean = reduce_mean_ignore_inf(action_scores, 1)
            advantages_centered = action_scores - torch.unsqueeze(advantages_mean, 1)
            value = state_score + advantages_centered
    else:
        value = action_scores

    return value, logits, probs_or_logits, state
|
| 481 |
+
|
| 482 |
+
|
| 483 |
+
@OldAPIStack
def grad_process_and_td_error_fn(
    policy: Policy, optimizer: "torch.optim.Optimizer", loss: TensorType
) -> Dict[str, TensorType]:
    """Post-process gradients after the backward pass.

    Applies gradient clipping according to the policy's config (no-op when
    clipping is not configured).
    """
    grad_info = apply_grad_clipping(policy, optimizer, loss)
    return grad_info
|
| 489 |
+
|
| 490 |
+
|
| 491 |
+
@OldAPIStack
def extra_action_out_fn(
    policy: Policy, input_dict, state_batches, model, action_dist
) -> Dict[str, TensorType]:
    """Expose the Q-values computed during action sampling as extra outputs."""
    # Stored by `get_distribution_inputs_and_class` during the forward pass.
    q_values = model.tower_stats["q_values"]
    return {"q_values": q_values}
|
| 496 |
+
|
| 497 |
+
|
| 498 |
+
# Old-API-stack Torch policy for DQN, assembled from the functional building
# blocks defined above via the `build_policy_class` factory.
DQNTorchPolicy = build_policy_class(
    name="DQNTorchPolicy",
    framework="torch",
    loss_fn=build_q_losses,
    get_default_config=lambda: ray.rllib.algorithms.dqn.dqn.DQNConfig(),
    make_model_and_action_dist=build_q_model_and_distribution,
    action_distribution_fn=get_distribution_inputs_and_class,
    stats_fn=build_q_stats,
    postprocess_fn=postprocess_nstep_and_prio,
    optimizer_fn=adam_optimizer,
    extra_grad_process_fn=grad_process_and_td_error_fn,
    # Concatenate per-tower TD-errors for multi-GPU learning.
    extra_learn_fetches_fn=concat_multi_gpu_td_errors,
    extra_action_out_fn=extra_action_out_fn,
    before_init=setup_early_mixins,
    before_loss_init=before_loss_init,
    mixins=[
        TargetNetworkMixin,
        ComputeTDErrorMixin,
        LearningRateSchedule,
    ],
)
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dqn/torch/__init__.py
ADDED
|
File without changes
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dqn/torch/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (203 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dqn/torch/__pycache__/default_dqn_torch_rl_module.cpython-311.pyc
ADDED
|
Binary file (13.4 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dqn/torch/__pycache__/dqn_torch_learner.cpython-311.pyc
ADDED
|
Binary file (11.6 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dqn/torch/default_dqn_torch_rl_module.py
ADDED
|
@@ -0,0 +1,327 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import tree
|
| 2 |
+
from typing import Dict, Union
|
| 3 |
+
|
| 4 |
+
from ray.rllib.algorithms.dqn.default_dqn_rl_module import (
|
| 5 |
+
DefaultDQNRLModule,
|
| 6 |
+
ATOMS,
|
| 7 |
+
QF_LOGITS,
|
| 8 |
+
QF_NEXT_PREDS,
|
| 9 |
+
QF_PREDS,
|
| 10 |
+
QF_PROBS,
|
| 11 |
+
QF_TARGET_NEXT_PREDS,
|
| 12 |
+
QF_TARGET_NEXT_PROBS,
|
| 13 |
+
)
|
| 14 |
+
from ray.rllib.algorithms.dqn.dqn_catalog import DQNCatalog
|
| 15 |
+
from ray.rllib.core.columns import Columns
|
| 16 |
+
from ray.rllib.core.models.base import Encoder, ENCODER_OUT, Model
|
| 17 |
+
from ray.rllib.core.rl_module.apis.q_net_api import QNetAPI
|
| 18 |
+
from ray.rllib.core.rl_module.torch.torch_rl_module import TorchRLModule
|
| 19 |
+
from ray.rllib.core.rl_module.rl_module import RLModule
|
| 20 |
+
from ray.rllib.utils.annotations import override
|
| 21 |
+
from ray.rllib.utils.framework import try_import_torch
|
| 22 |
+
from ray.rllib.utils.typing import TensorType, TensorStructType
|
| 23 |
+
from ray.util.annotations import DeveloperAPI
|
| 24 |
+
|
| 25 |
+
torch, nn = try_import_torch()
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
@DeveloperAPI
class DefaultDQNTorchRLModule(TorchRLModule, DefaultDQNRLModule):
    """Torch implementation of RLlib's default DQN (Rainbow) RLModule.

    Provides the three forward passes (inference, exploration, train) on top
    of the framework-agnostic `DefaultDQNRLModule`, including epsilon-greedy
    exploration, optional double-Q target evaluation, dueling heads, and
    distributional (C51-style) Q-learning.
    """

    # Framework tag used by RLlib to identify this module as a torch module.
    framework: str = "torch"

    def __init__(self, *args, **kwargs):
        """Initializes the module, defaulting `catalog_class` to `DQNCatalog`."""
        catalog_class = kwargs.pop("catalog_class", None)
        if catalog_class is None:
            catalog_class = DQNCatalog
        super().__init__(*args, **kwargs, catalog_class=catalog_class)

    @override(RLModule)
    def _forward_inference(self, batch: Dict[str, TensorType]) -> Dict[str, TensorType]:
        """Computes greedy (exploitation-only) actions for inference."""
        # Q-network forward pass.
        qf_outs = self.compute_q_values(batch)

        # Get action distribution.
        action_dist_cls = self.get_exploration_action_dist_cls()
        action_dist = action_dist_cls.from_logits(qf_outs[QF_PREDS])
        # Note, the deterministic version of the categorical distribution
        # outputs directly the `argmax` of the logits.
        exploit_actions = action_dist.to_deterministic().sample()

        output = {Columns.ACTIONS: exploit_actions}
        # Pass through recurrent state, if the encoder is stateful.
        if Columns.STATE_OUT in qf_outs:
            output[Columns.STATE_OUT] = qf_outs[Columns.STATE_OUT]

        # In inference, we only need the exploitation actions.
        return output

    @override(RLModule)
    def _forward_exploration(
        self, batch: Dict[str, TensorType], t: int
    ) -> Dict[str, TensorType]:
        """Computes epsilon-greedy actions for env sampling.

        Args:
            batch: Input batch containing at least observations.
            t: Current global timestep; drives the epsilon schedule.

        Returns:
            Dict with `Columns.ACTIONS` (and `Columns.STATE_OUT` for stateful
            modules).
        """
        # Define the return dictionary.
        output = {}

        # Q-network forward pass.
        qf_outs = self.compute_q_values(batch)

        # Get action distribution.
        action_dist_cls = self.get_exploration_action_dist_cls()
        action_dist = action_dist_cls.from_logits(qf_outs[QF_PREDS])
        # Note, the deterministic version of the categorical distribution
        # outputs directly the `argmax` of the logits.
        exploit_actions = action_dist.to_deterministic().sample()

        # We need epsilon greedy to support exploration.
        # TODO (simon): Implement sampling for nested spaces.
        # Update scheduler.
        self.epsilon_schedule.update(t)
        # Get the actual epsilon,
        epsilon = self.epsilon_schedule.get_current_value()
        # Apply epsilon-greedy exploration.
        B = qf_outs[QF_PREDS].shape[0]
        # Sample uniformly among actions whose Q-value is finite/non-zero
        # (-inf-masked invalid actions are zeroed out and thus excluded).
        random_actions = torch.squeeze(
            torch.multinomial(
                (
                    torch.nan_to_num(
                        qf_outs[QF_PREDS].reshape(-1, qf_outs[QF_PREDS].size(-1)),
                        neginf=0.0,
                    )
                    != 0.0
                ).float(),
                num_samples=1,
            ),
            dim=1,
        )

        # With probability epsilon pick the random action, else the greedy one.
        actions = torch.where(
            torch.rand((B,)) < epsilon,
            random_actions,
            exploit_actions,
        )

        # Add the actions to the return dictionary.
        output[Columns.ACTIONS] = actions

        # If this is a stateful module, add output states.
        if Columns.STATE_OUT in qf_outs:
            output[Columns.STATE_OUT] = qf_outs[Columns.STATE_OUT]

        return output

    @override(RLModule)
    def _forward_train(
        self, batch: Dict[str, TensorType]
    ) -> Dict[str, TensorStructType]:
        """Computes all Q-value outputs needed by the DQN loss.

        For double-Q, current and next observations are concatenated into one
        batch so that online Q-values for both are computed in a single
        forward pass, then split again. Target-network Q-values are always
        computed on the next observations.

        Raises:
            RuntimeError: If called on an `inference_only` module.
        """
        if self.inference_only:
            raise RuntimeError(
                "Trying to train a module that is not a learner module. Set the "
                "flag `inference_only=False` when building the module."
            )
        output = {}

        # If we use a double-Q setup.
        if self.uses_double_q:
            # Then we need to make a single forward pass with both,
            # current and next observations.
            batch_base = {
                Columns.OBS: torch.concat(
                    [batch[Columns.OBS], batch[Columns.NEXT_OBS]], dim=0
                ),
            }
            # If this is a stateful module add the input states.
            if Columns.STATE_IN in batch:
                # Add both, the input state for the actual observation and
                # the one for the next observation.
                batch_base.update(
                    {
                        Columns.STATE_IN: tree.map_structure(
                            lambda t1, t2: torch.cat([t1, t2], dim=0),
                            batch[Columns.STATE_IN],
                            batch[Columns.NEXT_STATE_IN],
                        )
                    }
                )
        # Otherwise we can just use the current observations.
        else:
            batch_base = {Columns.OBS: batch[Columns.OBS]}
            # If this is a stateful module add the input state.
            if Columns.STATE_IN in batch:
                batch_base.update({Columns.STATE_IN: batch[Columns.STATE_IN]})

        batch_target = {Columns.OBS: batch[Columns.NEXT_OBS]}

        # If we have a stateful encoder, add the states for the target forward
        # pass.
        if Columns.NEXT_STATE_IN in batch:
            batch_target.update({Columns.STATE_IN: batch[Columns.NEXT_STATE_IN]})

        # Q-network forward passes.
        qf_outs = self.compute_q_values(batch_base)
        if self.uses_double_q:
            # First half of the concatenated batch holds Q(s), second Q(s').
            output[QF_PREDS], output[QF_NEXT_PREDS] = torch.chunk(
                qf_outs[QF_PREDS], chunks=2, dim=0
            )
        else:
            output[QF_PREDS] = qf_outs[QF_PREDS]
        # The target Q-values for the next observations.
        qf_target_next_outs = self.forward_target(batch_target)
        output[QF_TARGET_NEXT_PREDS] = qf_target_next_outs[QF_PREDS]
        # We are learning a Q-value distribution.
        if self.num_atoms > 1:
            # Add distribution artefacts to the output.
            # Distribution support.
            output[ATOMS] = qf_target_next_outs[ATOMS]
            # Original logits from the Q-head.
            output[QF_LOGITS] = qf_outs[QF_LOGITS]
            # Probabilities of the Q-value distribution of the current state.
            output[QF_PROBS] = qf_outs[QF_PROBS]
            # Probabilities of the target Q-value distribution of the next state.
            output[QF_TARGET_NEXT_PROBS] = qf_target_next_outs[QF_PROBS]

        # Add the states to the output, if the module is stateful.
        if Columns.STATE_OUT in qf_outs:
            output[Columns.STATE_OUT] = qf_outs[Columns.STATE_OUT]
        # For correctness, also add the output states from the target forward pass.
        # Note, we do not backpropagate through this state.
        if Columns.STATE_OUT in qf_target_next_outs:
            output[Columns.NEXT_STATE_OUT] = qf_target_next_outs[Columns.STATE_OUT]

        return output

    @override(QNetAPI)
    def compute_advantage_distribution(
        self,
        # NOTE(review): despite the dict-typed API signature, the body uses
        # `batch` as a single tensor of head logits (`batch.device`,
        # `batch.shape`, `torch.reshape(batch, ...)`) — annotated accordingly.
        batch: TensorType,
    ) -> Dict[str, TensorType]:
        """Computes the per-action value-atom distribution from head logits.

        Returns a dict with the rescaled support `ATOMS` (shape
        `(num_atoms,)`), per-action per-atom "logits", and softmaxed "probs"
        (each of shape `(..., action_space.n, num_atoms)`).
        """
        output = {}
        # Distributional Q-learning uses a discrete support `z`
        # to represent the action value distribution.
        # TODO (simon): Check, if we still need here the device for torch.
        z = torch.arange(0.0, self.num_atoms, dtype=torch.float32).to(
            batch.device,
        )
        # Rescale the support.
        z = self.v_min + z * (self.v_max - self.v_min) / float(self.num_atoms - 1)
        # Reshape the action values.
        # NOTE: Handcrafted action shape.
        logits_per_action_per_atom = torch.reshape(
            batch, shape=(*batch.shape[:-1], self.action_space.n, self.num_atoms)
        )
        # Calculate the probability for each action value atom. Note,
        # the sum along action value atoms of a single action value
        # must sum to one.
        prob_per_action_per_atom = nn.functional.softmax(
            logits_per_action_per_atom,
            dim=-1,
        )
        # Compute expected action value by weighted sum.
        output[ATOMS] = z
        output["logits"] = logits_per_action_per_atom
        output["probs"] = prob_per_action_per_atom

        return output

    # TODO (simon): Test, if providing the function with a `return_probs`
    # improves performance significantly.
    @override(DefaultDQNRLModule)
    def _qf_forward_helper(
        self,
        batch: Dict[str, TensorType],
        encoder: Encoder,
        head: Union[Model, Dict[str, Model]],
    ) -> Dict[str, TensorType]:
        """Computes Q-values.

        This is a helper function that takes care of all different cases,
        i.e. if we use a dueling architecture or not and if we use distributional
        Q-learning or not.

        Args:
            batch: The batch received in the forward pass.
            encoder: The encoder network to use. Here we have a single encoder
                for all heads (Q or advantages and value in case of a dueling
                architecture).
            head: Either a head model or a dictionary of head model (dueling
                architecture) containing advantage and value stream heads.

        Returns:
            In case of expectation learning the Q-value predictions ("qf_preds")
            and in case of distributional Q-learning in addition to the predictions
            the atoms ("atoms"), the Q-value predictions ("qf_preds"), the Q-logits
            ("qf_logits") and the probabilities for the support atoms ("qf_probs").
        """
        output = {}

        # Encoder forward pass.
        encoder_outs = encoder(batch)

        # Do we have a dueling architecture.
        if self.uses_dueling:
            # Head forward passes for advantage and value stream.
            qf_outs = head["af"](encoder_outs[ENCODER_OUT])
            vf_outs = head["vf"](encoder_outs[ENCODER_OUT])
            # We learn a Q-value distribution.
            if self.num_atoms > 1:
                # Compute the advantage stream distribution.
                af_dist_output = self.compute_advantage_distribution(qf_outs)
                # Center the advantage stream distribution.
                centered_af_logits = af_dist_output["logits"] - af_dist_output[
                    "logits"
                ].mean(dim=-1, keepdim=True)
                # Calculate the Q-value distribution by adding advantage and
                # value stream.
                qf_logits = centered_af_logits + vf_outs.view(
                    -1, *((1,) * (centered_af_logits.dim() - 1))
                )
                # Calculate probabilites for the Q-value distribution along
                # the support given by the atoms.
                qf_probs = nn.functional.softmax(qf_logits, dim=-1)
                # Return also the support as we need it in the learner.
                output[ATOMS] = af_dist_output[ATOMS]
                # Calculate the Q-values by the weighted sum over the atoms.
                output[QF_PREDS] = torch.sum(af_dist_output[ATOMS] * qf_probs, dim=-1)
                output[QF_LOGITS] = qf_logits
                output[QF_PROBS] = qf_probs
            # Otherwise we learn an expectation.
            else:
                # Center advantages. Note, we cannot do an in-place operation here
                # b/c we backpropagate through these values. See for a discussion
                # https://discuss.pytorch.org/t/gradient-computation-issue-due-to-
                # inplace-operation-unsure-how-to-debug-for-custom-model/170133
                # Has to be a mean for each batch element.
                af_outs_mean = torch.nan_to_num(qf_outs, neginf=torch.nan).nanmean(
                    dim=-1, keepdim=True
                )
                qf_outs = qf_outs - af_outs_mean
                # Add advantage and value stream. Note, we broadcast here.
                output[QF_PREDS] = qf_outs + vf_outs
        # No dueling architecture.
        else:
            # Note, in this case the advantage network is the Q-network.
            # Forward pass through Q-head.
            qf_outs = head(encoder_outs[ENCODER_OUT])
            # We learn a Q-value distribution.
            if self.num_atoms > 1:
                # Note in a non-dueling architecture the advantage distribution is
                # the Q-value distribution.
                # Get the Q-value distribution.
                qf_dist_outs = self.compute_advantage_distribution(qf_outs)
                # Get the support of the Q-value distribution.
                output[ATOMS] = qf_dist_outs[ATOMS]
                # Calculate the Q-values by the weighted sum over the atoms.
                output[QF_PREDS] = torch.sum(
                    qf_dist_outs[ATOMS] * qf_dist_outs["probs"], dim=-1
                )
                output[QF_LOGITS] = qf_dist_outs["logits"]
                output[QF_PROBS] = qf_dist_outs["probs"]
            # Otherwise we learn an expectation.
            else:
                # In this case we have a Q-head of dimension (1, action_space.n).
                output[QF_PREDS] = qf_outs

        # If we have a stateful encoder add the output states to the return
        # dictionary.
        if Columns.STATE_OUT in encoder_outs:
            output[Columns.STATE_OUT] = encoder_outs[Columns.STATE_OUT]

        return output
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dqn/torch/dqn_torch_learner.py
ADDED
|
@@ -0,0 +1,295 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Dict
|
| 2 |
+
|
| 3 |
+
from ray.rllib.algorithms.dqn.dqn import DQNConfig
|
| 4 |
+
from ray.rllib.algorithms.dqn.dqn_learner import (
|
| 5 |
+
ATOMS,
|
| 6 |
+
DQNLearner,
|
| 7 |
+
QF_LOSS_KEY,
|
| 8 |
+
QF_LOGITS,
|
| 9 |
+
QF_MEAN_KEY,
|
| 10 |
+
QF_MAX_KEY,
|
| 11 |
+
QF_MIN_KEY,
|
| 12 |
+
QF_NEXT_PREDS,
|
| 13 |
+
QF_TARGET_NEXT_PREDS,
|
| 14 |
+
QF_TARGET_NEXT_PROBS,
|
| 15 |
+
QF_PREDS,
|
| 16 |
+
QF_PROBS,
|
| 17 |
+
TD_ERROR_MEAN_KEY,
|
| 18 |
+
)
|
| 19 |
+
from ray.rllib.core.columns import Columns
|
| 20 |
+
from ray.rllib.core.learner.torch.torch_learner import TorchLearner
|
| 21 |
+
from ray.rllib.utils.annotations import override
|
| 22 |
+
from ray.rllib.utils.framework import try_import_torch
|
| 23 |
+
from ray.rllib.utils.metrics import TD_ERROR_KEY
|
| 24 |
+
from ray.rllib.utils.typing import ModuleID, TensorType
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
torch, nn = try_import_torch()
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
class DQNTorchLearner(DQNLearner, TorchLearner):
    """Implements `torch`-specific DQN Rainbow loss logic on top of `DQNLearner`.

    This `Learner` class implements the loss in its
    `self.compute_loss_for_module()` method. It covers both the expectation
    case (Huber/MSE TD loss, optionally double-Q) and the distributional case
    (`config.num_atoms > 1`, C51-style categorical projection with a
    cross-entropy loss).
    """

    @override(TorchLearner)
    def compute_loss_for_module(
        self,
        *,
        module_id: ModuleID,          # ID of the RLModule being optimized.
        config: DQNConfig,            # Module-specific DQN config.
        batch: Dict,                  # (Possibly n-step) training batch.
        fwd_out: Dict[str, TensorType]  # Output of the module's `_forward_train`.
    ) -> TensorType:
        """Computes the (importance-weighted) DQN loss for one module.

        Also logs the per-sample TD-error and summary Q-value stats into
        `self.metrics`. Returns the scalar total loss.
        """

        # Possibly apply masking to some sub loss terms and to the total loss term
        # at the end. Masking could be used for RNN-based model (zero padded `batch`)
        # and for PPO's batched value function (and bootstrap value) computations,
        # for which we add an (artificial) timestep to each episode to
        # simplify the actual computation.
        if Columns.LOSS_MASK in batch:
            mask = batch[Columns.LOSS_MASK].clone()
            # Check, if a burn-in should be used to recover from a poor state.
            # NOTE(review): reads `self.config.burn_in_len` here but the
            # per-module `config` elsewhere — confirm this asymmetry is intended.
            if self.config.burn_in_len > 0:
                # Train only on the timesteps after the burn-in period.
                mask[:, : self.config.burn_in_len] = False
            num_valid = torch.sum(mask)

            def possibly_masked_mean(data_):
                return torch.sum(data_[mask]) / num_valid

            def possibly_masked_min(data_):
                # Prevent minimum over empty tensors, which can happened
                # when all elements in the mask are `False`.
                return (
                    torch.tensor(float("nan"))
                    if data_[mask].numel() == 0
                    else torch.min(data_[mask])
                )

            def possibly_masked_max(data_):
                # Prevent maximum over empty tensors, which can happened
                # when all elements in the mask are `False`.
                return (
                    torch.tensor(float("nan"))
                    if data_[mask].numel() == 0
                    else torch.max(data_[mask])
                )

        else:
            possibly_masked_mean = torch.mean
            possibly_masked_min = torch.min
            possibly_masked_max = torch.max

        q_curr = fwd_out[QF_PREDS]
        q_target_next = fwd_out[QF_TARGET_NEXT_PREDS]

        # Get the Q-values for the selected actions in the rollout.
        # TODO (simon, sven): Check, if we can use `gather` with a complex action
        # space - we might need the one_hot_selection. Also test performance.
        q_selected = torch.nan_to_num(
            torch.gather(
                q_curr,
                dim=-1,
                index=batch[Columns.ACTIONS]
                .view(*batch[Columns.ACTIONS].shape, 1)
                .long(),
            ),
            neginf=0.0,
        ).squeeze(dim=-1)

        # Use double Q learning.
        if config.double_q:
            # Then we evaluate the target Q-function at the best action (greedy action)
            # over the online Q-function.
            # Mark the best online Q-value of the next state.
            q_next_best_idx = (
                torch.argmax(fwd_out[QF_NEXT_PREDS], dim=-1).unsqueeze(dim=-1).long()
            )
            # Get the Q-value of the target network at maximum of the online network
            # (bootstrap action).
            q_next_best = torch.nan_to_num(
                torch.gather(q_target_next, dim=-1, index=q_next_best_idx),
                neginf=0.0,
            ).squeeze()
        else:
            # Mark the maximum Q-value(s).
            q_next_best_idx = (
                torch.argmax(q_target_next, dim=-1).unsqueeze(dim=-1).long()
            )
            # Get the maximum Q-value(s).
            q_next_best = torch.nan_to_num(
                torch.gather(q_target_next, dim=-1, index=q_next_best_idx),
                neginf=0.0,
            ).squeeze()

        # If we learn a Q-distribution.
        if config.num_atoms > 1:
            # Extract the Q-logits evaluated at the selected actions.
            # (Note, `torch.gather` should be faster than multiplication
            # with a one-hot tensor.)
            # (32, 2, 10) -> (32, 10)
            q_logits_selected = torch.gather(
                fwd_out[QF_LOGITS],
                dim=1,
                # Note, the Q-logits are of shape (B, action_space.n, num_atoms)
                # while the actions have shape (B, 1). We reshape actions to
                # (B, 1, num_atoms).
                index=batch[Columns.ACTIONS]
                .view(-1, 1, 1)
                .expand(-1, 1, config.num_atoms)
                .long(),
            ).squeeze(dim=1)
            # Get the probabilies for the maximum Q-value(s).
            q_probs_next_best = torch.gather(
                fwd_out[QF_TARGET_NEXT_PROBS],
                dim=1,
                # Change the view and then expand to get to the dimensions
                # of the probabilities (dims 0 and 2, 1 should be reduced
                # from 2 -> 1).
                index=q_next_best_idx.view(-1, 1, 1).expand(-1, 1, config.num_atoms),
            ).squeeze(dim=1)

            # For distributional Q-learning we use an entropy loss.

            # Extract the support grid for the Q distribution.
            z = fwd_out[ATOMS]
            # TODO (simon): Enable computing on GPU.
            # (batch_size, 1) * (1, num_atoms) = (batch_size, num_atoms)s
            # Bellman-update the support and clamp to [v_min, v_max].
            r_tau = torch.clamp(
                batch[Columns.REWARDS].unsqueeze(dim=-1)
                + (
                    config.gamma ** batch["n_step"]
                    * (1.0 - batch[Columns.TERMINATEDS].float())
                ).unsqueeze(dim=-1)
                * z,
                config.v_min,
                config.v_max,
            ).squeeze(dim=1)
            # (32, 10)
            # Fractional atom index of each updated support point.
            b = (r_tau - config.v_min) / (
                (config.v_max - config.v_min) / float(config.num_atoms - 1.0)
            )
            lower_bound = torch.floor(b)
            upper_bound = torch.ceil(b)

            # 1.0 where `b` falls exactly on an atom (floor == ceil), so that
            # the full mass is assigned to that atom instead of zero.
            floor_equal_ceil = ((upper_bound - lower_bound) < 0.5).float()

            # (B, num_atoms, num_atoms).
            lower_projection = nn.functional.one_hot(
                lower_bound.long(), config.num_atoms
            )
            upper_projection = nn.functional.one_hot(
                upper_bound.long(), config.num_atoms
            )
            # (32, 10)
            ml_delta = q_probs_next_best * (upper_bound - b + floor_equal_ceil)
            mu_delta = q_probs_next_best * (b - lower_bound)
            # (32, 10)
            ml_delta = torch.sum(lower_projection * ml_delta.unsqueeze(dim=-1), dim=1)
            mu_delta = torch.sum(upper_projection * mu_delta.unsqueeze(dim=-1), dim=1)
            # We do not want to propagate through the distributional targets.
            # (32, 10)
            m = (ml_delta + mu_delta).detach()

            # The Rainbow paper claims to use the KL-divergence loss. This is identical
            # to using the cross-entropy (differs only by entropy which is constant)
            # when optimizing by the gradient (the gradient is identical).
            td_error = nn.CrossEntropyLoss(reduction="none")(q_logits_selected, m)
            # Compute the weighted loss (importance sampling weights).
            total_loss = torch.mean(batch["weights"] * td_error)
        else:
            # Masked all Q-values with terminated next states in the targets.
            q_next_best_masked = (
                1.0 - batch[Columns.TERMINATEDS].float()
            ) * q_next_best

            # Compute the RHS of the Bellman equation.
            # Detach this node from the computation graph as we do not want to
            # backpropagate through the target network when optimizing the Q loss.
            q_selected_target = (
                batch[Columns.REWARDS]
                + (config.gamma ** batch["n_step"]) * q_next_best_masked
            ).detach()

            # Choose the requested loss function. Note, in case of the Huber loss
            # we fall back to the default of `delta=1.0`.
            loss_fn = nn.HuberLoss if config.td_error_loss_fn == "huber" else nn.MSELoss
            # Compute the TD error.
            td_error = torch.abs(q_selected - q_selected_target)
            # Compute the weighted loss (importance sampling weights).
            total_loss = possibly_masked_mean(
                batch["weights"]
                * loss_fn(reduction="none")(q_selected, q_selected_target)
            )

        # Log the TD-error with reduce=None, such that - in case we have n parallel
        # Learners - we will re-concatenate the produced TD-error tensors to yield
        # a 1:1 representation of the original batch.
        self.metrics.log_value(
            key=(module_id, TD_ERROR_KEY),
            value=td_error,
            reduce=None,
            clear_on_reduce=True,
        )
        # Log other important loss stats (reduce=mean (default), but with window=1
        # in order to keep them history free).
        self.metrics.log_dict(
            {
                QF_LOSS_KEY: total_loss,
                QF_MEAN_KEY: possibly_masked_mean(q_selected),
                QF_MAX_KEY: possibly_masked_max(q_selected),
                QF_MIN_KEY: possibly_masked_min(q_selected),
                TD_ERROR_MEAN_KEY: possibly_masked_mean(td_error),
            },
            key=module_id,
            window=1,  # <- single items (should not be mean/ema-reduced over time).
        )
        # If we learn a Q-value distribution store the support and average
        # probabilities.
        if config.num_atoms > 1:
            # Log important loss stats.
            self.metrics.log_dict(
                {
                    ATOMS: z,
                    # The absolute difference in expectation between the actions
                    # should (at least mildly) rise.
                    "expectations_abs_diff": torch.mean(
                        torch.abs(
                            torch.diff(
                                torch.sum(fwd_out[QF_PROBS].mean(dim=0) * z, dim=1)
                            ).mean(dim=0)
                        )
                    ),
                    # The total variation distance should measure the distance between
                    # return distributions of different actions. This should (at least
                    # mildly) increase during training when the agent differentiates
                    # more between actions.
                    "dist_total_variation_dist": torch.diff(
                        fwd_out[QF_PROBS].mean(dim=0), dim=0
                    )
                    .abs()
                    .sum()
                    * 0.5,
                    # The maximum distance between the action distributions. This metric
                    # should increase over the course of training.
                    "dist_max_abs_distance": torch.max(
                        torch.diff(fwd_out[QF_PROBS].mean(dim=0), dim=0).abs()
                    ),
                    # Mean shannon entropy of action distributions. This should decrease
                    # over the course of training.
                    "action_dist_mean_entropy": torch.mean(
                        (
                            fwd_out[QF_PROBS].mean(dim=0)
                            * torch.log(fwd_out[QF_PROBS].mean(dim=0))
                        ).sum(dim=1),
                        dim=0,
                    ),
                },
                key=module_id,
                window=1,  # <- single items (should not be mean/ema-reduced over time).
            )

        return total_loss
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/marwil/__init__.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from ray.rllib.algorithms.marwil.marwil import (
|
| 2 |
+
MARWIL,
|
| 3 |
+
MARWILConfig,
|
| 4 |
+
)
|
| 5 |
+
from ray.rllib.algorithms.marwil.marwil_tf_policy import (
|
| 6 |
+
MARWILTF1Policy,
|
| 7 |
+
MARWILTF2Policy,
|
| 8 |
+
)
|
| 9 |
+
from ray.rllib.algorithms.marwil.marwil_torch_policy import MARWILTorchPolicy
|
| 10 |
+
|
| 11 |
+
# Public API of the `ray.rllib.algorithms.marwil` package. Names below the
# `@OldAPIStack` marker are legacy old-API-stack policy classes.
__all__ = [
    "MARWIL",
    "MARWILConfig",
    # @OldAPIStack
    "MARWILTF1Policy",
    "MARWILTF2Policy",
    "MARWILTorchPolicy",
]
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/marwil/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (639 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/marwil/__pycache__/marwil.cpython-311.pyc
ADDED
|
Binary file (21.5 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/marwil/__pycache__/marwil_learner.cpython-311.pyc
ADDED
|
Binary file (2.96 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/marwil/__pycache__/marwil_tf_policy.cpython-311.pyc
ADDED
|
Binary file (10.5 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/marwil/__pycache__/marwil_torch_policy.cpython-311.pyc
ADDED
|
Binary file (6.86 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/marwil/marwil.py
ADDED
|
@@ -0,0 +1,540 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Callable, Optional, Type, Union
|
| 2 |
+
|
| 3 |
+
from ray.rllib.algorithms.algorithm import Algorithm
|
| 4 |
+
from ray.rllib.algorithms.algorithm_config import AlgorithmConfig, NotProvided
|
| 5 |
+
from ray.rllib.connectors.learner import (
|
| 6 |
+
AddObservationsFromEpisodesToBatch,
|
| 7 |
+
AddOneTsToEpisodesAndTruncate,
|
| 8 |
+
AddNextObservationsFromEpisodesToTrainBatch,
|
| 9 |
+
GeneralAdvantageEstimation,
|
| 10 |
+
)
|
| 11 |
+
from ray.rllib.core.learner.learner import Learner
|
| 12 |
+
from ray.rllib.core.rl_module.rl_module import RLModuleSpec
|
| 13 |
+
from ray.rllib.execution.rollout_ops import (
|
| 14 |
+
synchronous_parallel_sample,
|
| 15 |
+
)
|
| 16 |
+
from ray.rllib.execution.train_ops import (
|
| 17 |
+
multi_gpu_train_one_step,
|
| 18 |
+
train_one_step,
|
| 19 |
+
)
|
| 20 |
+
from ray.rllib.policy.policy import Policy
|
| 21 |
+
from ray.rllib.utils.annotations import OldAPIStack, override
|
| 22 |
+
from ray.rllib.utils.deprecation import deprecation_warning
|
| 23 |
+
from ray.rllib.utils.metrics import (
|
| 24 |
+
ALL_MODULES,
|
| 25 |
+
LEARNER_RESULTS,
|
| 26 |
+
LEARNER_UPDATE_TIMER,
|
| 27 |
+
NUM_AGENT_STEPS_SAMPLED,
|
| 28 |
+
NUM_ENV_STEPS_SAMPLED,
|
| 29 |
+
OFFLINE_SAMPLING_TIMER,
|
| 30 |
+
SAMPLE_TIMER,
|
| 31 |
+
SYNCH_WORKER_WEIGHTS_TIMER,
|
| 32 |
+
TIMERS,
|
| 33 |
+
)
|
| 34 |
+
from ray.rllib.utils.typing import (
|
| 35 |
+
EnvType,
|
| 36 |
+
ResultDict,
|
| 37 |
+
RLModuleSpecType,
|
| 38 |
+
)
|
| 39 |
+
from ray.tune.logger import Logger
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
class MARWILConfig(AlgorithmConfig):
    """Defines a configuration class from which a MARWIL Algorithm can be built.

    .. testcode::

        import gymnasium as gym
        import numpy as np

        from pathlib import Path
        from ray.rllib.algorithms.marwil import MARWILConfig

        # Get the base path (to ray/rllib)
        base_path = Path(__file__).parents[2]
        # Get the path to the data in rllib folder.
        data_path = base_path / "tests/data/cartpole/cartpole-v1_large"

        config = MARWILConfig()
        # Enable the new API stack.
        config.api_stack(
            enable_rl_module_and_learner=True,
            enable_env_runner_and_connector_v2=True,
        )
        # Define the environment for which to learn a policy
        # from offline data.
        config.environment(
            observation_space=gym.spaces.Box(
                np.array([-4.8, -np.inf, -0.41887903, -np.inf]),
                np.array([4.8, np.inf, 0.41887903, np.inf]),
                shape=(4,),
                dtype=np.float32,
            ),
            action_space=gym.spaces.Discrete(2),
        )
        # Set the training parameters.
        config.training(
            beta=1.0,
            lr=1e-5,
            gamma=0.99,
            # We must define a train batch size for each
            # learner (here 1 local learner).
            train_batch_size_per_learner=2000,
        )
        # Define the data source for offline data.
        config.offline_data(
            input_=[data_path.as_posix()],
            # Run exactly one update per training iteration.
            dataset_num_iters_per_learner=1,
        )

        # Build an `Algorithm` object from the config and run 1 training
        # iteration.
        algo = config.build()
        algo.train()

    .. testcode::

        import gymnasium as gym
        import numpy as np

        from pathlib import Path
        from ray.rllib.algorithms.marwil import MARWILConfig
        from ray import train, tune

        # Get the base path (to ray/rllib)
        base_path = Path(__file__).parents[2]
        # Get the path to the data in rllib folder.
        data_path = base_path / "tests/data/cartpole/cartpole-v1_large"

        config = MARWILConfig()
        # Enable the new API stack.
        config.api_stack(
            enable_rl_module_and_learner=True,
            enable_env_runner_and_connector_v2=True,
        )
        # Print out some default values
        print(f"beta: {config.beta}")
        # Update the config object.
        config.training(
            lr=tune.grid_search([1e-3, 1e-4]),
            beta=0.75,
            # We must define a train batch size for each
            # learner (here 1 local learner).
            train_batch_size_per_learner=2000,
        )
        # Set the config's data path.
        config.offline_data(
            input_=[data_path.as_posix()],
            # Set the number of updates to be run per learner
            # per training step.
            dataset_num_iters_per_learner=1,
        )
        # Set the config's environment for evaluation.
        config.environment(
            observation_space=gym.spaces.Box(
                np.array([-4.8, -np.inf, -0.41887903, -np.inf]),
                np.array([4.8, np.inf, 0.41887903, np.inf]),
                shape=(4,),
                dtype=np.float32,
            ),
            action_space=gym.spaces.Discrete(2),
        )
        # Set up a tuner to run the experiment.
        tuner = tune.Tuner(
            "MARWIL",
            param_space=config,
            run_config=train.RunConfig(
                stop={"training_iteration": 1},
            ),
        )
        # Run the experiment.
        tuner.fit()
    """

    def __init__(self, algo_class=None):
        """Initializes a MARWILConfig instance."""
        # NOTE: Set before the `super().__init__()` call so the base class sees
        # the MARWIL-specific exploration default.
        self.exploration_config = {
            # The Exploration class to use. In the simplest case, this is the name
            # (str) of any class present in the `rllib.utils.exploration` package.
            # You can also provide the python class directly or the full location
            # of your class (e.g. "ray.rllib.utils.exploration.epsilon_greedy.
            # EpsilonGreedy").
            "type": "StochasticSampling",
            # Add constructor kwargs here (if any).
        }

        super().__init__(algo_class=algo_class or MARWIL)

        # fmt: off
        # __sphinx_doc_begin__
        # MARWIL specific settings:
        self.beta = 1.0
        self.bc_logstd_coeff = 0.0
        self.moving_average_sqd_adv_norm_update_rate = 1e-8
        self.moving_average_sqd_adv_norm_start = 100.0
        self.vf_coeff = 1.0
        self.model["vf_share_layers"] = False
        self.grad_clip = None

        # Override some of AlgorithmConfig's default values with MARWIL-specific values.

        # You should override input_ to point to an offline dataset
        # (see algorithm.py and algorithm_config.py).
        # The dataset may have an arbitrary number of timesteps
        # (and even episodes) per line.
        # However, each line must only contain consecutive timesteps in
        # order for MARWIL to be able to calculate accumulated
        # discounted returns. It is ok, though, to have multiple episodes in
        # the same line.
        self.input_ = "sampler"
        self.postprocess_inputs = True
        self.lr = 1e-4
        self.lambda_ = 1.0
        self.train_batch_size = 2000

        # Materialize only the data in raw format, but not the mapped data b/c
        # MARWIL uses a connector to calculate values and therefore the module
        # needs to be updated frequently. This updating would not work if we
        # map the data once at the beginning.
        # TODO (simon, sven): The module is only updated when the OfflinePreLearner
        # gets reinitiated, i.e. when the iterator gets reinitiated. This happens
        # frequently enough with a small dataset, but with a big one this does not
        # update often enough. We might need to put model weights every couple of
        # iterations into the object storage (maybe also connector states).
        self.materialize_data = True
        self.materialize_mapped_data = False
        # __sphinx_doc_end__
        # fmt: on
        # Tracks whether the user explicitly configured OPE methods via
        # `evaluation()`; `build()` emits a deprecation warning otherwise.
        self._set_off_policy_estimation_methods = False

    @override(AlgorithmConfig)
    def training(
        self,
        *,
        beta: Optional[float] = NotProvided,
        bc_logstd_coeff: Optional[float] = NotProvided,
        moving_average_sqd_adv_norm_update_rate: Optional[float] = NotProvided,
        moving_average_sqd_adv_norm_start: Optional[float] = NotProvided,
        vf_coeff: Optional[float] = NotProvided,
        grad_clip: Optional[float] = NotProvided,
        **kwargs,
    ) -> "MARWILConfig":
        """Sets the training related configuration.

        Args:
            beta: Scaling of advantages in exponential terms. When beta is 0.0,
                MARWIL is reduced to behavior cloning (imitation learning);
                see bc.py algorithm in this same directory.
            bc_logstd_coeff: A coefficient to encourage higher action distribution
                entropy for exploration.
            moving_average_sqd_adv_norm_update_rate: The rate for updating the
                squared moving average advantage norm (c^2). A higher rate leads
                to faster updates of this moving average.
            moving_average_sqd_adv_norm_start: Starting value for the
                squared moving average advantage norm (c^2).
            vf_coeff: Balancing value estimation loss and policy optimization loss.
            grad_clip: If specified, clip the global norm of gradients by this amount.

        Returns:
            This updated AlgorithmConfig object.
        """
        # Pass kwargs onto super's `training()` method.
        super().training(**kwargs)
        if beta is not NotProvided:
            self.beta = beta
        if bc_logstd_coeff is not NotProvided:
            self.bc_logstd_coeff = bc_logstd_coeff
        if moving_average_sqd_adv_norm_update_rate is not NotProvided:
            self.moving_average_sqd_adv_norm_update_rate = (
                moving_average_sqd_adv_norm_update_rate
            )
        if moving_average_sqd_adv_norm_start is not NotProvided:
            self.moving_average_sqd_adv_norm_start = moving_average_sqd_adv_norm_start
        if vf_coeff is not NotProvided:
            self.vf_coeff = vf_coeff
        if grad_clip is not NotProvided:
            self.grad_clip = grad_clip
        return self

    @override(AlgorithmConfig)
    def get_default_rl_module_spec(self) -> RLModuleSpecType:
        """Returns the default RLModule spec (torch-only).

        Note: MARWIL reuses PPO's default torch RLModule here.
        """
        if self.framework_str == "torch":
            from ray.rllib.algorithms.ppo.torch.default_ppo_torch_rl_module import (
                DefaultPPOTorchRLModule,
            )

            return RLModuleSpec(module_class=DefaultPPOTorchRLModule)
        else:
            raise ValueError(
                f"The framework {self.framework_str} is not supported. "
                "Use 'torch' instead."
            )

    @override(AlgorithmConfig)
    def get_default_learner_class(self) -> Union[Type["Learner"], str]:
        """Returns the default Learner class (torch-only)."""
        if self.framework_str == "torch":
            from ray.rllib.algorithms.marwil.torch.marwil_torch_learner import (
                MARWILTorchLearner,
            )

            return MARWILTorchLearner
        else:
            raise ValueError(
                f"The framework {self.framework_str} is not supported. "
                "Use 'torch' instead."
            )

    @override(AlgorithmConfig)
    def evaluation(
        self,
        **kwargs,
    ) -> "MARWILConfig":
        """Sets the evaluation related configuration.

        Returns:
            This updated AlgorithmConfig object.
        """
        # Pass kwargs onto super's `evaluation()` method.
        super().evaluation(**kwargs)

        if "off_policy_estimation_methods" in kwargs:
            # User specified their OPE methods.
            self._set_off_policy_estimation_methods = True

        return self

    @override(AlgorithmConfig)
    def offline_data(self, **kwargs) -> "MARWILConfig":
        """Sets the offline-data related configuration.

        Raises:
            ValueError: If a passed-in `prelearner_class` is not a subclass of
                `OfflinePreLearner`.

        Returns:
            This updated AlgorithmConfig object.
        """
        super().offline_data(**kwargs)

        # Check, if the passed in class incorporates the `OfflinePreLearner`
        # interface.
        if "prelearner_class" in kwargs:
            from ray.rllib.offline.offline_data import OfflinePreLearner

            if not issubclass(kwargs.get("prelearner_class"), OfflinePreLearner):
                raise ValueError(
                    f"`prelearner_class` {kwargs.get('prelearner_class')} is not a "
                    "subclass of `OfflinePreLearner`. Any class passed to "
                    "`prelearner_class` needs to implement the interface given by "
                    "`OfflinePreLearner`."
                )

        return self

    @override(AlgorithmConfig)
    def build(
        self,
        env: Optional[Union[str, EnvType]] = None,
        logger_creator: Optional[Callable[[], Logger]] = None,
    ) -> "Algorithm":
        """Builds the MARWIL Algorithm.

        Emits a (non-erroring) deprecation warning when the user never set
        off-policy estimation methods explicitly via `evaluation()`.
        """
        if not self._set_off_policy_estimation_methods:
            deprecation_warning(
                old=r"MARWIL used to have off_policy_estimation_methods "
                "is and wis by default. This has"
                r"changed to off_policy_estimation_methods: \{\}."
                "If you want to use an off-policy estimator, specify it in"
                ".evaluation(off_policy_estimation_methods=...)",
                error=False,
            )
        return super().build(env, logger_creator)

    @override(AlgorithmConfig)
    def build_learner_connector(
        self,
        input_observation_space,
        input_action_space,
        device=None,
    ):
        """Builds the learner connector pipeline, adding MARWIL-specific pieces."""
        pipeline = super().build_learner_connector(
            input_observation_space=input_observation_space,
            input_action_space=input_action_space,
            device=device,
        )

        # Before anything, add one ts to each episode (and record this in the loss
        # mask, so that the computations at this extra ts are not used to compute
        # the loss).
        pipeline.prepend(AddOneTsToEpisodesAndTruncate())

        # Prepend the "add-NEXT_OBS-from-episodes-to-train-batch" connector piece (right
        # after the corresponding "add-OBS-..." default piece).
        pipeline.insert_after(
            AddObservationsFromEpisodesToBatch,
            AddNextObservationsFromEpisodesToTrainBatch(),
        )

        # At the end of the pipeline (when the batch is already completed), add the
        # GAE connector, which performs a vf forward pass, then computes the GAE
        # computations, and puts the results of this (advantages, value targets)
        # directly back in the batch. This is then the batch used for
        # `forward_train` and `compute_losses`.
        pipeline.append(
            GeneralAdvantageEstimation(gamma=self.gamma, lambda_=self.lambda_)
        )

        return pipeline

    @override(AlgorithmConfig)
    def validate(self) -> None:
        """Validates MARWIL-specific settings (beta range, input postprocessing,
        local-learner iteration count)."""
        # Call super's validation method.
        super().validate()

        if self.beta < 0.0 or self.beta > 1.0:
            self._value_error("`beta` must be within 0.0 and 1.0!")

        if self.postprocess_inputs is False and self.beta > 0.0:
            self._value_error(
                "`postprocess_inputs` must be True for MARWIL (to "
                "calculate accum., discounted returns)! Try setting "
                "`config.offline_data(postprocess_inputs=True)`."
            )

        # Assert that for a local learner the number of iterations is 1. Note,
        # this is needed because we have no iterators, but instead a single
        # batch returned directly from the `OfflineData.sample` method.
        if (
            self.num_learners == 0
            and not self.dataset_num_iters_per_learner
            and self.enable_rl_module_and_learner
        ):
            self._value_error(
                "When using a local Learner (`config.num_learners=0`), the number of "
                "iterations per learner (`dataset_num_iters_per_learner`) has to be "
                "defined! Set this hyperparameter through `config.offline_data("
                "dataset_num_iters_per_learner=...)`."
            )

    @property
    def _model_auto_keys(self):
        # Extend the base auto-keys with MARWIL's `beta` and a non-shared
        # value-function default.
        return super()._model_auto_keys | {"beta": self.beta, "vf_share_layers": False}
|
| 412 |
+
|
| 413 |
+
|
| 414 |
+
class MARWIL(Algorithm):
    """The MARWIL Algorithm.

    Trains from offline data via `OfflineData` on the new API stack (see
    `training_step`), or via sampled batches on the old API stack (see
    `_training_step_old_api_stack`). With `beta=0.0` in the config, MARWIL
    reduces to behavior cloning (per `MARWILConfig.training` docs).
    """

    @classmethod
    @override(Algorithm)
    def get_default_config(cls) -> AlgorithmConfig:
        """Returns a fresh `MARWILConfig` as this algo's default config."""
        return MARWILConfig()

    @classmethod
    @override(Algorithm)
    def get_default_policy_class(
        cls, config: AlgorithmConfig
    ) -> Optional[Type[Policy]]:
        """Returns the old-API-stack Policy class matching `config["framework"]`.

        torch -> MARWILTorchPolicy; tf -> MARWILTF1Policy; anything else
        (e.g. tf2) -> MARWILTF2Policy.
        """
        if config["framework"] == "torch":
            from ray.rllib.algorithms.marwil.marwil_torch_policy import (
                MARWILTorchPolicy,
            )

            return MARWILTorchPolicy
        elif config["framework"] == "tf":
            from ray.rllib.algorithms.marwil.marwil_tf_policy import (
                MARWILTF1Policy,
            )

            return MARWILTF1Policy
        else:
            from ray.rllib.algorithms.marwil.marwil_tf_policy import MARWILTF2Policy

            return MARWILTF2Policy

    @override(Algorithm)
    def training_step(self) -> None:
        """Implements training logic for the new stack

        Note, this includes so far training with the `OfflineData`
        class (multi-/single-learner setup) and evaluation on
        `EnvRunner`s. Note further, evaluation on the dataset itself
        using estimators is not implemented, yet.
        """
        # Old API stack (Policy, RolloutWorker, Connector).
        # NOTE(review): despite the `-> None` annotation, this path returns the
        # old stack's ResultDict — callers on the old stack rely on it.
        if not self.config.enable_env_runner_and_connector_v2:
            return self._training_step_old_api_stack()

        # TODO (simon): Take care of sampler metrics: right
        # now all rewards are `nan`, which possibly confuses
        # the user that sth. is not right, although it is as
        # we do not step the env.
        with self.metrics.log_time((TIMERS, OFFLINE_SAMPLING_TIMER)):
            # Sampling from offline data.
            # With multiple (remote) learners, request an iterator so each
            # learner can stream its own shard.
            batch_or_iterator = self.offline_data.sample(
                num_samples=self.config.train_batch_size_per_learner,
                num_shards=self.config.num_learners,
                return_iterator=self.config.num_learners > 1,
            )

        with self.metrics.log_time((TIMERS, LEARNER_UPDATE_TIMER)):
            # Updating the policy.
            # TODO (simon, sven): Check, if we should execute directly s.th. like
            # `LearnerGroup.update_from_iterator()`.
            learner_results = self.learner_group._update(
                batch=batch_or_iterator,
                minibatch_size=self.config.train_batch_size_per_learner,
                num_iters=self.config.dataset_num_iters_per_learner,
                **self.offline_data.iter_batches_kwargs,
            )

            # Log training results.
            self.metrics.merge_and_log_n_dicts(learner_results, key=LEARNER_RESULTS)

        # Synchronize weights.
        # As the results contain for each policy the loss and in addition the
        # total loss over all policies is returned, this total loss has to be
        # removed.
        modules_to_update = set(learner_results[0].keys()) - {ALL_MODULES}

        if self.eval_env_runner_group:
            # Update weights - after learning on the local worker -
            # on all remote workers.
            with self.metrics.log_time((TIMERS, SYNCH_WORKER_WEIGHTS_TIMER)):
                self.eval_env_runner_group.sync_weights(
                    # Sync weights from learner_group to all EnvRunners.
                    from_worker_or_learner_group=self.learner_group,
                    policies=list(modules_to_update),
                    inference_only=True,
                )

    @OldAPIStack
    def _training_step_old_api_stack(self) -> ResultDict:
        """Implements training step for the old stack.

        Note, there is no hybrid stack anymore. If you need to use `RLModule`s,
        use the new api stack.
        """
        # Collect SampleBatches from sample workers.
        with self._timers[SAMPLE_TIMER]:
            train_batch = synchronous_parallel_sample(worker_set=self.env_runner_group)
            # Wrap as multi-agent batch under the (single) configured policy ID.
            train_batch = train_batch.as_multi_agent(
                module_id=list(self.config.policies)[0]
            )
        self._counters[NUM_AGENT_STEPS_SAMPLED] += train_batch.agent_steps()
        self._counters[NUM_ENV_STEPS_SAMPLED] += train_batch.env_steps()

        # Train.
        if self.config.simple_optimizer:
            train_results = train_one_step(self, train_batch)
        else:
            train_results = multi_gpu_train_one_step(self, train_batch)

        # TODO: Move training steps counter update outside of `train_one_step()` method.
        # # Update train step counters.
        # self._counters[NUM_ENV_STEPS_TRAINED] += train_batch.env_steps()
        # self._counters[NUM_AGENT_STEPS_TRAINED] += train_batch.agent_steps()

        global_vars = {
            "timestep": self._counters[NUM_AGENT_STEPS_SAMPLED],
        }

        # Update weights - after learning on the local worker - on all remote
        # workers (only those policies that were actually trained).
        if self.env_runner_group.num_remote_env_runners() > 0:
            with self._timers[SYNCH_WORKER_WEIGHTS_TIMER]:
                self.env_runner_group.sync_weights(
                    policies=list(train_results.keys()), global_vars=global_vars
                )

        # Update global vars on local worker as well.
        self.env_runner.set_global_vars(global_vars)

        return train_results
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/marwil/marwil_learner.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Dict, Optional
|
| 2 |
+
|
| 3 |
+
from ray.rllib.core.rl_module.apis import ValueFunctionAPI
|
| 4 |
+
from ray.rllib.core.learner.learner import Learner
|
| 5 |
+
from ray.rllib.utils.annotations import override
|
| 6 |
+
from ray.rllib.utils.lambda_defaultdict import LambdaDefaultDict
|
| 7 |
+
from ray.rllib.utils.typing import ModuleID, ShouldModuleBeUpdatedFn, TensorType
|
| 8 |
+
|
| 9 |
+
# Keys under which MARWIL learners report their stats in the results dict.
LEARNER_RESULTS_MOVING_AVG_SQD_ADV_NORM_KEY = "moving_avg_sqd_adv_norm"
LEARNER_RESULTS_VF_EXPLAINED_VAR_KEY = "vf_explained_variance"


# TODO (simon): Check, if the norm update should be done inside
# the Learner.
class MARWILLearner(Learner):
    """Framework-agnostic base Learner for MARWIL.

    Maintains a per-module moving average of the squared advantage norm
    (c^2), lazily initialized from each module's
    `moving_average_sqd_adv_norm_start` config value.
    """

    @override(Learner)
    def build(self) -> None:
        super().build()

        # Dict mapping module IDs to the respective moving averages of squared
        # advantages. Entries are created lazily on first access, as framework
        # tensor variables seeded with the module's configured start value.
        self.moving_avg_sqd_adv_norms_per_module: Dict[
            ModuleID, TensorType
        ] = LambdaDefaultDict(
            lambda module_id: self._get_tensor_variable(
                self.config.get_config_for_module(
                    module_id
                ).moving_average_sqd_adv_norm_start
            )
        )

    @override(Learner)
    def remove_module(
        self,
        module_id: ModuleID,
        *,
        new_should_module_be_updated: Optional[ShouldModuleBeUpdatedFn] = None,
    ) -> None:
        """Removes the module and drops its moving-average entry (if any)."""
        super().remove_module(
            module_id,
            new_should_module_be_updated=new_should_module_be_updated,
        )
        # In case of BC (beta==0.0 and this property never being used), the
        # lazily-created entry may not exist -> use `pop(..., None)` rather
        # than `del` to avoid a KeyError.
        self.moving_avg_sqd_adv_norms_per_module.pop(module_id, None)

    @classmethod
    @override(Learner)
    def rl_module_required_apis(cls) -> list[type]:
        # In order for a MARWILLearner to update an RLModule, it must implement the
        # following APIs:
        return [ValueFunctionAPI]
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/marwil/marwil_tf_policy.py
ADDED
|
@@ -0,0 +1,251 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
from typing import Any, Dict, List, Optional, Type, Union
|
| 3 |
+
|
| 4 |
+
from ray.rllib.evaluation.postprocessing import compute_advantages, Postprocessing
|
| 5 |
+
from ray.rllib.models.action_dist import ActionDistribution
|
| 6 |
+
from ray.rllib.models.modelv2 import ModelV2
|
| 7 |
+
from ray.rllib.models.tf.tf_action_dist import TFActionDistribution
|
| 8 |
+
from ray.rllib.policy.dynamic_tf_policy_v2 import DynamicTFPolicyV2
|
| 9 |
+
from ray.rllib.policy.eager_tf_policy_v2 import EagerTFPolicyV2
|
| 10 |
+
from ray.rllib.policy.policy import Policy
|
| 11 |
+
from ray.rllib.policy.sample_batch import SampleBatch
|
| 12 |
+
from ray.rllib.policy.tf_mixins import (
|
| 13 |
+
ValueNetworkMixin,
|
| 14 |
+
compute_gradients,
|
| 15 |
+
)
|
| 16 |
+
from ray.rllib.utils.annotations import override
|
| 17 |
+
from ray.rllib.utils.framework import try_import_tf, get_variable
|
| 18 |
+
from ray.rllib.utils.tf_utils import explained_variance
|
| 19 |
+
from ray.rllib.utils.typing import (
|
| 20 |
+
LocalOptimizer,
|
| 21 |
+
ModelGradients,
|
| 22 |
+
TensorType,
|
| 23 |
+
)
|
| 24 |
+
|
| 25 |
+
tf1, tf, tfv = try_import_tf()
|
| 26 |
+
|
| 27 |
+
logger = logging.getLogger(__name__)
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
class PostprocessAdvantages:
    """Marwil's custom trajectory post-processing mixin.

    Adds discounted cumulative returns (stored under the "advantages" key) to
    each sampled trajectory. Intended to be mixed into a Policy class; the
    `super().postprocess_trajectory(...)` call below resolves via MRO to that
    policy class.
    """

    def __init__(self):
        pass

    def postprocess_trajectory(
        self,
        sample_batch: SampleBatch,
        other_agent_batches: Optional[Dict[Any, SampleBatch]] = None,
        episode=None,
    ):
        """Returns `sample_batch` extended with MARWIL's "advantages" column.

        Args:
            sample_batch: The (single-agent) trajectory batch to post-process.
            other_agent_batches: Optional batches of other agents (forwarded
                unchanged to the base policy's postprocessing).
            episode: Optional episode object (forwarded unchanged).
        """
        sample_batch = super().postprocess_trajectory(
            sample_batch, other_agent_batches, episode
        )

        # Trajectory is actually complete -> last r=0.0.
        if sample_batch[SampleBatch.TERMINATEDS][-1]:
            last_r = 0.0
        # Trajectory has been truncated -> last r=VF estimate of last obs.
        else:
            # Input dict is provided to us automatically via the Model's
            # requirements. It's a single-timestep (last one in trajectory)
            # input_dict.
            # Create an input dict according to the Model's requirements.
            index = "last" if SampleBatch.NEXT_OBS in sample_batch else -1
            input_dict = sample_batch.get_single_step_input_dict(
                self.view_requirements, index=index
            )
            last_r = self._value(**input_dict)

        # Adds the "advantages" (which in the case of MARWIL are simply the
        # discounted cumulative rewards) to the SampleBatch.
        return compute_advantages(
            sample_batch,
            last_r,
            self.config["gamma"],
            # We just want the discounted cumulative rewards, so we won't need
            # GAE nor critic (use_critic=True: Subtract vf-estimates from returns).
            use_gae=False,
            use_critic=False,
        )
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
class MARWILLoss:
    """Computes the MARWIL loss terms for a given train batch.

    Exposes `p_loss` (policy), `v_loss` (value function), `total_loss`,
    and — when beta != 0 — `explained_variance`.
    """

    def __init__(
        self,
        policy: Policy,
        value_estimates: TensorType,
        action_dist: ActionDistribution,
        train_batch: SampleBatch,
        vf_loss_coeff: float,
        beta: float,
    ):
        # Policy objective: L = - A * log\pi_\theta(a|s)
        log_probs = action_dist.logp(train_batch[SampleBatch.ACTIONS])

        if beta != 0.0:
            cumulative_rewards = train_batch[Postprocessing.ADVANTAGES]
            # Advantage estimation.
            advantages = cumulative_rewards - value_estimates
            adv_squared = tf.reduce_mean(tf.math.square(advantages))
            # Value function loss term (MSE).
            self.v_loss = 0.5 * adv_squared

            # Moving-average update rate for advantage^2 ("c^2" in the paper).
            rate = policy.config["moving_average_sqd_adv_norm_update_rate"]

            if policy.config["framework"] == "tf2":
                # Eager mode: update the averaged advantage norm in place.
                delta = adv_squared - policy._moving_average_sqd_adv_norm
                policy._moving_average_sqd_adv_norm.assign_add(rate * delta)

                # Exponentially weighted advantages.
                norm = tf.math.sqrt(policy._moving_average_sqd_adv_norm)
                exp_advs = tf.math.exp(beta * (advantages / (1e-8 + norm)))
            else:
                # Static graph: update via an assign-add op and make the
                # exponentiation depend on it.
                update_adv_norm = tf1.assign_add(
                    ref=policy._moving_average_sqd_adv_norm,
                    value=rate
                    * (adv_squared - policy._moving_average_sqd_adv_norm),
                )

                with tf1.control_dependencies([update_adv_norm]):
                    exp_advs = tf.math.exp(
                        beta
                        * tf.math.divide(
                            advantages,
                            1e-8
                            + tf.math.sqrt(
                                policy._moving_average_sqd_adv_norm
                            ),
                        )
                    )
            # The weights act as constants w.r.t. the policy gradient.
            exp_advs = tf.stop_gradient(exp_advs)

            self.explained_variance = tf.reduce_mean(
                explained_variance(cumulative_rewards, value_estimates)
            )
        else:
            # Pure BC: no value-function loss, uniform advantage weights.
            self.v_loss = tf.constant(0.0)
            exp_advs = 1.0

        # The logprob loss alone tends to collapse the action distribution
        # to very low entropy, hurting performance in unfamiliar states.
        # A scaled log-std term encourages stochasticity, alleviating this
        # to some extent.
        logstd_coeff = policy.config["bc_logstd_coeff"]
        if logstd_coeff > 0.0:
            log_stds = tf.reduce_sum(action_dist.log_std, axis=1)
        else:
            log_stds = 0.0

        self.p_loss = -1.0 * tf.reduce_mean(
            exp_advs * (log_probs + logstd_coeff * log_stds)
        )

        self.total_loss = self.p_loss + vf_loss_coeff * self.v_loss
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
# We need this builder function because we want to share the same
|
| 152 |
+
# custom logics between TF1 dynamic and TF2 eager policies.
|
| 153 |
+
# Builder function so the same custom logic can be shared between the
# TF1 dynamic-graph and TF2 eager policy bases.
def get_marwil_tf_policy(name: str, base: type) -> type:
    """Construct a MARWILTFPolicy inheriting either dynamic or eager base policies.

    Args:
        base: Base class for this policy. DynamicTFPolicyV2 or EagerTFPolicyV2.

    Returns:
        A TF Policy to be used with MAML.
    """

    class MARWILTFPolicy(ValueNetworkMixin, PostprocessAdvantages, base):
        def __init__(
            self,
            observation_space,
            action_space,
            config,
            existing_model=None,
            existing_inputs=None,
        ):
            # Must happen before anything else: switch on eager mode if
            # the base policy requires it.
            base.enable_eager_execution_if_necessary()

            base.__init__(
                self,
                observation_space,
                action_space,
                config,
                existing_inputs=existing_inputs,
                existing_model=existing_model,
            )

            ValueNetworkMixin.__init__(self, config)
            PostprocessAdvantages.__init__(self)

            if config["beta"] != 0.0:
                # Set up a tf variable for the moving average of the squared
                # advantage norm ("c^2" in the paper). Created here so it
                # also works in eager mode. Not needed for pure BC (beta=0).
                self._moving_average_sqd_adv_norm = get_variable(
                    config["moving_average_sqd_adv_norm_start"],
                    framework="tf",
                    tf_name="moving_average_of_advantage_norm",
                    trainable=False,
                )

            # Note: this is a bit ugly, but loss and optimizer initialization
            # must happen after all the MixIns are initialized.
            self.maybe_initialize_optimizer_and_loss()

        @override(base)
        def loss(
            self,
            model: Union[ModelV2, "tf.keras.Model"],
            dist_class: Type[TFActionDistribution],
            train_batch: SampleBatch,
        ) -> Union[TensorType, List[TensorType]]:
            model_out, _ = model(train_batch)
            action_dist = dist_class(model_out, model)
            value_estimates = model.value_function()

            # Keep the loss object around for stats_fn().
            self._marwil_loss = MARWILLoss(
                self,
                value_estimates,
                action_dist,
                train_batch,
                self.config["vf_coeff"],
                self.config["beta"],
            )

            return self._marwil_loss.total_loss

        @override(base)
        def stats_fn(self, train_batch: SampleBatch) -> Dict[str, TensorType]:
            stats = {
                "policy_loss": self._marwil_loss.p_loss,
                "total_loss": self._marwil_loss.total_loss,
            }
            # VF-related stats only exist when beta != 0 (non-pure-BC).
            if self.config["beta"] != 0.0:
                stats["moving_average_sqd_adv_norm"] = (
                    self._moving_average_sqd_adv_norm
                )
                stats["vf_explained_var"] = self._marwil_loss.explained_variance
                stats["vf_loss"] = self._marwil_loss.v_loss

            return stats

        @override(base)
        def compute_gradients_fn(
            self, optimizer: LocalOptimizer, loss: TensorType
        ) -> ModelGradients:
            return compute_gradients(self, optimizer, loss)

    MARWILTFPolicy.__name__ = name
    MARWILTFPolicy.__qualname__ = name

    return MARWILTFPolicy


MARWILTF1Policy = get_marwil_tf_policy("MARWILTF1Policy", DynamicTFPolicyV2)
MARWILTF2Policy = get_marwil_tf_policy("MARWILTF2Policy", EagerTFPolicyV2)
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/marwil/marwil_torch_policy.py
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Dict, List, Type, Union
|
| 2 |
+
|
| 3 |
+
from ray.rllib.algorithms.marwil.marwil_tf_policy import PostprocessAdvantages
|
| 4 |
+
from ray.rllib.evaluation.postprocessing import Postprocessing
|
| 5 |
+
from ray.rllib.models.modelv2 import ModelV2
|
| 6 |
+
from ray.rllib.models.torch.torch_action_dist import TorchDistributionWrapper
|
| 7 |
+
from ray.rllib.policy.sample_batch import SampleBatch
|
| 8 |
+
from ray.rllib.policy.torch_mixins import ValueNetworkMixin
|
| 9 |
+
from ray.rllib.policy.torch_policy_v2 import TorchPolicyV2
|
| 10 |
+
from ray.rllib.utils.annotations import override
|
| 11 |
+
from ray.rllib.utils.framework import try_import_torch
|
| 12 |
+
from ray.rllib.utils.numpy import convert_to_numpy
|
| 13 |
+
from ray.rllib.utils.torch_utils import apply_grad_clipping, explained_variance
|
| 14 |
+
from ray.rllib.utils.typing import TensorType
|
| 15 |
+
|
| 16 |
+
torch, _ = try_import_torch()
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class MARWILTorchPolicy(ValueNetworkMixin, PostprocessAdvantages, TorchPolicyV2):
    """PyTorch policy class used with Marwil."""

    def __init__(self, observation_space, action_space, config):
        TorchPolicyV2.__init__(
            self,
            observation_space,
            action_space,
            config,
            max_seq_len=config["model"]["max_seq_len"],
        )

        ValueNetworkMixin.__init__(self, config)
        PostprocessAdvantages.__init__(self)

        if config["beta"] != 0.0:
            # Torch variable holding the moving average of the squared
            # advantage norm ("c^2" in the paper). Not needed for pure BC.
            self._moving_average_sqd_adv_norm = torch.tensor(
                [config["moving_average_sqd_adv_norm_start"]],
                dtype=torch.float32,
                requires_grad=False,
            ).to(self.device)

        # TODO: Don't require users to call this manually.
        self._initialize_loss_from_dummy_batch()

    @override(TorchPolicyV2)
    def loss(
        self,
        model: ModelV2,
        dist_class: Type[TorchDistributionWrapper],
        train_batch: SampleBatch,
    ) -> Union[TensorType, List[TensorType]]:
        model_out, _ = model(train_batch)
        action_dist = dist_class(model_out, model)
        actions = train_batch[SampleBatch.ACTIONS]
        # log\pi_\theta(a|s)
        log_probs = action_dist.logp(actions)

        if self.config["beta"] != 0.0:
            # Advantage estimation.
            cumulative_rewards = train_batch[Postprocessing.ADVANTAGES]
            state_values = model.value_function()
            advantages = cumulative_rewards - state_values
            adv_squared_mean = torch.mean(torch.pow(advantages, 2.0))

            model.tower_stats["explained_variance"] = torch.mean(
                explained_variance(cumulative_rewards, state_values)
            )

            # Policy loss: first update the averaged advantage norm.
            rate = self.config["moving_average_sqd_adv_norm_update_rate"]
            self._moving_average_sqd_adv_norm = (
                rate
                * (adv_squared_mean.detach() - self._moving_average_sqd_adv_norm)
                + self._moving_average_sqd_adv_norm
            )
            model.tower_stats[
                "_moving_average_sqd_adv_norm"
            ] = self._moving_average_sqd_adv_norm
            # Exponentially weighted advantages (constants w.r.t. the
            # policy gradient, hence the detach()).
            exp_advs = torch.exp(
                self.config["beta"]
                * (
                    advantages
                    / (1e-8 + torch.pow(self._moving_average_sqd_adv_norm, 0.5))
                )
            ).detach()
            # Value loss (MSE).
            v_loss = 0.5 * adv_squared_mean
        else:
            # Pure BC: uniform advantage weights, no value loss.
            exp_advs = 1.0
            v_loss = 0.0
        model.tower_stats["v_loss"] = v_loss

        # The logprob loss alone tends to collapse the action distribution
        # to very low entropy, hurting performance in unfamiliar states.
        # A scaled log-std term encourages stochasticity, alleviating this
        # to some extent.
        logstd_coeff = self.config["bc_logstd_coeff"]
        if logstd_coeff > 0.0:
            log_stds = torch.mean(action_dist.log_std, dim=1)
        else:
            log_stds = 0.0

        p_loss = -torch.mean(exp_advs * (log_probs + logstd_coeff * log_stds))
        model.tower_stats["p_loss"] = p_loss

        # Combine both losses.
        self.v_loss = v_loss
        self.p_loss = p_loss
        total_loss = p_loss + self.config["vf_coeff"] * v_loss
        model.tower_stats["total_loss"] = total_loss
        return total_loss

    @override(TorchPolicyV2)
    def stats_fn(self, train_batch: SampleBatch) -> Dict[str, TensorType]:
        stats = {
            "policy_loss": self.get_tower_stats("p_loss")[0].item(),
            "total_loss": self.get_tower_stats("total_loss")[0].item(),
        }
        # VF-related stats only exist when beta != 0 (non-pure-BC).
        if self.config["beta"] != 0.0:
            stats["moving_average_sqd_adv_norm"] = self.get_tower_stats(
                "_moving_average_sqd_adv_norm"
            )[0].item()
            stats["vf_explained_var"] = self.get_tower_stats(
                "explained_variance"
            )[0].item()
            stats["vf_loss"] = self.get_tower_stats("v_loss")[0].item()
        return convert_to_numpy(stats)

    def extra_grad_process(
        self, optimizer: "torch.optim.Optimizer", loss: TensorType
    ) -> Dict[str, TensorType]:
        return apply_grad_clipping(self, optimizer, loss)
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/marwil/torch/__init__.py
ADDED
|
File without changes
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/marwil/torch/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (206 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/marwil/torch/__pycache__/marwil_torch_learner.cpython-311.pyc
ADDED
|
Binary file (5.42 kB). View file
|
|
|