Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +1 -0
- .venv/lib/python3.11/site-packages/ray/_private/thirdparty/pynvml/__pycache__/pynvml.cpython-311.pyc +3 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/__init__.py +12 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/__pycache__/appo.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/__pycache__/appo_learner.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/__pycache__/appo_rl_module.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/__pycache__/appo_tf_policy.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/__pycache__/appo_torch_policy.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/__pycache__/default_appo_rl_module.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/__pycache__/utils.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/appo.py +434 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/appo_learner.py +147 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/appo_rl_module.py +11 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/appo_tf_policy.py +393 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/appo_torch_policy.py +412 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/default_appo_rl_module.py +59 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/torch/__init__.py +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/torch/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/torch/__pycache__/appo_torch_learner.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/torch/__pycache__/appo_torch_rl_module.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/torch/__pycache__/default_appo_torch_rl_module.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/torch/appo_torch_learner.py +234 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/torch/appo_torch_rl_module.py +13 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/torch/default_appo_torch_rl_module.py +10 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/utils.py +133 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/__pycache__/dreamerv3.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/__pycache__/dreamerv3_catalog.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/__pycache__/dreamerv3_learner.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/__pycache__/dreamerv3_rl_module.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/__pycache__/actor_network.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/__pycache__/critic_network.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/__pycache__/disagree_networks.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/__pycache__/dreamer_model.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/__pycache__/world_model.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__init__.py +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/cnn_atari.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/continue_predictor.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/conv_transpose_atari.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/dynamics_predictor.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/mlp.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/representation_layer.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/reward_predictor.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/reward_predictor_layer.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/sequence_model.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/vector_decoder.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/continue_predictor.py +94 -0
.gitattributes
CHANGED
|
@@ -175,3 +175,4 @@ tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/_
|
|
| 175 |
.venv/lib/python3.11/site-packages/ray/jars/ray_dist.jar filter=lfs diff=lfs merge=lfs -text
|
| 176 |
.venv/lib/python3.11/site-packages/ray/_private/__pycache__/worker.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
|
| 177 |
.venv/lib/python3.11/site-packages/ray/_private/__pycache__/test_utils.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 175 |
.venv/lib/python3.11/site-packages/ray/jars/ray_dist.jar filter=lfs diff=lfs merge=lfs -text
|
| 176 |
.venv/lib/python3.11/site-packages/ray/_private/__pycache__/worker.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
|
| 177 |
.venv/lib/python3.11/site-packages/ray/_private/__pycache__/test_utils.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
|
| 178 |
+
.venv/lib/python3.11/site-packages/ray/_private/thirdparty/pynvml/__pycache__/pynvml.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
|
.venv/lib/python3.11/site-packages/ray/_private/thirdparty/pynvml/__pycache__/pynvml.cpython-311.pyc
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ebb34d8a5e73fa6657fb50dde3c5afc10ca55bef89431f9fbe15555295f4da0e
|
| 3 |
+
size 168124
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/__init__.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from ray.rllib.algorithms.appo.appo import APPO, APPOConfig
|
| 2 |
+
from ray.rllib.algorithms.appo.appo_tf_policy import APPOTF1Policy, APPOTF2Policy
|
| 3 |
+
from ray.rllib.algorithms.appo.appo_torch_policy import APPOTorchPolicy
|
| 4 |
+
|
| 5 |
+
__all__ = [
|
| 6 |
+
"APPO",
|
| 7 |
+
"APPOConfig",
|
| 8 |
+
# @OldAPIStack
|
| 9 |
+
"APPOTF1Policy",
|
| 10 |
+
"APPOTF2Policy",
|
| 11 |
+
"APPOTorchPolicy",
|
| 12 |
+
]
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (580 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/__pycache__/appo.cpython-311.pyc
ADDED
|
Binary file (18.7 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/__pycache__/appo_learner.cpython-311.pyc
ADDED
|
Binary file (8.25 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/__pycache__/appo_rl_module.cpython-311.pyc
ADDED
|
Binary file (637 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/__pycache__/appo_tf_policy.cpython-311.pyc
ADDED
|
Binary file (17 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/__pycache__/appo_torch_policy.cpython-311.pyc
ADDED
|
Binary file (19.4 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/__pycache__/default_appo_rl_module.cpython-311.pyc
ADDED
|
Binary file (3.85 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/__pycache__/utils.cpython-311.pyc
ADDED
|
Binary file (5.31 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/appo.py
ADDED
|
@@ -0,0 +1,434 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Asynchronous Proximal Policy Optimization (APPO)
|
| 2 |
+
|
| 3 |
+
The algorithm is described in [1] (under the name of "IMPACT"):
|
| 4 |
+
|
| 5 |
+
Detailed documentation:
|
| 6 |
+
https://docs.ray.io/en/master/rllib-algorithms.html#appo
|
| 7 |
+
|
| 8 |
+
[1] IMPACT: Importance Weighted Asynchronous Architectures with Clipped Target Networks.
|
| 9 |
+
Luo et al. 2020
|
| 10 |
+
https://arxiv.org/pdf/1912.00167
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
from typing import Optional, Type
|
| 14 |
+
import logging
|
| 15 |
+
|
| 16 |
+
from ray.rllib.algorithms.algorithm_config import AlgorithmConfig, NotProvided
|
| 17 |
+
from ray.rllib.algorithms.impala.impala import IMPALA, IMPALAConfig
|
| 18 |
+
from ray.rllib.core.rl_module.rl_module import RLModuleSpec
|
| 19 |
+
from ray.rllib.policy.policy import Policy
|
| 20 |
+
from ray.rllib.utils.annotations import override
|
| 21 |
+
from ray.rllib.utils.deprecation import DEPRECATED_VALUE, deprecation_warning
|
| 22 |
+
from ray.rllib.utils.metrics import (
|
| 23 |
+
LAST_TARGET_UPDATE_TS,
|
| 24 |
+
NUM_AGENT_STEPS_SAMPLED,
|
| 25 |
+
NUM_ENV_STEPS_SAMPLED,
|
| 26 |
+
NUM_TARGET_UPDATES,
|
| 27 |
+
)
|
| 28 |
+
from ray.rllib.utils.metrics import LEARNER_STATS_KEY
|
| 29 |
+
|
| 30 |
+
logger = logging.getLogger(__name__)
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
LEARNER_RESULTS_KL_KEY = "mean_kl_loss"
|
| 34 |
+
LEARNER_RESULTS_CURR_KL_COEFF_KEY = "curr_kl_coeff"
|
| 35 |
+
OLD_ACTION_DIST_KEY = "old_action_dist"
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
class APPOConfig(IMPALAConfig):
|
| 39 |
+
"""Defines a configuration class from which an APPO Algorithm can be built.
|
| 40 |
+
|
| 41 |
+
.. testcode::
|
| 42 |
+
|
| 43 |
+
from ray.rllib.algorithms.appo import APPOConfig
|
| 44 |
+
config = (
|
| 45 |
+
APPOConfig()
|
| 46 |
+
.training(lr=0.01, grad_clip=30.0, train_batch_size_per_learner=50)
|
| 47 |
+
)
|
| 48 |
+
config = config.learners(num_learners=1)
|
| 49 |
+
config = config.env_runners(num_env_runners=1)
|
| 50 |
+
config = config.environment("CartPole-v1")
|
| 51 |
+
|
| 52 |
+
# Build an Algorithm object from the config and run 1 training iteration.
|
| 53 |
+
algo = config.build()
|
| 54 |
+
algo.train()
|
| 55 |
+
del algo
|
| 56 |
+
|
| 57 |
+
.. testcode::
|
| 58 |
+
|
| 59 |
+
from ray.rllib.algorithms.appo import APPOConfig
|
| 60 |
+
from ray import air
|
| 61 |
+
from ray import tune
|
| 62 |
+
|
| 63 |
+
config = APPOConfig()
|
| 64 |
+
# Update the config object.
|
| 65 |
+
config = config.training(lr=tune.grid_search([0.001,]))
|
| 66 |
+
# Set the config object's env.
|
| 67 |
+
config = config.environment(env="CartPole-v1")
|
| 68 |
+
# Use to_dict() to get the old-style python config dict when running with tune.
|
| 69 |
+
tune.Tuner(
|
| 70 |
+
"APPO",
|
| 71 |
+
run_config=air.RunConfig(
|
| 72 |
+
stop={"training_iteration": 1},
|
| 73 |
+
verbose=0,
|
| 74 |
+
),
|
| 75 |
+
param_space=config.to_dict(),
|
| 76 |
+
|
| 77 |
+
).fit()
|
| 78 |
+
|
| 79 |
+
.. testoutput::
|
| 80 |
+
:hide:
|
| 81 |
+
|
| 82 |
+
...
|
| 83 |
+
"""
|
| 84 |
+
|
| 85 |
+
def __init__(self, algo_class=None):
|
| 86 |
+
"""Initializes a APPOConfig instance."""
|
| 87 |
+
self.exploration_config = {
|
| 88 |
+
# The Exploration class to use. In the simplest case, this is the name
|
| 89 |
+
# (str) of any class present in the `rllib.utils.exploration` package.
|
| 90 |
+
# You can also provide the python class directly or the full location
|
| 91 |
+
# of your class (e.g. "ray.rllib.utils.exploration.epsilon_greedy.
|
| 92 |
+
# EpsilonGreedy").
|
| 93 |
+
"type": "StochasticSampling",
|
| 94 |
+
# Add constructor kwargs here (if any).
|
| 95 |
+
}
|
| 96 |
+
|
| 97 |
+
super().__init__(algo_class=algo_class or APPO)
|
| 98 |
+
|
| 99 |
+
# fmt: off
|
| 100 |
+
# __sphinx_doc_begin__
|
| 101 |
+
# APPO specific settings:
|
| 102 |
+
self.vtrace = True
|
| 103 |
+
self.use_gae = True
|
| 104 |
+
self.lambda_ = 1.0
|
| 105 |
+
self.clip_param = 0.4
|
| 106 |
+
self.use_kl_loss = False
|
| 107 |
+
self.kl_coeff = 1.0
|
| 108 |
+
self.kl_target = 0.01
|
| 109 |
+
self.target_worker_clipping = 2.0
|
| 110 |
+
|
| 111 |
+
# Circular replay buffer settings.
|
| 112 |
+
# Used in [1] for discrete action tasks:
|
| 113 |
+
# `circular_buffer_num_batches=4` and `circular_buffer_iterations_per_batch=2`
|
| 114 |
+
# For cont. action tasks:
|
| 115 |
+
# `circular_buffer_num_batches=16` and `circular_buffer_iterations_per_batch=20`
|
| 116 |
+
self.circular_buffer_num_batches = 4
|
| 117 |
+
self.circular_buffer_iterations_per_batch = 2
|
| 118 |
+
|
| 119 |
+
# Override some of IMPALAConfig's default values with APPO-specific values.
|
| 120 |
+
self.num_env_runners = 2
|
| 121 |
+
self.target_network_update_freq = 2
|
| 122 |
+
self.broadcast_interval = 1
|
| 123 |
+
self.grad_clip = 40.0
|
| 124 |
+
# Note: Only when using enable_rl_module_and_learner=True can the clipping mode
|
| 125 |
+
# be configured by the user. On the old API stack, RLlib will always clip by
|
| 126 |
+
# global_norm, no matter the value of `grad_clip_by`.
|
| 127 |
+
self.grad_clip_by = "global_norm"
|
| 128 |
+
|
| 129 |
+
self.opt_type = "adam"
|
| 130 |
+
self.lr = 0.0005
|
| 131 |
+
self.decay = 0.99
|
| 132 |
+
self.momentum = 0.0
|
| 133 |
+
self.epsilon = 0.1
|
| 134 |
+
self.vf_loss_coeff = 0.5
|
| 135 |
+
self.entropy_coeff = 0.01
|
| 136 |
+
self.tau = 1.0
|
| 137 |
+
# __sphinx_doc_end__
|
| 138 |
+
# fmt: on
|
| 139 |
+
|
| 140 |
+
self.lr_schedule = None # @OldAPIStack
|
| 141 |
+
self.entropy_coeff_schedule = None # @OldAPIStack
|
| 142 |
+
self.num_gpus = 0 # @OldAPIStack
|
| 143 |
+
self.num_multi_gpu_tower_stacks = 1 # @OldAPIStack
|
| 144 |
+
self.minibatch_buffer_size = 1 # @OldAPIStack
|
| 145 |
+
self.replay_proportion = 0.0 # @OldAPIStack
|
| 146 |
+
self.replay_buffer_num_slots = 100 # @OldAPIStack
|
| 147 |
+
self.learner_queue_size = 16 # @OldAPIStack
|
| 148 |
+
self.learner_queue_timeout = 300 # @OldAPIStack
|
| 149 |
+
|
| 150 |
+
# Deprecated keys.
|
| 151 |
+
self.target_update_frequency = DEPRECATED_VALUE
|
| 152 |
+
self.use_critic = DEPRECATED_VALUE
|
| 153 |
+
|
| 154 |
+
@override(IMPALAConfig)
|
| 155 |
+
def training(
|
| 156 |
+
self,
|
| 157 |
+
*,
|
| 158 |
+
vtrace: Optional[bool] = NotProvided,
|
| 159 |
+
use_gae: Optional[bool] = NotProvided,
|
| 160 |
+
lambda_: Optional[float] = NotProvided,
|
| 161 |
+
clip_param: Optional[float] = NotProvided,
|
| 162 |
+
use_kl_loss: Optional[bool] = NotProvided,
|
| 163 |
+
kl_coeff: Optional[float] = NotProvided,
|
| 164 |
+
kl_target: Optional[float] = NotProvided,
|
| 165 |
+
target_network_update_freq: Optional[int] = NotProvided,
|
| 166 |
+
tau: Optional[float] = NotProvided,
|
| 167 |
+
target_worker_clipping: Optional[float] = NotProvided,
|
| 168 |
+
circular_buffer_num_batches: Optional[int] = NotProvided,
|
| 169 |
+
circular_buffer_iterations_per_batch: Optional[int] = NotProvided,
|
| 170 |
+
# Deprecated keys.
|
| 171 |
+
target_update_frequency=DEPRECATED_VALUE,
|
| 172 |
+
use_critic=DEPRECATED_VALUE,
|
| 173 |
+
**kwargs,
|
| 174 |
+
) -> "APPOConfig":
|
| 175 |
+
"""Sets the training related configuration.
|
| 176 |
+
|
| 177 |
+
Args:
|
| 178 |
+
vtrace: Whether to use V-trace weighted advantages. If false, PPO GAE
|
| 179 |
+
advantages will be used instead.
|
| 180 |
+
use_gae: If true, use the Generalized Advantage Estimator (GAE)
|
| 181 |
+
with a value function, see https://arxiv.org/pdf/1506.02438.pdf.
|
| 182 |
+
Only applies if vtrace=False.
|
| 183 |
+
lambda_: GAE (lambda) parameter.
|
| 184 |
+
clip_param: PPO surrogate slipping parameter.
|
| 185 |
+
use_kl_loss: Whether to use the KL-term in the loss function.
|
| 186 |
+
kl_coeff: Coefficient for weighting the KL-loss term.
|
| 187 |
+
kl_target: Target term for the KL-term to reach (via adjusting the
|
| 188 |
+
`kl_coeff` automatically).
|
| 189 |
+
target_network_update_freq: NOTE: This parameter is only applicable on
|
| 190 |
+
the new API stack. The frequency with which to update the target
|
| 191 |
+
policy network from the main trained policy network. The metric
|
| 192 |
+
used is `NUM_ENV_STEPS_TRAINED_LIFETIME` and the unit is `n` (see [1]
|
| 193 |
+
4.1.1), where: `n = [circular_buffer_num_batches (N)] *
|
| 194 |
+
[circular_buffer_iterations_per_batch (K)] * [train batch size]`
|
| 195 |
+
For example, if you set `target_network_update_freq=2`, and N=4, K=2,
|
| 196 |
+
and `train_batch_size_per_learner=500`, then the target net is updated
|
| 197 |
+
every 2*4*2*500=8000 trained env steps (every 16 batch updates on each
|
| 198 |
+
learner).
|
| 199 |
+
The authors in [1] suggests that this setting is robust to a range of
|
| 200 |
+
choices (try values between 0.125 and 4).
|
| 201 |
+
target_network_update_freq: The frequency to update the target policy and
|
| 202 |
+
tune the kl loss coefficients that are used during training. After
|
| 203 |
+
setting this parameter, the algorithm waits for at least
|
| 204 |
+
`target_network_update_freq` number of environment samples to be trained
|
| 205 |
+
on before updating the target networks and tune the kl loss
|
| 206 |
+
coefficients. NOTE: This parameter is only applicable when using the
|
| 207 |
+
Learner API (enable_rl_module_and_learner=True).
|
| 208 |
+
tau: The factor by which to update the target policy network towards
|
| 209 |
+
the current policy network. Can range between 0 and 1.
|
| 210 |
+
e.g. updated_param = tau * current_param + (1 - tau) * target_param
|
| 211 |
+
target_worker_clipping: The maximum value for the target-worker-clipping
|
| 212 |
+
used for computing the IS ratio, described in [1]
|
| 213 |
+
IS = min(π(i) / π(target), ρ) * (π / π(i))
|
| 214 |
+
circular_buffer_num_batches: The number of train batches that fit
|
| 215 |
+
into the circular buffer. Each such train batch can be sampled for
|
| 216 |
+
training max. `circular_buffer_iterations_per_batch` times.
|
| 217 |
+
circular_buffer_iterations_per_batch: The number of times any train
|
| 218 |
+
batch in the circular buffer can be sampled for training. A batch gets
|
| 219 |
+
evicted from the buffer either if it's the oldest batch in the buffer
|
| 220 |
+
and a new batch is added OR if the batch reaches this max. number of
|
| 221 |
+
being sampled.
|
| 222 |
+
|
| 223 |
+
Returns:
|
| 224 |
+
This updated AlgorithmConfig object.
|
| 225 |
+
"""
|
| 226 |
+
if target_update_frequency != DEPRECATED_VALUE:
|
| 227 |
+
deprecation_warning(
|
| 228 |
+
old="target_update_frequency",
|
| 229 |
+
new="target_network_update_freq",
|
| 230 |
+
error=True,
|
| 231 |
+
)
|
| 232 |
+
if use_critic != DEPRECATED_VALUE:
|
| 233 |
+
deprecation_warning(
|
| 234 |
+
old="use_critic",
|
| 235 |
+
help="`use_critic` no longer supported! APPO always uses a value "
|
| 236 |
+
"function (critic).",
|
| 237 |
+
error=True,
|
| 238 |
+
)
|
| 239 |
+
|
| 240 |
+
# Pass kwargs onto super's `training()` method.
|
| 241 |
+
super().training(**kwargs)
|
| 242 |
+
|
| 243 |
+
if vtrace is not NotProvided:
|
| 244 |
+
self.vtrace = vtrace
|
| 245 |
+
if use_gae is not NotProvided:
|
| 246 |
+
self.use_gae = use_gae
|
| 247 |
+
if lambda_ is not NotProvided:
|
| 248 |
+
self.lambda_ = lambda_
|
| 249 |
+
if clip_param is not NotProvided:
|
| 250 |
+
self.clip_param = clip_param
|
| 251 |
+
if use_kl_loss is not NotProvided:
|
| 252 |
+
self.use_kl_loss = use_kl_loss
|
| 253 |
+
if kl_coeff is not NotProvided:
|
| 254 |
+
self.kl_coeff = kl_coeff
|
| 255 |
+
if kl_target is not NotProvided:
|
| 256 |
+
self.kl_target = kl_target
|
| 257 |
+
if target_network_update_freq is not NotProvided:
|
| 258 |
+
self.target_network_update_freq = target_network_update_freq
|
| 259 |
+
if tau is not NotProvided:
|
| 260 |
+
self.tau = tau
|
| 261 |
+
if target_worker_clipping is not NotProvided:
|
| 262 |
+
self.target_worker_clipping = target_worker_clipping
|
| 263 |
+
if circular_buffer_num_batches is not NotProvided:
|
| 264 |
+
self.circular_buffer_num_batches = circular_buffer_num_batches
|
| 265 |
+
if circular_buffer_iterations_per_batch is not NotProvided:
|
| 266 |
+
self.circular_buffer_iterations_per_batch = (
|
| 267 |
+
circular_buffer_iterations_per_batch
|
| 268 |
+
)
|
| 269 |
+
|
| 270 |
+
return self
|
| 271 |
+
|
| 272 |
+
@override(IMPALAConfig)
|
| 273 |
+
def validate(self) -> None:
|
| 274 |
+
super().validate()
|
| 275 |
+
|
| 276 |
+
# On new API stack, circular buffer should be used, not `minibatch_buffer_size`.
|
| 277 |
+
if self.enable_rl_module_and_learner:
|
| 278 |
+
if self.minibatch_buffer_size != 1 or self.replay_proportion != 0.0:
|
| 279 |
+
self._value_error(
|
| 280 |
+
"`minibatch_buffer_size/replay_proportion` not valid on new API "
|
| 281 |
+
"stack with APPO! "
|
| 282 |
+
"Use `circular_buffer_num_batches` for the number of train batches "
|
| 283 |
+
"in the circular buffer. To change the maximum number of times "
|
| 284 |
+
"any batch may be sampled, set "
|
| 285 |
+
"`circular_buffer_iterations_per_batch`."
|
| 286 |
+
)
|
| 287 |
+
if self.num_multi_gpu_tower_stacks != 1:
|
| 288 |
+
self._value_error(
|
| 289 |
+
"`num_multi_gpu_tower_stacks` not supported on new API stack with "
|
| 290 |
+
"APPO! In order to train on multi-GPU, use "
|
| 291 |
+
"`config.learners(num_learners=[number of GPUs], "
|
| 292 |
+
"num_gpus_per_learner=1)`. To scale the throughput of batch-to-GPU-"
|
| 293 |
+
"pre-loading on each of your `Learners`, set "
|
| 294 |
+
"`num_gpu_loader_threads` to a higher number (recommended values: "
|
| 295 |
+
"1-8)."
|
| 296 |
+
)
|
| 297 |
+
if self.learner_queue_size != 16:
|
| 298 |
+
self._value_error(
|
| 299 |
+
"`learner_queue_size` not supported on new API stack with "
|
| 300 |
+
"APPO! In order set the size of the circular buffer (which acts as "
|
| 301 |
+
"a 'learner queue'), use "
|
| 302 |
+
"`config.training(circular_buffer_num_batches=..)`. To change the "
|
| 303 |
+
"maximum number of times any batch may be sampled, set "
|
| 304 |
+
"`config.training(circular_buffer_iterations_per_batch=..)`."
|
| 305 |
+
)
|
| 306 |
+
|
| 307 |
+
@override(IMPALAConfig)
|
| 308 |
+
def get_default_learner_class(self):
|
| 309 |
+
if self.framework_str == "torch":
|
| 310 |
+
from ray.rllib.algorithms.appo.torch.appo_torch_learner import (
|
| 311 |
+
APPOTorchLearner,
|
| 312 |
+
)
|
| 313 |
+
|
| 314 |
+
return APPOTorchLearner
|
| 315 |
+
elif self.framework_str in ["tf2", "tf"]:
|
| 316 |
+
raise ValueError(
|
| 317 |
+
"TensorFlow is no longer supported on the new API stack! "
|
| 318 |
+
"Use `framework='torch'`."
|
| 319 |
+
)
|
| 320 |
+
else:
|
| 321 |
+
raise ValueError(
|
| 322 |
+
f"The framework {self.framework_str} is not supported. "
|
| 323 |
+
"Use `framework='torch'`."
|
| 324 |
+
)
|
| 325 |
+
|
| 326 |
+
@override(IMPALAConfig)
|
| 327 |
+
def get_default_rl_module_spec(self) -> RLModuleSpec:
|
| 328 |
+
if self.framework_str == "torch":
|
| 329 |
+
from ray.rllib.algorithms.appo.torch.appo_torch_rl_module import (
|
| 330 |
+
APPOTorchRLModule as RLModule,
|
| 331 |
+
)
|
| 332 |
+
else:
|
| 333 |
+
raise ValueError(
|
| 334 |
+
f"The framework {self.framework_str} is not supported. "
|
| 335 |
+
"Use either 'torch' or 'tf2'."
|
| 336 |
+
)
|
| 337 |
+
|
| 338 |
+
return RLModuleSpec(module_class=RLModule)
|
| 339 |
+
|
| 340 |
+
@property
|
| 341 |
+
@override(AlgorithmConfig)
|
| 342 |
+
def _model_config_auto_includes(self):
|
| 343 |
+
return super()._model_config_auto_includes | {"vf_share_layers": False}
|
| 344 |
+
|
| 345 |
+
|
| 346 |
+
class APPO(IMPALA):
|
| 347 |
+
def __init__(self, config, *args, **kwargs):
|
| 348 |
+
"""Initializes an APPO instance."""
|
| 349 |
+
super().__init__(config, *args, **kwargs)
|
| 350 |
+
|
| 351 |
+
# After init: Initialize target net.
|
| 352 |
+
|
| 353 |
+
# TODO(avnishn): Does this need to happen in __init__? I think we can move it
|
| 354 |
+
# to setup()
|
| 355 |
+
if not self.config.enable_rl_module_and_learner:
|
| 356 |
+
self.env_runner.foreach_policy_to_train(lambda p, _: p.update_target())
|
| 357 |
+
|
| 358 |
+
@override(IMPALA)
|
| 359 |
+
def training_step(self) -> None:
|
| 360 |
+
if self.config.enable_rl_module_and_learner:
|
| 361 |
+
return super().training_step()
|
| 362 |
+
|
| 363 |
+
train_results = super().training_step()
|
| 364 |
+
# Update the target network and the KL coefficient for the APPO-loss.
|
| 365 |
+
# The target network update frequency is calculated automatically by the product
|
| 366 |
+
# of `num_epochs` setting (usually 1 for APPO) and `minibatch_buffer_size`.
|
| 367 |
+
last_update = self._counters[LAST_TARGET_UPDATE_TS]
|
| 368 |
+
cur_ts = self._counters[
|
| 369 |
+
(
|
| 370 |
+
NUM_AGENT_STEPS_SAMPLED
|
| 371 |
+
if self.config.count_steps_by == "agent_steps"
|
| 372 |
+
else NUM_ENV_STEPS_SAMPLED
|
| 373 |
+
)
|
| 374 |
+
]
|
| 375 |
+
target_update_freq = self.config.num_epochs * self.config.minibatch_buffer_size
|
| 376 |
+
if cur_ts - last_update > target_update_freq:
|
| 377 |
+
self._counters[NUM_TARGET_UPDATES] += 1
|
| 378 |
+
self._counters[LAST_TARGET_UPDATE_TS] = cur_ts
|
| 379 |
+
|
| 380 |
+
# Update our target network.
|
| 381 |
+
self.env_runner.foreach_policy_to_train(lambda p, _: p.update_target())
|
| 382 |
+
|
| 383 |
+
# Also update the KL-coefficient for the APPO loss, if necessary.
|
| 384 |
+
if self.config.use_kl_loss:
|
| 385 |
+
|
| 386 |
+
def update(pi, pi_id):
|
| 387 |
+
assert LEARNER_STATS_KEY not in train_results, (
|
| 388 |
+
"{} should be nested under policy id key".format(
|
| 389 |
+
LEARNER_STATS_KEY
|
| 390 |
+
),
|
| 391 |
+
train_results,
|
| 392 |
+
)
|
| 393 |
+
if pi_id in train_results:
|
| 394 |
+
kl = train_results[pi_id][LEARNER_STATS_KEY].get("kl")
|
| 395 |
+
assert kl is not None, (train_results, pi_id)
|
| 396 |
+
# Make the actual `Policy.update_kl()` call.
|
| 397 |
+
pi.update_kl(kl)
|
| 398 |
+
else:
|
| 399 |
+
logger.warning("No data for {}, not updating kl".format(pi_id))
|
| 400 |
+
|
| 401 |
+
# Update KL on all trainable policies within the local (trainer)
|
| 402 |
+
# Worker.
|
| 403 |
+
self.env_runner.foreach_policy_to_train(update)
|
| 404 |
+
|
| 405 |
+
return train_results
|
| 406 |
+
|
| 407 |
+
@classmethod
|
| 408 |
+
@override(IMPALA)
|
| 409 |
+
def get_default_config(cls) -> AlgorithmConfig:
|
| 410 |
+
return APPOConfig()
|
| 411 |
+
|
| 412 |
+
@classmethod
|
| 413 |
+
@override(IMPALA)
|
| 414 |
+
def get_default_policy_class(
|
| 415 |
+
cls, config: AlgorithmConfig
|
| 416 |
+
) -> Optional[Type[Policy]]:
|
| 417 |
+
if config["framework"] == "torch":
|
| 418 |
+
from ray.rllib.algorithms.appo.appo_torch_policy import APPOTorchPolicy
|
| 419 |
+
|
| 420 |
+
return APPOTorchPolicy
|
| 421 |
+
elif config["framework"] == "tf":
|
| 422 |
+
if config.enable_rl_module_and_learner:
|
| 423 |
+
raise ValueError(
|
| 424 |
+
"RLlib's RLModule and Learner API is not supported for"
|
| 425 |
+
" tf1. Use "
|
| 426 |
+
"framework='tf2' instead."
|
| 427 |
+
)
|
| 428 |
+
from ray.rllib.algorithms.appo.appo_tf_policy import APPOTF1Policy
|
| 429 |
+
|
| 430 |
+
return APPOTF1Policy
|
| 431 |
+
else:
|
| 432 |
+
from ray.rllib.algorithms.appo.appo_tf_policy import APPOTF2Policy
|
| 433 |
+
|
| 434 |
+
return APPOTF2Policy
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/appo_learner.py
ADDED
|
@@ -0,0 +1,147 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import abc
|
| 2 |
+
from typing import Any, Dict, Optional
|
| 3 |
+
|
| 4 |
+
from ray.rllib.algorithms.appo.appo import APPOConfig
|
| 5 |
+
from ray.rllib.algorithms.appo.utils import CircularBuffer
|
| 6 |
+
from ray.rllib.algorithms.impala.impala_learner import IMPALALearner
|
| 7 |
+
from ray.rllib.core.learner.learner import Learner
|
| 8 |
+
from ray.rllib.core.learner.utils import update_target_network
|
| 9 |
+
from ray.rllib.core.rl_module.apis import TargetNetworkAPI, ValueFunctionAPI
|
| 10 |
+
from ray.rllib.core.rl_module.multi_rl_module import MultiRLModuleSpec
|
| 11 |
+
from ray.rllib.core.rl_module.rl_module import RLModuleSpec
|
| 12 |
+
from ray.rllib.utils.annotations import override
|
| 13 |
+
from ray.rllib.utils.lambda_defaultdict import LambdaDefaultDict
|
| 14 |
+
from ray.rllib.utils.metrics import (
|
| 15 |
+
LAST_TARGET_UPDATE_TS,
|
| 16 |
+
NUM_ENV_STEPS_TRAINED_LIFETIME,
|
| 17 |
+
NUM_MODULE_STEPS_TRAINED,
|
| 18 |
+
NUM_TARGET_UPDATES,
|
| 19 |
+
)
|
| 20 |
+
from ray.rllib.utils.schedules.scheduler import Scheduler
|
| 21 |
+
from ray.rllib.utils.typing import ModuleID, ShouldModuleBeUpdatedFn
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
class APPOLearner(IMPALALearner):
    """Adds KL coeff updates via `after_gradient_based_update()` to IMPALA logic.

    Framework-specific subclasses must override `_update_module_kl_coeff()`.
    """

    @override(IMPALALearner)
    def build(self):
        # Replace the plain learner queue with APPO's circular buffer BEFORE
        # calling super().build(), so the IMPALA build logic picks it up.
        self._learner_thread_in_queue = CircularBuffer(
            num_batches=self.config.circular_buffer_num_batches,
            iterations_per_batch=self.config.circular_buffer_iterations_per_batch,
        )

        super().build()

        # Make target networks (only for modules implementing TargetNetworkAPI).
        self.module.foreach_module(
            lambda mid, mod: (
                mod.make_target_networks()
                if isinstance(mod, TargetNetworkAPI)
                else None
            )
        )

        # The current kl coefficients per module as (framework specific) tensor
        # variables, created lazily on first access per module_id.
        self.curr_kl_coeffs_per_module: LambdaDefaultDict[
            ModuleID, Scheduler
        ] = LambdaDefaultDict(
            lambda module_id: self._get_tensor_variable(
                self.config.get_config_for_module(module_id).kl_coeff
            )
        )

    @override(Learner)
    def add_module(
        self,
        *,
        module_id: ModuleID,
        module_spec: RLModuleSpec,
        config_overrides: Optional[Dict] = None,
        new_should_module_be_updated: Optional[ShouldModuleBeUpdatedFn] = None,
    ) -> MultiRLModuleSpec:
        """Adds a new RLModule and creates its target networks, if applicable."""
        marl_spec = super().add_module(
            module_id=module_id,
            module_spec=module_spec,
            config_overrides=config_overrides,
            new_should_module_be_updated=new_should_module_be_updated,
        )
        # Create target networks for added Module, if applicable.
        if isinstance(self.module[module_id].unwrapped(), TargetNetworkAPI):
            self.module[module_id].unwrapped().make_target_networks()
        return marl_spec

    @override(IMPALALearner)
    def remove_module(self, module_id: str) -> MultiRLModuleSpec:
        """Removes an RLModule and drops its (lazily created) KL coefficient."""
        marl_spec = super().remove_module(module_id)
        self.curr_kl_coeffs_per_module.pop(module_id)
        return marl_spec

    @override(Learner)
    def after_gradient_based_update(self, *, timesteps: Dict[str, Any]) -> None:
        """Updates the target networks and (optionally) the KL coefficients.

        Target nets are soft-updated (tau) once enough env steps have been
        trained since the last update; KL coeffs are adjusted for modules with
        `use_kl_loss=True` that trained at least one step.
        """
        super().after_gradient_based_update(timesteps=timesteps)

        # TODO (sven): Maybe we should have a `after_gradient_based_update`
        # method per module?
        curr_timestep = timesteps.get(NUM_ENV_STEPS_TRAINED_LIFETIME, 0)
        for module_id, module in self.module._rl_modules.items():
            config = self.config.get_config_for_module(module_id)

            last_update_ts_key = (module_id, LAST_TARGET_UPDATE_TS)
            # The effective update interval scales with how often each batch is
            # re-used by the circular buffer (num_batches * iterations_per_batch).
            if isinstance(module.unwrapped(), TargetNetworkAPI) and (
                curr_timestep - self.metrics.peek(last_update_ts_key, default=0)
                >= (
                    config.target_network_update_freq
                    * config.circular_buffer_num_batches
                    * config.circular_buffer_iterations_per_batch
                    * config.train_batch_size_per_learner
                )
            ):
                for (
                    main_net,
                    target_net,
                ) in module.unwrapped().get_target_network_pairs():
                    update_target_network(
                        main_net=main_net,
                        target_net=target_net,
                        tau=config.tau,
                    )
                # Increase lifetime target network update counter by one.
                self.metrics.log_value((module_id, NUM_TARGET_UPDATES), 1, reduce="sum")
                # Update the (single-value -> window=1) last updated timestep metric.
                self.metrics.log_value(last_update_ts_key, curr_timestep, window=1)

            if (
                config.use_kl_loss
                and self.metrics.peek((module_id, NUM_MODULE_STEPS_TRAINED), default=0)
                > 0
            ):
                self._update_module_kl_coeff(module_id=module_id, config=config)

    @classmethod
    @override(Learner)
    def rl_module_required_apis(cls) -> list[type]:
        # In order for an APPOLearner to update an RLModule, it must implement
        # the following APIs:
        return [TargetNetworkAPI, ValueFunctionAPI]

    @abc.abstractmethod
    def _update_module_kl_coeff(self, module_id: ModuleID, config: APPOConfig) -> None:
        """Dynamically update the KL loss coefficient of a single module.

        The update is completed using the mean KL divergence between the action
        distributions of the current policy and the old policy of each module.
        That action distribution is computed during the most recent update/call
        to `compute_loss`.

        Args:
            module_id: The module whose KL loss coefficient to update.
            config: The AlgorithmConfig specific to the given `module_id`.
        """


# Backward-compat alias for the old class name.
AppoLearner = APPOLearner
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/appo_rl_module.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Backward compat import.
# This module only re-exports `DefaultAPPORLModule` under its old name
# (`APPORLModule`) and emits a deprecation warning on import.
from ray.rllib.algorithms.appo.default_appo_rl_module import (  # noqa
    DefaultAPPORLModule as APPORLModule,
)
from ray.rllib.utils.deprecation import deprecation_warning

# Warn (without raising, error=False) anyone importing via the old path.
deprecation_warning(
    old="ray.rllib.algorithms.appo.appo_rl_module.APPORLModule",
    new="ray.rllib.algorithms.appo.default_appo_rl_module.DefaultAPPORLModule",
    error=False,
)
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/appo_tf_policy.py
ADDED
|
@@ -0,0 +1,393 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
TensorFlow policy class used for APPO.
|
| 3 |
+
|
| 4 |
+
Adapted from VTraceTFPolicy to use the PPO surrogate loss.
|
| 5 |
+
Keep in sync with changes to VTraceTFPolicy.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import numpy as np
|
| 9 |
+
import logging
|
| 10 |
+
import gymnasium as gym
|
| 11 |
+
from typing import Dict, List, Optional, Type, Union
|
| 12 |
+
|
| 13 |
+
from ray.rllib.algorithms.appo.utils import make_appo_models
|
| 14 |
+
from ray.rllib.algorithms.impala import vtrace_tf as vtrace
|
| 15 |
+
from ray.rllib.algorithms.impala.impala_tf_policy import (
|
| 16 |
+
_make_time_major,
|
| 17 |
+
VTraceClipGradients,
|
| 18 |
+
VTraceOptimizer,
|
| 19 |
+
)
|
| 20 |
+
from ray.rllib.evaluation.postprocessing import (
|
| 21 |
+
compute_bootstrap_value,
|
| 22 |
+
compute_gae_for_sample_batch,
|
| 23 |
+
Postprocessing,
|
| 24 |
+
)
|
| 25 |
+
from ray.rllib.models.tf.tf_action_dist import Categorical
|
| 26 |
+
from ray.rllib.policy.sample_batch import SampleBatch
|
| 27 |
+
from ray.rllib.policy.dynamic_tf_policy_v2 import DynamicTFPolicyV2
|
| 28 |
+
from ray.rllib.policy.eager_tf_policy_v2 import EagerTFPolicyV2
|
| 29 |
+
from ray.rllib.policy.tf_mixins import (
|
| 30 |
+
EntropyCoeffSchedule,
|
| 31 |
+
LearningRateSchedule,
|
| 32 |
+
KLCoeffMixin,
|
| 33 |
+
ValueNetworkMixin,
|
| 34 |
+
GradStatsMixin,
|
| 35 |
+
TargetNetworkMixin,
|
| 36 |
+
)
|
| 37 |
+
from ray.rllib.models.modelv2 import ModelV2
|
| 38 |
+
from ray.rllib.models.tf.tf_action_dist import TFActionDistribution
|
| 39 |
+
from ray.rllib.utils.annotations import (
|
| 40 |
+
override,
|
| 41 |
+
)
|
| 42 |
+
from ray.rllib.utils.framework import try_import_tf
|
| 43 |
+
from ray.rllib.utils.tf_utils import explained_variance
|
| 44 |
+
from ray.rllib.utils.typing import TensorType
|
| 45 |
+
|
| 46 |
+
tf1, tf, tfv = try_import_tf()
|
| 47 |
+
|
| 48 |
+
logger = logging.getLogger(__name__)
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
# TODO (sven): Deprecate once APPO and IMPALA fully on RLModules/Learner APIs.
|
| 52 |
+
# TODO (sven): Deprecate once APPO and IMPALA fully on RLModules/Learner APIs.
def get_appo_tf_policy(name: str, base: type) -> type:
    """Construct an APPOTFPolicy inheriting either dynamic or eager base policies.

    Args:
        name: The __name__/__qualname__ to assign to the built class.
        base: Base class for this policy. DynamicTFPolicyV2 or EagerTFPolicyV2.

    Returns:
        A TF Policy to be used with Impala.
    """

    class APPOTFPolicy(
        VTraceClipGradients,
        VTraceOptimizer,
        LearningRateSchedule,
        KLCoeffMixin,
        EntropyCoeffSchedule,
        ValueNetworkMixin,
        TargetNetworkMixin,
        GradStatsMixin,
        base,
    ):
        def __init__(
            self,
            observation_space,
            action_space,
            config,
            existing_model=None,
            existing_inputs=None,
        ):
            # First thing first, enable eager execution if necessary.
            base.enable_eager_execution_if_necessary()

            # Although this is a no-op, we call __init__ here to make it clear
            # that base.__init__ will use the make_model() call.
            VTraceClipGradients.__init__(self)
            VTraceOptimizer.__init__(self)

            # Initialize base class.
            base.__init__(
                self,
                observation_space,
                action_space,
                config,
                existing_inputs=existing_inputs,
                existing_model=existing_model,
            )

            # TF LearningRateSchedule depends on self.framework, so initialize
            # after base.__init__() is called.
            LearningRateSchedule.__init__(self, config["lr"], config["lr_schedule"])
            EntropyCoeffSchedule.__init__(
                self, config["entropy_coeff"], config["entropy_coeff_schedule"]
            )
            ValueNetworkMixin.__init__(self, config)
            KLCoeffMixin.__init__(self, config)

            GradStatsMixin.__init__(self)

            # Note: this is a bit ugly, but loss and optimizer initialization must
            # happen after all the MixIns are initialized.
            self.maybe_initialize_optimizer_and_loss()

            # Initiate TargetNetwork ops after loss initialization.
            TargetNetworkMixin.__init__(self)

        @override(base)
        def make_model(self) -> ModelV2:
            # Builds the APPO model (incl. its target network counterpart).
            return make_appo_models(self)

        @override(base)
        def loss(
            self,
            model: Union[ModelV2, "tf.keras.Model"],
            dist_class: Type[TFActionDistribution],
            train_batch: SampleBatch,
        ) -> Union[TensorType, List[TensorType]]:
            """Computes the APPO loss: PPO surrogate with optional V-trace.

            Depending on `config["vtrace"]`, advantages/value targets either
            come from V-trace (computed here from logits) or from the GAE
            columns added in `postprocess_trajectory`.
            """
            model_out, _ = model(train_batch)
            action_dist = dist_class(model_out, model)

            # Derive action-space structure: MultiDiscrete needs per-subspace
            # logit splits; Discrete/other use a single logit block.
            if isinstance(self.action_space, gym.spaces.Discrete):
                is_multidiscrete = False
                output_hidden_shape = [self.action_space.n]
            elif isinstance(self.action_space, gym.spaces.multi_discrete.MultiDiscrete):
                is_multidiscrete = True
                output_hidden_shape = self.action_space.nvec.astype(np.int32)
            else:
                is_multidiscrete = False
                output_hidden_shape = 1

            def make_time_major(*args, **kw):
                # Reshape [B*T, ...] batch-major data to time-major [T, B, ...].
                return _make_time_major(
                    self, train_batch.get(SampleBatch.SEQ_LENS), *args, **kw
                )

            actions = train_batch[SampleBatch.ACTIONS]
            dones = train_batch[SampleBatch.TERMINATEDS]
            rewards = train_batch[SampleBatch.REWARDS]
            behaviour_logits = train_batch[SampleBatch.ACTION_DIST_INPUTS]

            target_model_out, _ = self.target_model(train_batch)
            prev_action_dist = dist_class(behaviour_logits, self.model)
            values = self.model.value_function()
            values_time_major = make_time_major(values)
            bootstrap_values_time_major = make_time_major(
                train_batch[SampleBatch.VALUES_BOOTSTRAPPED]
            )
            bootstrap_value = bootstrap_values_time_major[-1]

            if self.is_recurrent():
                # Mask out invalid (padded) timesteps of the RNN sequences.
                max_seq_len = tf.reduce_max(train_batch[SampleBatch.SEQ_LENS])
                mask = tf.sequence_mask(train_batch[SampleBatch.SEQ_LENS], max_seq_len)
                mask = tf.reshape(mask, [-1])
                mask = make_time_major(mask)

                def reduce_mean_valid(t):
                    return tf.reduce_mean(tf.boolean_mask(t, mask))

            else:
                reduce_mean_valid = tf.reduce_mean

            if self.config["vtrace"]:
                logger.debug("Using V-Trace surrogate loss (vtrace=True)")

                # Prepare actions for loss.
                loss_actions = (
                    actions if is_multidiscrete else tf.expand_dims(actions, axis=1)
                )

                # "Old policy" = frozen target network (no gradients through it).
                old_policy_behaviour_logits = tf.stop_gradient(target_model_out)
                old_policy_action_dist = dist_class(old_policy_behaviour_logits, model)

                # Prepare KL for Loss
                mean_kl = make_time_major(old_policy_action_dist.multi_kl(action_dist))

                unpacked_behaviour_logits = tf.split(
                    behaviour_logits, output_hidden_shape, axis=1
                )
                unpacked_old_policy_behaviour_logits = tf.split(
                    old_policy_behaviour_logits, output_hidden_shape, axis=1
                )

                # Compute vtrace on the CPU for better perf.
                with tf.device("/cpu:0"):
                    vtrace_returns = vtrace.multi_from_logits(
                        behaviour_policy_logits=make_time_major(
                            unpacked_behaviour_logits
                        ),
                        target_policy_logits=make_time_major(
                            unpacked_old_policy_behaviour_logits
                        ),
                        actions=tf.unstack(make_time_major(loss_actions), axis=2),
                        discounts=tf.cast(
                            ~make_time_major(tf.cast(dones, tf.bool)),
                            tf.float32,
                        )
                        * self.config["gamma"],
                        rewards=make_time_major(rewards),
                        values=values_time_major,
                        bootstrap_value=bootstrap_value,
                        dist_class=Categorical if is_multidiscrete else dist_class,
                        model=model,
                        clip_rho_threshold=tf.cast(
                            self.config["vtrace_clip_rho_threshold"], tf.float32
                        ),
                        clip_pg_rho_threshold=tf.cast(
                            self.config["vtrace_clip_pg_rho_threshold"], tf.float32
                        ),
                    )

                actions_logp = make_time_major(action_dist.logp(actions))
                prev_actions_logp = make_time_major(prev_action_dist.logp(actions))
                old_policy_actions_logp = make_time_major(
                    old_policy_action_dist.logp(actions)
                )

                # Importance ratio between sampling policy and (frozen) old
                # policy, clipped to [0, 2] to bound the correction.
                is_ratio = tf.clip_by_value(
                    tf.math.exp(prev_actions_logp - old_policy_actions_logp), 0.0, 2.0
                )
                logp_ratio = is_ratio * tf.exp(actions_logp - prev_actions_logp)
                self._is_ratio = is_ratio

                advantages = vtrace_returns.pg_advantages
                # Standard PPO clipped-surrogate objective.
                surrogate_loss = tf.minimum(
                    advantages * logp_ratio,
                    advantages
                    * tf.clip_by_value(
                        logp_ratio,
                        1 - self.config["clip_param"],
                        1 + self.config["clip_param"],
                    ),
                )

                action_kl = (
                    tf.reduce_mean(mean_kl, axis=0) if is_multidiscrete else mean_kl
                )
                mean_kl_loss = reduce_mean_valid(action_kl)
                mean_policy_loss = -reduce_mean_valid(surrogate_loss)

                # The value function loss.
                value_targets = vtrace_returns.vs
                delta = values_time_major - value_targets
                mean_vf_loss = 0.5 * reduce_mean_valid(tf.math.square(delta))

                # The entropy loss.
                actions_entropy = make_time_major(action_dist.multi_entropy())
                mean_entropy = reduce_mean_valid(actions_entropy)

            else:
                logger.debug("Using PPO surrogate loss (vtrace=False)")

                # Prepare KL for Loss
                mean_kl = make_time_major(prev_action_dist.multi_kl(action_dist))

                logp_ratio = tf.math.exp(
                    make_time_major(action_dist.logp(actions))
                    - make_time_major(prev_action_dist.logp(actions))
                )

                advantages = make_time_major(train_batch[Postprocessing.ADVANTAGES])
                surrogate_loss = tf.minimum(
                    advantages * logp_ratio,
                    advantages
                    * tf.clip_by_value(
                        logp_ratio,
                        1 - self.config["clip_param"],
                        1 + self.config["clip_param"],
                    ),
                )

                action_kl = (
                    tf.reduce_mean(mean_kl, axis=0) if is_multidiscrete else mean_kl
                )
                mean_kl_loss = reduce_mean_valid(action_kl)
                mean_policy_loss = -reduce_mean_valid(surrogate_loss)

                # The value function loss.
                value_targets = make_time_major(
                    train_batch[Postprocessing.VALUE_TARGETS]
                )
                delta = values_time_major - value_targets
                mean_vf_loss = 0.5 * reduce_mean_valid(tf.math.square(delta))

                # The entropy loss.
                mean_entropy = reduce_mean_valid(
                    make_time_major(action_dist.multi_entropy())
                )

            # The summed weighted loss.
            total_loss = mean_policy_loss - mean_entropy * self.entropy_coeff
            # Optional KL loss.
            if self.config["use_kl_loss"]:
                total_loss += self.kl_coeff * mean_kl_loss
            # Optional vf loss (or in a separate term due to separate
            # optimizers/networks).
            loss_wo_vf = total_loss
            if not self.config["_separate_vf_optimizer"]:
                total_loss += mean_vf_loss * self.config["vf_loss_coeff"]

            # Store stats in policy for stats_fn.
            self._total_loss = total_loss
            self._loss_wo_vf = loss_wo_vf
            self._mean_policy_loss = mean_policy_loss
            # Backward compatibility: Deprecate policy._mean_kl.
            self._mean_kl_loss = self._mean_kl = mean_kl_loss
            self._mean_vf_loss = mean_vf_loss
            self._mean_entropy = mean_entropy
            self._value_targets = value_targets

            # Return one total loss or two losses: vf vs rest (policy + kl).
            if self.config["_separate_vf_optimizer"]:
                return loss_wo_vf, mean_vf_loss
            else:
                return total_loss

        @override(base)
        def stats_fn(self, train_batch: SampleBatch) -> Dict[str, TensorType]:
            # Collects the loss terms stashed on `self` by `loss()` above.
            values_batched = _make_time_major(
                self,
                train_batch.get(SampleBatch.SEQ_LENS),
                self.model.value_function(),
            )

            stats_dict = {
                "cur_lr": tf.cast(self.cur_lr, tf.float64),
                "total_loss": self._total_loss,
                "policy_loss": self._mean_policy_loss,
                "entropy": self._mean_entropy,
                "var_gnorm": tf.linalg.global_norm(self.model.trainable_variables()),
                "vf_loss": self._mean_vf_loss,
                "vf_explained_var": explained_variance(
                    tf.reshape(self._value_targets, [-1]),
                    tf.reshape(values_batched, [-1]),
                ),
                "entropy_coeff": tf.cast(self.entropy_coeff, tf.float64),
            }

            if self.config["vtrace"]:
                # Moments of the importance-sampling ratio (only set when
                # vtrace=True in loss()).
                is_stat_mean, is_stat_var = tf.nn.moments(self._is_ratio, [0, 1])
                stats_dict["mean_IS"] = is_stat_mean
                stats_dict["var_IS"] = is_stat_var

            if self.config["use_kl_loss"]:
                stats_dict["kl"] = self._mean_kl_loss
                stats_dict["KL_Coeff"] = self.kl_coeff

            return stats_dict

        @override(base)
        def postprocess_trajectory(
            self,
            sample_batch: SampleBatch,
            other_agent_batches: Optional[SampleBatch] = None,
            episode=None,
        ):
            # Call super's postprocess_trajectory first.
            # sample_batch = super().postprocess_trajectory(
            #     sample_batch, other_agent_batches, episode
            # )

            if not self.config["vtrace"]:
                # No V-trace -> pre-compute GAE advantages/value targets here.
                sample_batch = compute_gae_for_sample_batch(
                    self, sample_batch, other_agent_batches, episode
                )
            else:
                # Add the Columns.VALUES_BOOTSTRAPPED column, which we'll need
                # inside the loss for vtrace calculations.
                sample_batch = compute_bootstrap_value(sample_batch, self)

            return sample_batch

        @override(base)
        def get_batch_divisibility_req(self) -> int:
            # Train batches must be divisible by the rollout fragment length.
            return self.config["rollout_fragment_length"]

    # Rename the dynamically built class so tf1/tf2 variants are
    # distinguishable in logs and pickles.
    APPOTFPolicy.__name__ = name
    APPOTFPolicy.__qualname__ = name

    return APPOTFPolicy


APPOTF1Policy = get_appo_tf_policy("APPOTF1Policy", DynamicTFPolicyV2)
APPOTF2Policy = get_appo_tf_policy("APPOTF2Policy", EagerTFPolicyV2)
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/appo_torch_policy.py
ADDED
|
@@ -0,0 +1,412 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
PyTorch policy class used for APPO.
|
| 3 |
+
|
| 4 |
+
Adapted from VTraceTFPolicy to use the PPO surrogate loss.
|
| 5 |
+
Keep in sync with changes to VTraceTFPolicy.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import gymnasium as gym
|
| 9 |
+
import numpy as np
|
| 10 |
+
import logging
|
| 11 |
+
from typing import Any, Dict, List, Optional, Type, Union
|
| 12 |
+
|
| 13 |
+
import ray
|
| 14 |
+
from ray.rllib.algorithms.appo.utils import make_appo_models
|
| 15 |
+
import ray.rllib.algorithms.impala.vtrace_torch as vtrace
|
| 16 |
+
from ray.rllib.algorithms.impala.impala_torch_policy import (
|
| 17 |
+
make_time_major,
|
| 18 |
+
VTraceOptimizer,
|
| 19 |
+
)
|
| 20 |
+
from ray.rllib.evaluation.postprocessing import (
|
| 21 |
+
compute_bootstrap_value,
|
| 22 |
+
compute_gae_for_sample_batch,
|
| 23 |
+
Postprocessing,
|
| 24 |
+
)
|
| 25 |
+
from ray.rllib.models.action_dist import ActionDistribution
|
| 26 |
+
from ray.rllib.models.modelv2 import ModelV2
|
| 27 |
+
from ray.rllib.models.torch.torch_action_dist import (
|
| 28 |
+
TorchDistributionWrapper,
|
| 29 |
+
TorchCategorical,
|
| 30 |
+
)
|
| 31 |
+
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
|
| 32 |
+
from ray.rllib.policy.sample_batch import SampleBatch
|
| 33 |
+
from ray.rllib.policy.torch_mixins import (
|
| 34 |
+
EntropyCoeffSchedule,
|
| 35 |
+
LearningRateSchedule,
|
| 36 |
+
KLCoeffMixin,
|
| 37 |
+
ValueNetworkMixin,
|
| 38 |
+
TargetNetworkMixin,
|
| 39 |
+
)
|
| 40 |
+
from ray.rllib.policy.torch_policy_v2 import TorchPolicyV2
|
| 41 |
+
from ray.rllib.utils.annotations import override
|
| 42 |
+
from ray.rllib.utils.framework import try_import_torch
|
| 43 |
+
from ray.rllib.utils.numpy import convert_to_numpy
|
| 44 |
+
from ray.rllib.utils.torch_utils import (
|
| 45 |
+
apply_grad_clipping,
|
| 46 |
+
explained_variance,
|
| 47 |
+
global_norm,
|
| 48 |
+
sequence_mask,
|
| 49 |
+
)
|
| 50 |
+
from ray.rllib.utils.typing import TensorType
|
| 51 |
+
|
| 52 |
+
torch, nn = try_import_torch()
|
| 53 |
+
|
| 54 |
+
logger = logging.getLogger(__name__)
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
# TODO (sven): Deprecate once APPO and IMPALA fully on RLModules/Learner APIs.
|
| 58 |
+
class APPOTorchPolicy(
|
| 59 |
+
VTraceOptimizer,
|
| 60 |
+
LearningRateSchedule,
|
| 61 |
+
EntropyCoeffSchedule,
|
| 62 |
+
KLCoeffMixin,
|
| 63 |
+
ValueNetworkMixin,
|
| 64 |
+
TargetNetworkMixin,
|
| 65 |
+
TorchPolicyV2,
|
| 66 |
+
):
|
| 67 |
+
"""PyTorch policy class used with APPO."""
|
| 68 |
+
|
| 69 |
+
    def __init__(self, observation_space, action_space, config):
        """Initializes the torch APPO policy and all its mixins.

        NOTE: The mixin `__init__` call order below is significant (e.g. the
        LR schedule must exist before `TorchPolicyV2.__init__` builds the
        optimizers) — do not reorder.
        """
        # Merge user config on top of the full APPO default config.
        config = dict(ray.rllib.algorithms.appo.appo.APPOConfig().to_dict(), **config)
        # This policy belongs to the old API stack; force the new-stack
        # switches off.
        config["enable_rl_module_and_learner"] = False
        config["enable_env_runner_and_connector_v2"] = False

        # Although this is a no-op, we call __init__ here to make it clear
        # that base.__init__ will use the make_model() call.
        VTraceOptimizer.__init__(self)

        lr_schedule_additional_args = []
        if config.get("_separate_vf_optimizer"):
            # With a separate vf optimizer, `_lr_vf` is either a schedule
            # (list/tuple) or a fixed value; normalize to (initial_lr, schedule).
            lr_schedule_additional_args = (
                [config["_lr_vf"][0][1], config["_lr_vf"]]
                if isinstance(config["_lr_vf"], (list, tuple))
                else [config["_lr_vf"], None]
            )
        LearningRateSchedule.__init__(
            self, config["lr"], config["lr_schedule"], *lr_schedule_additional_args
        )

        TorchPolicyV2.__init__(
            self,
            observation_space,
            action_space,
            config,
            max_seq_len=config["model"]["max_seq_len"],
        )

        EntropyCoeffSchedule.__init__(
            self, config["entropy_coeff"], config["entropy_coeff_schedule"]
        )
        ValueNetworkMixin.__init__(self, config)
        KLCoeffMixin.__init__(self, config)

        # Build loss/view-requirements from a dummy batch.
        self._initialize_loss_from_dummy_batch()

        # Initiate TargetNetwork ops after loss initialization.
        TargetNetworkMixin.__init__(self)
|
| 107 |
+
|
| 108 |
+
@override(TorchPolicyV2)
|
| 109 |
+
def init_view_requirements(self):
|
| 110 |
+
self.view_requirements = self._get_default_view_requirements()
|
| 111 |
+
|
| 112 |
+
@override(TorchPolicyV2)
|
| 113 |
+
def make_model(self) -> ModelV2:
|
| 114 |
+
return make_appo_models(self)
|
| 115 |
+
|
| 116 |
+
@override(TorchPolicyV2)
|
| 117 |
+
def loss(
|
| 118 |
+
self,
|
| 119 |
+
model: ModelV2,
|
| 120 |
+
dist_class: Type[ActionDistribution],
|
| 121 |
+
train_batch: SampleBatch,
|
| 122 |
+
) -> Union[TensorType, List[TensorType]]:
|
| 123 |
+
"""Constructs the loss for APPO.
|
| 124 |
+
|
| 125 |
+
With IS modifications and V-trace for Advantage Estimation.
|
| 126 |
+
|
| 127 |
+
Args:
|
| 128 |
+
model (ModelV2): The Model to calculate the loss for.
|
| 129 |
+
dist_class (Type[ActionDistribution]): The action distr. class.
|
| 130 |
+
train_batch: The training data.
|
| 131 |
+
|
| 132 |
+
Returns:
|
| 133 |
+
Union[TensorType, List[TensorType]]: A single loss tensor or a list
|
| 134 |
+
of loss tensors.
|
| 135 |
+
"""
|
| 136 |
+
target_model = self.target_models[model]
|
| 137 |
+
|
| 138 |
+
model_out, _ = model(train_batch)
|
| 139 |
+
action_dist = dist_class(model_out, model)
|
| 140 |
+
|
| 141 |
+
if isinstance(self.action_space, gym.spaces.Discrete):
|
| 142 |
+
is_multidiscrete = False
|
| 143 |
+
output_hidden_shape = [self.action_space.n]
|
| 144 |
+
elif isinstance(self.action_space, gym.spaces.multi_discrete.MultiDiscrete):
|
| 145 |
+
is_multidiscrete = True
|
| 146 |
+
output_hidden_shape = self.action_space.nvec.astype(np.int32)
|
| 147 |
+
else:
|
| 148 |
+
is_multidiscrete = False
|
| 149 |
+
output_hidden_shape = 1
|
| 150 |
+
|
| 151 |
+
def _make_time_major(*args, **kwargs):
|
| 152 |
+
return make_time_major(
|
| 153 |
+
self, train_batch.get(SampleBatch.SEQ_LENS), *args, **kwargs
|
| 154 |
+
)
|
| 155 |
+
|
| 156 |
+
actions = train_batch[SampleBatch.ACTIONS]
|
| 157 |
+
dones = train_batch[SampleBatch.TERMINATEDS]
|
| 158 |
+
rewards = train_batch[SampleBatch.REWARDS]
|
| 159 |
+
behaviour_logits = train_batch[SampleBatch.ACTION_DIST_INPUTS]
|
| 160 |
+
|
| 161 |
+
target_model_out, _ = target_model(train_batch)
|
| 162 |
+
|
| 163 |
+
prev_action_dist = dist_class(behaviour_logits, model)
|
| 164 |
+
values = model.value_function()
|
| 165 |
+
values_time_major = _make_time_major(values)
|
| 166 |
+
bootstrap_values_time_major = _make_time_major(
|
| 167 |
+
train_batch[SampleBatch.VALUES_BOOTSTRAPPED]
|
| 168 |
+
)
|
| 169 |
+
bootstrap_value = bootstrap_values_time_major[-1]
|
| 170 |
+
|
| 171 |
+
if self.is_recurrent():
|
| 172 |
+
max_seq_len = torch.max(train_batch[SampleBatch.SEQ_LENS])
|
| 173 |
+
mask = sequence_mask(train_batch[SampleBatch.SEQ_LENS], max_seq_len)
|
| 174 |
+
mask = torch.reshape(mask, [-1])
|
| 175 |
+
mask = _make_time_major(mask)
|
| 176 |
+
num_valid = torch.sum(mask)
|
| 177 |
+
|
| 178 |
+
def reduce_mean_valid(t):
|
| 179 |
+
return torch.sum(t[mask]) / num_valid
|
| 180 |
+
|
| 181 |
+
else:
|
| 182 |
+
reduce_mean_valid = torch.mean
|
| 183 |
+
|
| 184 |
+
if self.config["vtrace"]:
|
| 185 |
+
logger.debug("Using V-Trace surrogate loss (vtrace=True)")
|
| 186 |
+
|
| 187 |
+
old_policy_behaviour_logits = target_model_out.detach()
|
| 188 |
+
old_policy_action_dist = dist_class(old_policy_behaviour_logits, model)
|
| 189 |
+
|
| 190 |
+
if isinstance(output_hidden_shape, (list, tuple, np.ndarray)):
|
| 191 |
+
unpacked_behaviour_logits = torch.split(
|
| 192 |
+
behaviour_logits, list(output_hidden_shape), dim=1
|
| 193 |
+
)
|
| 194 |
+
unpacked_old_policy_behaviour_logits = torch.split(
|
| 195 |
+
old_policy_behaviour_logits, list(output_hidden_shape), dim=1
|
| 196 |
+
)
|
| 197 |
+
else:
|
| 198 |
+
unpacked_behaviour_logits = torch.chunk(
|
| 199 |
+
behaviour_logits, output_hidden_shape, dim=1
|
| 200 |
+
)
|
| 201 |
+
unpacked_old_policy_behaviour_logits = torch.chunk(
|
| 202 |
+
old_policy_behaviour_logits, output_hidden_shape, dim=1
|
| 203 |
+
)
|
| 204 |
+
|
| 205 |
+
# Prepare actions for loss.
|
| 206 |
+
loss_actions = (
|
| 207 |
+
actions if is_multidiscrete else torch.unsqueeze(actions, dim=1)
|
| 208 |
+
)
|
| 209 |
+
|
| 210 |
+
# Prepare KL for loss.
|
| 211 |
+
action_kl = _make_time_major(old_policy_action_dist.kl(action_dist))
|
| 212 |
+
|
| 213 |
+
# Compute vtrace on the CPU for better perf.
|
| 214 |
+
vtrace_returns = vtrace.multi_from_logits(
|
| 215 |
+
behaviour_policy_logits=_make_time_major(unpacked_behaviour_logits),
|
| 216 |
+
target_policy_logits=_make_time_major(
|
| 217 |
+
unpacked_old_policy_behaviour_logits
|
| 218 |
+
),
|
| 219 |
+
actions=torch.unbind(_make_time_major(loss_actions), dim=2),
|
| 220 |
+
discounts=(1.0 - _make_time_major(dones).float())
|
| 221 |
+
* self.config["gamma"],
|
| 222 |
+
rewards=_make_time_major(rewards),
|
| 223 |
+
values=values_time_major,
|
| 224 |
+
bootstrap_value=bootstrap_value,
|
| 225 |
+
dist_class=TorchCategorical if is_multidiscrete else dist_class,
|
| 226 |
+
model=model,
|
| 227 |
+
clip_rho_threshold=self.config["vtrace_clip_rho_threshold"],
|
| 228 |
+
clip_pg_rho_threshold=self.config["vtrace_clip_pg_rho_threshold"],
|
| 229 |
+
)
|
| 230 |
+
|
| 231 |
+
actions_logp = _make_time_major(action_dist.logp(actions))
|
| 232 |
+
prev_actions_logp = _make_time_major(prev_action_dist.logp(actions))
|
| 233 |
+
old_policy_actions_logp = _make_time_major(
|
| 234 |
+
old_policy_action_dist.logp(actions)
|
| 235 |
+
)
|
| 236 |
+
is_ratio = torch.clamp(
|
| 237 |
+
torch.exp(prev_actions_logp - old_policy_actions_logp), 0.0, 2.0
|
| 238 |
+
)
|
| 239 |
+
logp_ratio = is_ratio * torch.exp(actions_logp - prev_actions_logp)
|
| 240 |
+
self._is_ratio = is_ratio
|
| 241 |
+
|
| 242 |
+
advantages = vtrace_returns.pg_advantages.to(logp_ratio.device)
|
| 243 |
+
surrogate_loss = torch.min(
|
| 244 |
+
advantages * logp_ratio,
|
| 245 |
+
advantages
|
| 246 |
+
* torch.clamp(
|
| 247 |
+
logp_ratio,
|
| 248 |
+
1 - self.config["clip_param"],
|
| 249 |
+
1 + self.config["clip_param"],
|
| 250 |
+
),
|
| 251 |
+
)
|
| 252 |
+
|
| 253 |
+
mean_kl_loss = reduce_mean_valid(action_kl)
|
| 254 |
+
mean_policy_loss = -reduce_mean_valid(surrogate_loss)
|
| 255 |
+
|
| 256 |
+
# The value function loss.
|
| 257 |
+
value_targets = vtrace_returns.vs.to(values_time_major.device)
|
| 258 |
+
delta = values_time_major - value_targets
|
| 259 |
+
mean_vf_loss = 0.5 * reduce_mean_valid(torch.pow(delta, 2.0))
|
| 260 |
+
|
| 261 |
+
# The entropy loss.
|
| 262 |
+
mean_entropy = reduce_mean_valid(_make_time_major(action_dist.entropy()))
|
| 263 |
+
|
| 264 |
+
else:
|
| 265 |
+
logger.debug("Using PPO surrogate loss (vtrace=False)")
|
| 266 |
+
|
| 267 |
+
# Prepare KL for Loss
|
| 268 |
+
action_kl = _make_time_major(prev_action_dist.kl(action_dist))
|
| 269 |
+
|
| 270 |
+
actions_logp = _make_time_major(action_dist.logp(actions))
|
| 271 |
+
prev_actions_logp = _make_time_major(prev_action_dist.logp(actions))
|
| 272 |
+
logp_ratio = torch.exp(actions_logp - prev_actions_logp)
|
| 273 |
+
|
| 274 |
+
advantages = _make_time_major(train_batch[Postprocessing.ADVANTAGES])
|
| 275 |
+
surrogate_loss = torch.min(
|
| 276 |
+
advantages * logp_ratio,
|
| 277 |
+
advantages
|
| 278 |
+
* torch.clamp(
|
| 279 |
+
logp_ratio,
|
| 280 |
+
1 - self.config["clip_param"],
|
| 281 |
+
1 + self.config["clip_param"],
|
| 282 |
+
),
|
| 283 |
+
)
|
| 284 |
+
|
| 285 |
+
mean_kl_loss = reduce_mean_valid(action_kl)
|
| 286 |
+
mean_policy_loss = -reduce_mean_valid(surrogate_loss)
|
| 287 |
+
|
| 288 |
+
# The value function loss.
|
| 289 |
+
value_targets = _make_time_major(train_batch[Postprocessing.VALUE_TARGETS])
|
| 290 |
+
delta = values_time_major - value_targets
|
| 291 |
+
mean_vf_loss = 0.5 * reduce_mean_valid(torch.pow(delta, 2.0))
|
| 292 |
+
|
| 293 |
+
# The entropy loss.
|
| 294 |
+
mean_entropy = reduce_mean_valid(_make_time_major(action_dist.entropy()))
|
| 295 |
+
|
| 296 |
+
# The summed weighted loss.
|
| 297 |
+
total_loss = mean_policy_loss - mean_entropy * self.entropy_coeff
|
| 298 |
+
# Optional additional KL Loss
|
| 299 |
+
if self.config["use_kl_loss"]:
|
| 300 |
+
total_loss += self.kl_coeff * mean_kl_loss
|
| 301 |
+
|
| 302 |
+
# Optional vf loss (or in a separate term due to separate
|
| 303 |
+
# optimizers/networks).
|
| 304 |
+
loss_wo_vf = total_loss
|
| 305 |
+
if not self.config["_separate_vf_optimizer"]:
|
| 306 |
+
total_loss += mean_vf_loss * self.config["vf_loss_coeff"]
|
| 307 |
+
|
| 308 |
+
# Store values for stats function in model (tower), such that for
|
| 309 |
+
# multi-GPU, we do not override them during the parallel loss phase.
|
| 310 |
+
model.tower_stats["total_loss"] = total_loss
|
| 311 |
+
model.tower_stats["mean_policy_loss"] = mean_policy_loss
|
| 312 |
+
model.tower_stats["mean_kl_loss"] = mean_kl_loss
|
| 313 |
+
model.tower_stats["mean_vf_loss"] = mean_vf_loss
|
| 314 |
+
model.tower_stats["mean_entropy"] = mean_entropy
|
| 315 |
+
model.tower_stats["value_targets"] = value_targets
|
| 316 |
+
model.tower_stats["vf_explained_var"] = explained_variance(
|
| 317 |
+
torch.reshape(value_targets, [-1]),
|
| 318 |
+
torch.reshape(values_time_major, [-1]),
|
| 319 |
+
)
|
| 320 |
+
|
| 321 |
+
# Return one total loss or two losses: vf vs rest (policy + kl).
|
| 322 |
+
if self.config["_separate_vf_optimizer"]:
|
| 323 |
+
return loss_wo_vf, mean_vf_loss
|
| 324 |
+
else:
|
| 325 |
+
return total_loss
|
| 326 |
+
|
| 327 |
+
@override(TorchPolicyV2)
|
| 328 |
+
def stats_fn(self, train_batch: SampleBatch) -> Dict[str, TensorType]:
|
| 329 |
+
"""Stats function for APPO. Returns a dict with important loss stats.
|
| 330 |
+
|
| 331 |
+
Args:
|
| 332 |
+
policy: The Policy to generate stats for.
|
| 333 |
+
train_batch: The SampleBatch (already) used for training.
|
| 334 |
+
|
| 335 |
+
Returns:
|
| 336 |
+
Dict[str, TensorType]: The stats dict.
|
| 337 |
+
"""
|
| 338 |
+
stats_dict = {
|
| 339 |
+
"cur_lr": self.cur_lr,
|
| 340 |
+
"total_loss": torch.mean(torch.stack(self.get_tower_stats("total_loss"))),
|
| 341 |
+
"policy_loss": torch.mean(
|
| 342 |
+
torch.stack(self.get_tower_stats("mean_policy_loss"))
|
| 343 |
+
),
|
| 344 |
+
"entropy": torch.mean(torch.stack(self.get_tower_stats("mean_entropy"))),
|
| 345 |
+
"entropy_coeff": self.entropy_coeff,
|
| 346 |
+
"var_gnorm": global_norm(self.model.trainable_variables()),
|
| 347 |
+
"vf_loss": torch.mean(torch.stack(self.get_tower_stats("mean_vf_loss"))),
|
| 348 |
+
"vf_explained_var": torch.mean(
|
| 349 |
+
torch.stack(self.get_tower_stats("vf_explained_var"))
|
| 350 |
+
),
|
| 351 |
+
}
|
| 352 |
+
|
| 353 |
+
if self.config["vtrace"]:
|
| 354 |
+
is_stat_mean = torch.mean(self._is_ratio, [0, 1])
|
| 355 |
+
is_stat_var = torch.var(self._is_ratio, [0, 1])
|
| 356 |
+
stats_dict["mean_IS"] = is_stat_mean
|
| 357 |
+
stats_dict["var_IS"] = is_stat_var
|
| 358 |
+
|
| 359 |
+
if self.config["use_kl_loss"]:
|
| 360 |
+
stats_dict["kl"] = torch.mean(
|
| 361 |
+
torch.stack(self.get_tower_stats("mean_kl_loss"))
|
| 362 |
+
)
|
| 363 |
+
stats_dict["KL_Coeff"] = self.kl_coeff
|
| 364 |
+
|
| 365 |
+
return convert_to_numpy(stats_dict)
|
| 366 |
+
|
| 367 |
+
@override(TorchPolicyV2)
|
| 368 |
+
def extra_action_out(
|
| 369 |
+
self,
|
| 370 |
+
input_dict: Dict[str, TensorType],
|
| 371 |
+
state_batches: List[TensorType],
|
| 372 |
+
model: TorchModelV2,
|
| 373 |
+
action_dist: TorchDistributionWrapper,
|
| 374 |
+
) -> Dict[str, TensorType]:
|
| 375 |
+
return {SampleBatch.VF_PREDS: model.value_function()}
|
| 376 |
+
|
| 377 |
+
@override(TorchPolicyV2)
|
| 378 |
+
def postprocess_trajectory(
|
| 379 |
+
self,
|
| 380 |
+
sample_batch: SampleBatch,
|
| 381 |
+
other_agent_batches: Optional[Dict[Any, SampleBatch]] = None,
|
| 382 |
+
episode=None,
|
| 383 |
+
):
|
| 384 |
+
# Call super's postprocess_trajectory first.
|
| 385 |
+
# sample_batch = super().postprocess_trajectory(
|
| 386 |
+
# sample_batch, other_agent_batches, episode
|
| 387 |
+
# )
|
| 388 |
+
|
| 389 |
+
# Do all post-processing always with no_grad().
|
| 390 |
+
# Not using this here will introduce a memory leak
|
| 391 |
+
# in torch (issue #6962).
|
| 392 |
+
with torch.no_grad():
|
| 393 |
+
if not self.config["vtrace"]:
|
| 394 |
+
sample_batch = compute_gae_for_sample_batch(
|
| 395 |
+
self, sample_batch, other_agent_batches, episode
|
| 396 |
+
)
|
| 397 |
+
else:
|
| 398 |
+
# Add the SampleBatch.VALUES_BOOTSTRAPPED column, which we'll need
|
| 399 |
+
# inside the loss for vtrace calculations.
|
| 400 |
+
sample_batch = compute_bootstrap_value(sample_batch, self)
|
| 401 |
+
|
| 402 |
+
return sample_batch
|
| 403 |
+
|
| 404 |
+
@override(TorchPolicyV2)
|
| 405 |
+
def extra_grad_process(
|
| 406 |
+
self, optimizer: "torch.optim.Optimizer", loss: TensorType
|
| 407 |
+
) -> Dict[str, TensorType]:
|
| 408 |
+
return apply_grad_clipping(self, optimizer, loss)
|
| 409 |
+
|
| 410 |
+
@override(TorchPolicyV2)
|
| 411 |
+
def get_batch_divisibility_req(self) -> int:
|
| 412 |
+
return self.config["rollout_fragment_length"]
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/default_appo_rl_module.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import abc
|
| 2 |
+
from typing import Any, Dict, List, Tuple
|
| 3 |
+
|
| 4 |
+
from ray.rllib.algorithms.ppo.default_ppo_rl_module import DefaultPPORLModule
|
| 5 |
+
from ray.rllib.core.learner.utils import make_target_network
|
| 6 |
+
from ray.rllib.core.models.base import ACTOR
|
| 7 |
+
from ray.rllib.core.models.tf.encoder import ENCODER_OUT
|
| 8 |
+
from ray.rllib.core.rl_module.apis import (
|
| 9 |
+
TARGET_NETWORK_ACTION_DIST_INPUTS,
|
| 10 |
+
TargetNetworkAPI,
|
| 11 |
+
)
|
| 12 |
+
from ray.rllib.utils.typing import NetworkType
|
| 13 |
+
|
| 14 |
+
from ray.rllib.utils.annotations import (
|
| 15 |
+
override,
|
| 16 |
+
OverrideToImplementCustomLogic_CallToSuperRecommended,
|
| 17 |
+
)
|
| 18 |
+
from ray.util.annotations import DeveloperAPI
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
@DeveloperAPI
|
| 22 |
+
class DefaultAPPORLModule(DefaultPPORLModule, TargetNetworkAPI, abc.ABC):
|
| 23 |
+
"""Default RLModule used by APPO, if user does not specify a custom RLModule.
|
| 24 |
+
|
| 25 |
+
Users who want to train their RLModules with APPO may implement any RLModule
|
| 26 |
+
(or TorchRLModule) subclass as long as the custom class also implements the
|
| 27 |
+
`ValueFunctionAPI` (see ray.rllib.core.rl_module.apis.value_function_api.py)
|
| 28 |
+
and the `TargetNetworkAPI` (see
|
| 29 |
+
ray.rllib.core.rl_module.apis.target_network_api.py).
|
| 30 |
+
"""
|
| 31 |
+
|
| 32 |
+
@override(TargetNetworkAPI)
|
| 33 |
+
def make_target_networks(self):
|
| 34 |
+
self._old_encoder = make_target_network(self.encoder)
|
| 35 |
+
self._old_pi = make_target_network(self.pi)
|
| 36 |
+
|
| 37 |
+
@override(TargetNetworkAPI)
|
| 38 |
+
def get_target_network_pairs(self) -> List[Tuple[NetworkType, NetworkType]]:
|
| 39 |
+
return [
|
| 40 |
+
(self.encoder, self._old_encoder),
|
| 41 |
+
(self.pi, self._old_pi),
|
| 42 |
+
]
|
| 43 |
+
|
| 44 |
+
@override(TargetNetworkAPI)
|
| 45 |
+
def forward_target(self, batch: Dict[str, Any]) -> Dict[str, Any]:
|
| 46 |
+
old_pi_inputs_encoded = self._old_encoder(batch)[ENCODER_OUT][ACTOR]
|
| 47 |
+
old_action_dist_logits = self._old_pi(old_pi_inputs_encoded)
|
| 48 |
+
return {TARGET_NETWORK_ACTION_DIST_INPUTS: old_action_dist_logits}
|
| 49 |
+
|
| 50 |
+
@OverrideToImplementCustomLogic_CallToSuperRecommended
|
| 51 |
+
@override(DefaultPPORLModule)
|
| 52 |
+
def get_non_inference_attributes(self) -> List[str]:
|
| 53 |
+
# Get the NON inference-only attributes from the parent class
|
| 54 |
+
# `PPOTorchRLModule`.
|
| 55 |
+
ret = super().get_non_inference_attributes()
|
| 56 |
+
# Add the two (APPO) target networks to it (NOT needed in
|
| 57 |
+
# inference-only mode).
|
| 58 |
+
ret += ["_old_encoder", "_old_pi"]
|
| 59 |
+
return ret
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/torch/__init__.py
ADDED
|
File without changes
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/torch/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (204 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/torch/__pycache__/appo_torch_learner.cpython-311.pyc
ADDED
|
Binary file (9 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/torch/__pycache__/appo_torch_rl_module.cpython-311.pyc
ADDED
|
Binary file (709 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/torch/__pycache__/default_appo_torch_rl_module.cpython-311.pyc
ADDED
|
Binary file (818 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/torch/appo_torch_learner.py
ADDED
|
@@ -0,0 +1,234 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Asynchronous Proximal Policy Optimization (APPO)
|
| 2 |
+
|
| 3 |
+
The algorithm is described in [1] (under the name of "IMPACT"):
|
| 4 |
+
|
| 5 |
+
Detailed documentation:
|
| 6 |
+
https://docs.ray.io/en/master/rllib-algorithms.html#appo
|
| 7 |
+
|
| 8 |
+
[1] IMPACT: Importance Weighted Asynchronous Architectures with Clipped Target Networks.
|
| 9 |
+
Luo et al. 2020
|
| 10 |
+
https://arxiv.org/pdf/1912.00167
|
| 11 |
+
"""
|
| 12 |
+
from typing import Dict
|
| 13 |
+
|
| 14 |
+
from ray.rllib.algorithms.appo.appo import (
|
| 15 |
+
APPOConfig,
|
| 16 |
+
LEARNER_RESULTS_CURR_KL_COEFF_KEY,
|
| 17 |
+
LEARNER_RESULTS_KL_KEY,
|
| 18 |
+
)
|
| 19 |
+
from ray.rllib.algorithms.appo.appo_learner import APPOLearner
|
| 20 |
+
from ray.rllib.algorithms.impala.torch.impala_torch_learner import IMPALATorchLearner
|
| 21 |
+
from ray.rllib.algorithms.impala.torch.vtrace_torch_v2 import (
|
| 22 |
+
make_time_major,
|
| 23 |
+
vtrace_torch,
|
| 24 |
+
)
|
| 25 |
+
from ray.rllib.core.columns import Columns
|
| 26 |
+
from ray.rllib.core.learner.learner import POLICY_LOSS_KEY, VF_LOSS_KEY, ENTROPY_KEY
|
| 27 |
+
from ray.rllib.core.rl_module.apis import (
|
| 28 |
+
TARGET_NETWORK_ACTION_DIST_INPUTS,
|
| 29 |
+
TargetNetworkAPI,
|
| 30 |
+
ValueFunctionAPI,
|
| 31 |
+
)
|
| 32 |
+
from ray.rllib.utils.annotations import override
|
| 33 |
+
from ray.rllib.utils.framework import try_import_torch
|
| 34 |
+
from ray.rllib.utils.numpy import convert_to_numpy
|
| 35 |
+
from ray.rllib.utils.typing import ModuleID, TensorType
|
| 36 |
+
|
| 37 |
+
torch, nn = try_import_torch()
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
class APPOTorchLearner(APPOLearner, IMPALATorchLearner):
|
| 41 |
+
"""Implements APPO loss / update logic on top of IMPALATorchLearner."""
|
| 42 |
+
|
| 43 |
+
@override(IMPALATorchLearner)
|
| 44 |
+
def compute_loss_for_module(
|
| 45 |
+
self,
|
| 46 |
+
*,
|
| 47 |
+
module_id: ModuleID,
|
| 48 |
+
config: APPOConfig,
|
| 49 |
+
batch: Dict,
|
| 50 |
+
fwd_out: Dict[str, TensorType],
|
| 51 |
+
) -> TensorType:
|
| 52 |
+
module = self.module[module_id].unwrapped()
|
| 53 |
+
assert isinstance(module, TargetNetworkAPI)
|
| 54 |
+
assert isinstance(module, ValueFunctionAPI)
|
| 55 |
+
|
| 56 |
+
# TODO (sven): Now that we do the +1ts trick to be less vulnerable about
|
| 57 |
+
# bootstrap values at the end of rollouts in the new stack, we might make
|
| 58 |
+
# this a more flexible, configurable parameter for users, e.g.
|
| 59 |
+
# `v_trace_seq_len` (independent of `rollout_fragment_length`). Separation
|
| 60 |
+
# of concerns (sampling vs learning).
|
| 61 |
+
rollout_frag_or_episode_len = config.get_rollout_fragment_length()
|
| 62 |
+
recurrent_seq_len = batch.get("seq_lens")
|
| 63 |
+
|
| 64 |
+
loss_mask = batch[Columns.LOSS_MASK].float()
|
| 65 |
+
loss_mask_time_major = make_time_major(
|
| 66 |
+
loss_mask,
|
| 67 |
+
trajectory_len=rollout_frag_or_episode_len,
|
| 68 |
+
recurrent_seq_len=recurrent_seq_len,
|
| 69 |
+
)
|
| 70 |
+
size_loss_mask = torch.sum(loss_mask)
|
| 71 |
+
|
| 72 |
+
values = module.compute_values(
|
| 73 |
+
batch, embeddings=fwd_out.get(Columns.EMBEDDINGS)
|
| 74 |
+
)
|
| 75 |
+
|
| 76 |
+
action_dist_cls_train = module.get_train_action_dist_cls()
|
| 77 |
+
target_policy_dist = action_dist_cls_train.from_logits(
|
| 78 |
+
fwd_out[Columns.ACTION_DIST_INPUTS]
|
| 79 |
+
)
|
| 80 |
+
|
| 81 |
+
old_target_policy_dist = action_dist_cls_train.from_logits(
|
| 82 |
+
module.forward_target(batch)[TARGET_NETWORK_ACTION_DIST_INPUTS]
|
| 83 |
+
)
|
| 84 |
+
old_target_policy_actions_logp = old_target_policy_dist.logp(
|
| 85 |
+
batch[Columns.ACTIONS]
|
| 86 |
+
)
|
| 87 |
+
behaviour_actions_logp = batch[Columns.ACTION_LOGP]
|
| 88 |
+
target_actions_logp = target_policy_dist.logp(batch[Columns.ACTIONS])
|
| 89 |
+
|
| 90 |
+
behaviour_actions_logp_time_major = make_time_major(
|
| 91 |
+
behaviour_actions_logp,
|
| 92 |
+
trajectory_len=rollout_frag_or_episode_len,
|
| 93 |
+
recurrent_seq_len=recurrent_seq_len,
|
| 94 |
+
)
|
| 95 |
+
target_actions_logp_time_major = make_time_major(
|
| 96 |
+
target_actions_logp,
|
| 97 |
+
trajectory_len=rollout_frag_or_episode_len,
|
| 98 |
+
recurrent_seq_len=recurrent_seq_len,
|
| 99 |
+
)
|
| 100 |
+
old_actions_logp_time_major = make_time_major(
|
| 101 |
+
old_target_policy_actions_logp,
|
| 102 |
+
trajectory_len=rollout_frag_or_episode_len,
|
| 103 |
+
recurrent_seq_len=recurrent_seq_len,
|
| 104 |
+
)
|
| 105 |
+
rewards_time_major = make_time_major(
|
| 106 |
+
batch[Columns.REWARDS],
|
| 107 |
+
trajectory_len=rollout_frag_or_episode_len,
|
| 108 |
+
recurrent_seq_len=recurrent_seq_len,
|
| 109 |
+
)
|
| 110 |
+
values_time_major = make_time_major(
|
| 111 |
+
values,
|
| 112 |
+
trajectory_len=rollout_frag_or_episode_len,
|
| 113 |
+
recurrent_seq_len=recurrent_seq_len,
|
| 114 |
+
)
|
| 115 |
+
assert Columns.VALUES_BOOTSTRAPPED not in batch
|
| 116 |
+
# Use as bootstrap values the vf-preds in the next "batch row", except
|
| 117 |
+
# for the very last row (which doesn't have a next row), for which the
|
| 118 |
+
# bootstrap value does not matter b/c it has a +1ts value at its end
|
| 119 |
+
# anyways. So we chose an arbitrary item (for simplicity of not having to
|
| 120 |
+
# move new data to the device).
|
| 121 |
+
bootstrap_values = torch.cat(
|
| 122 |
+
[
|
| 123 |
+
values_time_major[0][1:], # 0th ts values from "next row"
|
| 124 |
+
values_time_major[0][0:1], # <- can use any arbitrary value here
|
| 125 |
+
],
|
| 126 |
+
dim=0,
|
| 127 |
+
)
|
| 128 |
+
|
| 129 |
+
# The discount factor that is used should be gamma except for timesteps where
|
| 130 |
+
# the episode is terminated. In that case, the discount factor should be 0.
|
| 131 |
+
discounts_time_major = (
|
| 132 |
+
1.0
|
| 133 |
+
- make_time_major(
|
| 134 |
+
batch[Columns.TERMINATEDS],
|
| 135 |
+
trajectory_len=rollout_frag_or_episode_len,
|
| 136 |
+
recurrent_seq_len=recurrent_seq_len,
|
| 137 |
+
).float()
|
| 138 |
+
) * config.gamma
|
| 139 |
+
|
| 140 |
+
# Note that vtrace will compute the main loop on the CPU for better performance.
|
| 141 |
+
vtrace_adjusted_target_values, pg_advantages = vtrace_torch(
|
| 142 |
+
target_action_log_probs=old_actions_logp_time_major,
|
| 143 |
+
behaviour_action_log_probs=behaviour_actions_logp_time_major,
|
| 144 |
+
discounts=discounts_time_major,
|
| 145 |
+
rewards=rewards_time_major,
|
| 146 |
+
values=values_time_major,
|
| 147 |
+
bootstrap_values=bootstrap_values,
|
| 148 |
+
clip_pg_rho_threshold=config.vtrace_clip_pg_rho_threshold,
|
| 149 |
+
clip_rho_threshold=config.vtrace_clip_rho_threshold,
|
| 150 |
+
)
|
| 151 |
+
pg_advantages = pg_advantages * loss_mask_time_major
|
| 152 |
+
|
| 153 |
+
# The policy gradients loss.
|
| 154 |
+
is_ratio = torch.clip(
|
| 155 |
+
torch.exp(behaviour_actions_logp_time_major - old_actions_logp_time_major),
|
| 156 |
+
0.0,
|
| 157 |
+
2.0,
|
| 158 |
+
)
|
| 159 |
+
logp_ratio = is_ratio * torch.exp(
|
| 160 |
+
target_actions_logp_time_major - behaviour_actions_logp_time_major
|
| 161 |
+
)
|
| 162 |
+
|
| 163 |
+
surrogate_loss = torch.minimum(
|
| 164 |
+
pg_advantages * logp_ratio,
|
| 165 |
+
pg_advantages
|
| 166 |
+
* torch.clip(logp_ratio, 1 - config.clip_param, 1 + config.clip_param),
|
| 167 |
+
)
|
| 168 |
+
|
| 169 |
+
if config.use_kl_loss:
|
| 170 |
+
action_kl = old_target_policy_dist.kl(target_policy_dist) * loss_mask
|
| 171 |
+
mean_kl_loss = torch.sum(action_kl) / size_loss_mask
|
| 172 |
+
else:
|
| 173 |
+
mean_kl_loss = 0.0
|
| 174 |
+
mean_pi_loss = -(torch.sum(surrogate_loss) / size_loss_mask)
|
| 175 |
+
|
| 176 |
+
# The baseline loss.
|
| 177 |
+
delta = values_time_major - vtrace_adjusted_target_values
|
| 178 |
+
vf_loss = 0.5 * torch.sum(torch.pow(delta, 2.0) * loss_mask_time_major)
|
| 179 |
+
mean_vf_loss = vf_loss / size_loss_mask
|
| 180 |
+
|
| 181 |
+
# The entropy loss.
|
| 182 |
+
mean_entropy_loss = (
|
| 183 |
+
-torch.sum(target_policy_dist.entropy() * loss_mask) / size_loss_mask
|
| 184 |
+
)
|
| 185 |
+
|
| 186 |
+
# The summed weighted loss.
|
| 187 |
+
total_loss = (
|
| 188 |
+
mean_pi_loss
|
| 189 |
+
+ (mean_vf_loss * config.vf_loss_coeff)
|
| 190 |
+
+ (
|
| 191 |
+
mean_entropy_loss
|
| 192 |
+
* self.entropy_coeff_schedulers_per_module[
|
| 193 |
+
module_id
|
| 194 |
+
].get_current_value()
|
| 195 |
+
)
|
| 196 |
+
+ (mean_kl_loss * self.curr_kl_coeffs_per_module[module_id])
|
| 197 |
+
)
|
| 198 |
+
|
| 199 |
+
# Log important loss stats.
|
| 200 |
+
self.metrics.log_dict(
|
| 201 |
+
{
|
| 202 |
+
POLICY_LOSS_KEY: mean_pi_loss,
|
| 203 |
+
VF_LOSS_KEY: mean_vf_loss,
|
| 204 |
+
ENTROPY_KEY: -mean_entropy_loss,
|
| 205 |
+
LEARNER_RESULTS_KL_KEY: mean_kl_loss,
|
| 206 |
+
LEARNER_RESULTS_CURR_KL_COEFF_KEY: (
|
| 207 |
+
self.curr_kl_coeffs_per_module[module_id]
|
| 208 |
+
),
|
| 209 |
+
},
|
| 210 |
+
key=module_id,
|
| 211 |
+
window=1, # <- single items (should not be mean/ema-reduced over time).
|
| 212 |
+
)
|
| 213 |
+
# Return the total loss.
|
| 214 |
+
return total_loss
|
| 215 |
+
|
| 216 |
+
@override(APPOLearner)
|
| 217 |
+
def _update_module_kl_coeff(self, module_id: ModuleID, config: APPOConfig) -> None:
|
| 218 |
+
# Update the current KL value based on the recently measured value.
|
| 219 |
+
# Increase.
|
| 220 |
+
kl = convert_to_numpy(self.metrics.peek((module_id, LEARNER_RESULTS_KL_KEY)))
|
| 221 |
+
kl_coeff_var = self.curr_kl_coeffs_per_module[module_id]
|
| 222 |
+
|
| 223 |
+
if kl > 2.0 * config.kl_target:
|
| 224 |
+
# TODO (Kourosh) why not *2.0?
|
| 225 |
+
kl_coeff_var.data *= 1.5
|
| 226 |
+
# Decrease.
|
| 227 |
+
elif kl < 0.5 * config.kl_target:
|
| 228 |
+
kl_coeff_var.data *= 0.5
|
| 229 |
+
|
| 230 |
+
self.metrics.log_value(
|
| 231 |
+
(module_id, LEARNER_RESULTS_CURR_KL_COEFF_KEY),
|
| 232 |
+
kl_coeff_var.item(),
|
| 233 |
+
window=1,
|
| 234 |
+
)
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/torch/appo_torch_rl_module.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Backward compat import.
|
| 2 |
+
from ray.rllib.algorithms.appo.torch.default_appo_torch_rl_module import ( # noqa
|
| 3 |
+
DefaultAPPOTorchRLModule as APPOTorchRLModule,
|
| 4 |
+
)
|
| 5 |
+
from ray.rllib.utils.deprecation import deprecation_warning
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
deprecation_warning(
|
| 9 |
+
old="ray.rllib.algorithms.appo.torch.appo_torch_rl_module.APPOTorchRLModule",
|
| 10 |
+
new="ray.rllib.algorithms.appo.torch.default_appo_torch_rl_module."
|
| 11 |
+
"DefaultAPPOTorchRLModule",
|
| 12 |
+
error=False,
|
| 13 |
+
)
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/torch/default_appo_torch_rl_module.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from ray.rllib.algorithms.appo.default_appo_rl_module import DefaultAPPORLModule
|
| 2 |
+
from ray.rllib.algorithms.ppo.torch.default_ppo_torch_rl_module import (
|
| 3 |
+
DefaultPPOTorchRLModule,
|
| 4 |
+
)
|
| 5 |
+
from ray.util.annotations import DeveloperAPI
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
@DeveloperAPI
|
| 9 |
+
class DefaultAPPOTorchRLModule(DefaultPPOTorchRLModule, DefaultAPPORLModule):
|
| 10 |
+
pass
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/utils.py
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
[1] IMPACT: Importance Weighted Asynchronous Architectures with Clipped Target Networks.
|
| 3 |
+
Luo et al. 2020
|
| 4 |
+
https://arxiv.org/pdf/1912.00167
|
| 5 |
+
"""
|
| 6 |
+
from collections import deque
|
| 7 |
+
import random
|
| 8 |
+
import threading
|
| 9 |
+
import time
|
| 10 |
+
|
| 11 |
+
from ray.rllib.models.catalog import ModelCatalog
|
| 12 |
+
from ray.rllib.models.modelv2 import ModelV2
|
| 13 |
+
from ray.rllib.utils.annotations import OldAPIStack
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
POLICY_SCOPE = "func"
|
| 17 |
+
TARGET_POLICY_SCOPE = "target_func"
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class CircularBuffer:
    """A circular batch-wise buffer as described in [1] for APPO.

    The buffer holds at most N batches, which are sampled at random (uniformly).
    If full and a new batch is added, the oldest batch is discarded. Also, each batch
    currently in the buffer can be sampled at most K times (after which it is also
    discarded).
    """

    def __init__(self, num_batches: int, iterations_per_batch: int):
        """Initializes a CircularBuffer instance.

        Args:
            num_batches: N from the paper (the buffer size in batches).
            iterations_per_batch: K ("replay coefficient") from the paper; the
                maximum number of times each batch may be sampled before it
                expires.
        """
        # N from the paper (buffer size).
        self.num_batches = num_batches
        # K ("replay coefficient") from the paper.
        self.iterations_per_batch = iterations_per_batch

        # Entries are mutable 2-item lists `[batch, k]`, where k counts how
        # often the batch has been sampled so far. Expired entries are
        # invalidated in place to `[None, None]`.
        self._buffer = deque(maxlen=self.num_batches)
        # Guards buffer access; `add` and `sample` may run on different threads.
        self._lock = threading.Lock()

        # The number of valid (not expired) entries in this buffer.
        self._num_valid_batches = 0

    def add(self, batch):
        """Adds a batch to the buffer, possibly evicting the oldest entry.

        Args:
            batch: The batch to add. Must support `env_steps()` (e.g. a
                SampleBatch-like object).

        Returns:
            The number of env timesteps dropped unused: the evicted batch's
            `env_steps()` times its remaining (K - k) iterations, or 0 if no
            still-valid batch was evicted.
        """
        dropped_entry = None
        dropped_ts = 0

        # Add batch and k=0 information to the deque.
        with self._lock:
            if len(self._buffer) == self.num_batches:
                # Deque is full -> the append below evicts the leftmost entry.
                dropped_entry = self._buffer[0]
            self._buffer.append([batch, 0])
            self._num_valid_batches += 1

        # A valid entry (w/ a batch whose k has not reached K yet) was dropped.
        if dropped_entry is not None and dropped_entry[0] is not None:
            dropped_ts += dropped_entry[0].env_steps() * (
                self.iterations_per_batch - dropped_entry[1]
            )
            self._num_valid_batches -= 1

        return dropped_ts

    def sample(self):
        """Samples a batch uniformly at random; blocks while the buffer is empty.

        Returns:
            A still-valid batch. Its sample count (k) is incremented; once k
            reaches K, the batch is invalidated in the buffer.
        """
        k = entry = batch = None

        while True:
            # Only initially, the buffer may be empty -> Just wait for some time.
            if len(self) == 0:
                time.sleep(0.001)
                continue
            # Sample a random buffer index.
            with self._lock:
                entry = self._buffer[random.randint(0, len(self._buffer) - 1)]
                batch, k = entry
            # Ignore batches that have already been invalidated.
            if batch is not None:
                break

        # Increase k += 1 for this batch.
        assert k is not None
        entry[1] += 1

        # This batch has been exhausted (k == K) -> Invalidate it in the buffer.
        if k == self.iterations_per_batch - 1:
            entry[0] = None
            entry[1] = None
            # Bug fix: an exhausted batch must DECREASE the number of valid
            # batches (the original `+= 1` made `len()` grow on expiry).
            self._num_valid_batches -= 1

        # Return the sampled batch.
        return batch

    def __len__(self) -> int:
        """Returns the number of actually valid (non-expired) batches in the buffer."""
        return self._num_valid_batches
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
@OldAPIStack
def make_appo_models(policy) -> ModelV2:
    """Builds model and target model for APPO.

    Returns:
        ModelV2: The Model for the Policy to use.
        Note: The target model will not be returned, just assigned to
        `policy.target_model`.
    """
    model_config = policy.config["model"]

    # Determine the action-distribution output size used by both networks.
    _, num_outputs = ModelCatalog.get_action_dist(policy.action_space, model_config)

    def _build(scope_name):
        # One catalog call per network; only the variable scope/name differs.
        return ModelCatalog.get_model_v2(
            policy.observation_space,
            policy.action_space,
            num_outputs,
            model_config,
            name=scope_name,
            framework=policy.framework,
        )

    # Main (trained) model.
    policy.model = _build(POLICY_SCOPE)
    policy.model_variables = policy.model.variables()

    # Target model (assigned to the policy, not returned).
    policy.target_model = _build(TARGET_POLICY_SCOPE)
    policy.target_model_variables = policy.target_model.variables()

    # Return only the model (not the target model).
    return policy.model
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (672 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/__pycache__/dreamerv3.cpython-311.pyc
ADDED
|
Binary file (32.2 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/__pycache__/dreamerv3_catalog.cpython-311.pyc
ADDED
|
Binary file (3.65 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/__pycache__/dreamerv3_learner.cpython-311.pyc
ADDED
|
Binary file (1.93 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/__pycache__/dreamerv3_rl_module.cpython-311.pyc
ADDED
|
Binary file (8.11 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (213 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/__pycache__/actor_network.cpython-311.pyc
ADDED
|
Binary file (8.9 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/__pycache__/critic_network.cpython-311.pyc
ADDED
|
Binary file (8.85 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/__pycache__/disagree_networks.cpython-311.pyc
ADDED
|
Binary file (4.82 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/__pycache__/dreamer_model.cpython-311.pyc
ADDED
|
Binary file (25 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/__pycache__/world_model.cpython-311.pyc
ADDED
|
Binary file (20.5 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__init__.py
ADDED
|
File without changes
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (224 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/cnn_atari.cpython-311.pyc
ADDED
|
Binary file (4.84 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/continue_predictor.cpython-311.pyc
ADDED
|
Binary file (4.91 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/conv_transpose_atari.cpython-311.pyc
ADDED
|
Binary file (7.44 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/dynamics_predictor.cpython-311.pyc
ADDED
|
Binary file (4.23 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/mlp.cpython-311.pyc
ADDED
|
Binary file (4.66 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/representation_layer.cpython-311.pyc
ADDED
|
Binary file (5.96 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/reward_predictor.cpython-311.pyc
ADDED
|
Binary file (5.52 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/reward_predictor_layer.cpython-311.pyc
ADDED
|
Binary file (4.96 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/sequence_model.cpython-311.pyc
ADDED
|
Binary file (6.51 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/vector_decoder.cpython-311.pyc
ADDED
|
Binary file (4.59 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/continue_predictor.py
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
[1] Mastering Diverse Domains through World Models - 2023
|
| 3 |
+
D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap
|
| 4 |
+
https://arxiv.org/pdf/2301.04104v1.pdf
|
| 5 |
+
"""
|
| 6 |
+
from ray.rllib.algorithms.dreamerv3.tf.models.components.mlp import MLP
|
| 7 |
+
from ray.rllib.algorithms.dreamerv3.utils import (
|
| 8 |
+
get_gru_units,
|
| 9 |
+
get_num_z_classes,
|
| 10 |
+
get_num_z_categoricals,
|
| 11 |
+
)
|
| 12 |
+
from ray.rllib.utils.framework import try_import_tf, try_import_tfp
|
| 13 |
+
|
| 14 |
+
_, tf, _ = try_import_tf()
|
| 15 |
+
tfp = try_import_tfp()
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class ContinuePredictor(tf.keras.Model):
    """The world-model network sub-component used to predict the `continue` flags.

    Predicted continue flags are used to produce "dream data" to learn the policy in.

    The continue flags are predicted via a linear output used to parameterize a
    Bernoulli distribution, from which simply the mode is used (no stochastic
    sampling!). In other words, if the sigmoid of the output of the linear layer is
    >0.5, we predict a continuation of the episode, otherwise we predict an episode
    terminal.
    """

    def __init__(self, *, model_size: str = "XS"):
        """Initializes a ContinuePredictor instance.

        Args:
            model_size: The "Model Size" used according to [1] Appendix B.
                Determines the exact size of the underlying MLP.
        """
        super().__init__(name="continue_predictor")
        self.model_size = model_size
        # Single-output MLP: its last (linear) layer produces the Bernoulli logit.
        self.mlp = MLP(model_size=model_size, output_layer_size=1)

        # Trace self.call with a fixed input signature so it is compiled once
        # (avoids retracing for every new batch size; batch dim stays dynamic).
        # Use the global mixed-precision compute dtype if one is set, else f32.
        dl_type = tf.keras.mixed_precision.global_policy().compute_dtype or tf.float32
        self.call = tf.function(
            input_signature=[
                # h: [B, dim(h)] deterministic GRU state.
                tf.TensorSpec(shape=[None, get_gru_units(model_size)], dtype=dl_type),
                # z: [B, num_categoricals, num_classes] stochastic representation.
                tf.TensorSpec(
                    shape=[
                        None,
                        get_num_z_categoricals(model_size),
                        get_num_z_classes(model_size),
                    ],
                    dtype=dl_type,
                ),
            ]
        )(self.call)

    def call(self, h, z):
        """Performs a forward pass through the continue predictor.

        Args:
            h: The deterministic hidden state of the sequence model. [B, dim(h)].
            z: The stochastic discrete representations of the original
                observation input. [B, num_categoricals, num_classes].

        Returns:
            A tuple of (continue_, bernoulli): the deterministic mode of the
            Bernoulli (1.0 = continue, 0.0 = terminal, as float32) and the
            Bernoulli distribution object itself.
        """
        # Flatten last two dims of z.
        assert len(z.shape) == 3
        z_shape = tf.shape(z)
        z = tf.reshape(z, shape=(z_shape[0], -1))
        assert len(z.shape) == 2
        out = tf.concat([h, z], axis=-1)
        # Re-assert the static feature size (lost by the dynamic reshape above)
        # so downstream layers can build against a known last dimension.
        out.set_shape(
            [
                None,
                (
                    get_num_z_categoricals(self.model_size)
                    * get_num_z_classes(self.model_size)
                    + get_gru_units(self.model_size)
                ),
            ]
        )
        # Send h-cat-z through MLP.
        out = self.mlp(out)
        # Remove the extra [B, 1] dimension at the end to get a proper Bernoulli
        # distribution. Otherwise, tfp will think that the batch dims are [B, 1]
        # where they should be just [B]. Cast to float32 in case the MLP ran
        # under a mixed-precision policy.
        logits = tf.cast(tf.squeeze(out, axis=-1), tf.float32)
        # Create the Bernoulli distribution object.
        bernoulli = tfp.distributions.Bernoulli(logits=logits, dtype=tf.float32)

        # Take the mode (greedy, deterministic "sample").
        continue_ = bernoulli.mode()

        # Return Bernoulli sample (whether to continue) OR (continue?, Bernoulli prob).
        return continue_, bernoulli
|