diff --git a/.gitattributes b/.gitattributes index 2b04037090d13ae8fad315e9702817de175b1be2..7c997f5173693385cf8f34df08526d910db5986d 100644 --- a/.gitattributes +++ b/.gitattributes @@ -175,3 +175,4 @@ tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/_ .venv/lib/python3.11/site-packages/ray/jars/ray_dist.jar filter=lfs diff=lfs merge=lfs -text .venv/lib/python3.11/site-packages/ray/_private/__pycache__/worker.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text .venv/lib/python3.11/site-packages/ray/_private/__pycache__/test_utils.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text +.venv/lib/python3.11/site-packages/ray/_private/thirdparty/pynvml/__pycache__/pynvml.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text diff --git a/.venv/lib/python3.11/site-packages/ray/_private/thirdparty/pynvml/__pycache__/pynvml.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/_private/thirdparty/pynvml/__pycache__/pynvml.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..585aec6750dec82942d931457abb7d9446298822 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/_private/thirdparty/pynvml/__pycache__/pynvml.cpython-311.pyc @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ebb34d8a5e73fa6657fb50dde3c5afc10ca55bef89431f9fbe15555295f4da0e +size 168124 diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..fe4eb9b9a10fb5482dce8bbb4b89589207cbc84f --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/__init__.py @@ -0,0 +1,12 @@ +from ray.rllib.algorithms.appo.appo import APPO, APPOConfig +from ray.rllib.algorithms.appo.appo_tf_policy import APPOTF1Policy, APPOTF2Policy +from ray.rllib.algorithms.appo.appo_torch_policy import APPOTorchPolicy + +__all__ = [ + "APPO", + "APPOConfig", + # 
@OldAPIStack + "APPOTF1Policy", + "APPOTF2Policy", + "APPOTorchPolicy", +] diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..faf324e310c7da70cc66b3a38077b9bd6f131a15 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/__pycache__/appo.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/__pycache__/appo.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3373ddf001b94947662a300c4208872d70759ffd Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/__pycache__/appo.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/__pycache__/appo_learner.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/__pycache__/appo_learner.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..361a97b1a678da06572e9364d3f990254b478edc Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/__pycache__/appo_learner.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/__pycache__/appo_rl_module.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/__pycache__/appo_rl_module.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1dd9017244d2b9b8479958a20f19cb34841bcca7 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/__pycache__/appo_rl_module.cpython-311.pyc differ diff --git 
a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/__pycache__/appo_tf_policy.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/__pycache__/appo_tf_policy.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d97032d0c7ce61a2f7ebb822576f6cd40fbe4ff0 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/__pycache__/appo_tf_policy.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/__pycache__/appo_torch_policy.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/__pycache__/appo_torch_policy.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4cad22493c3280c8028cc6b4338d31a478d478ff Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/__pycache__/appo_torch_policy.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/__pycache__/default_appo_rl_module.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/__pycache__/default_appo_rl_module.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..22ec0c8b6fe65d136fb8d459ac3929905a8c4f20 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/__pycache__/default_appo_rl_module.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/__pycache__/utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/__pycache__/utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..387a216f2e081375a2096e5b1cbf1183cce7e04e Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/__pycache__/utils.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/appo.py 
b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/appo.py new file mode 100644 index 0000000000000000000000000000000000000000..68267b876637a09c4e116a726e9f85ea0a049e8c --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/appo.py @@ -0,0 +1,434 @@ +"""Asynchronous Proximal Policy Optimization (APPO) + +The algorithm is described in [1] (under the name of "IMPACT"): + +Detailed documentation: +https://docs.ray.io/en/master/rllib-algorithms.html#appo + +[1] IMPACT: Importance Weighted Asynchronous Architectures with Clipped Target Networks. +Luo et al. 2020 +https://arxiv.org/pdf/1912.00167 +""" + +from typing import Optional, Type +import logging + +from ray.rllib.algorithms.algorithm_config import AlgorithmConfig, NotProvided +from ray.rllib.algorithms.impala.impala import IMPALA, IMPALAConfig +from ray.rllib.core.rl_module.rl_module import RLModuleSpec +from ray.rllib.policy.policy import Policy +from ray.rllib.utils.annotations import override +from ray.rllib.utils.deprecation import DEPRECATED_VALUE, deprecation_warning +from ray.rllib.utils.metrics import ( + LAST_TARGET_UPDATE_TS, + NUM_AGENT_STEPS_SAMPLED, + NUM_ENV_STEPS_SAMPLED, + NUM_TARGET_UPDATES, +) +from ray.rllib.utils.metrics import LEARNER_STATS_KEY + +logger = logging.getLogger(__name__) + + +LEARNER_RESULTS_KL_KEY = "mean_kl_loss" +LEARNER_RESULTS_CURR_KL_COEFF_KEY = "curr_kl_coeff" +OLD_ACTION_DIST_KEY = "old_action_dist" + + +class APPOConfig(IMPALAConfig): + """Defines a configuration class from which an APPO Algorithm can be built. + + .. testcode:: + + from ray.rllib.algorithms.appo import APPOConfig + config = ( + APPOConfig() + .training(lr=0.01, grad_clip=30.0, train_batch_size_per_learner=50) + ) + config = config.learners(num_learners=1) + config = config.env_runners(num_env_runners=1) + config = config.environment("CartPole-v1") + + # Build an Algorithm object from the config and run 1 training iteration. 
+ algo = config.build() + algo.train() + del algo + + .. testcode:: + + from ray.rllib.algorithms.appo import APPOConfig + from ray import air + from ray import tune + + config = APPOConfig() + # Update the config object. + config = config.training(lr=tune.grid_search([0.001,])) + # Set the config object's env. + config = config.environment(env="CartPole-v1") + # Use to_dict() to get the old-style python config dict when running with tune. + tune.Tuner( + "APPO", + run_config=air.RunConfig( + stop={"training_iteration": 1}, + verbose=0, + ), + param_space=config.to_dict(), + + ).fit() + + .. testoutput:: + :hide: + + ... + """ + + def __init__(self, algo_class=None): + """Initializes a APPOConfig instance.""" + self.exploration_config = { + # The Exploration class to use. In the simplest case, this is the name + # (str) of any class present in the `rllib.utils.exploration` package. + # You can also provide the python class directly or the full location + # of your class (e.g. "ray.rllib.utils.exploration.epsilon_greedy. + # EpsilonGreedy"). + "type": "StochasticSampling", + # Add constructor kwargs here (if any). + } + + super().__init__(algo_class=algo_class or APPO) + + # fmt: off + # __sphinx_doc_begin__ + # APPO specific settings: + self.vtrace = True + self.use_gae = True + self.lambda_ = 1.0 + self.clip_param = 0.4 + self.use_kl_loss = False + self.kl_coeff = 1.0 + self.kl_target = 0.01 + self.target_worker_clipping = 2.0 + + # Circular replay buffer settings. + # Used in [1] for discrete action tasks: + # `circular_buffer_num_batches=4` and `circular_buffer_iterations_per_batch=2` + # For cont. action tasks: + # `circular_buffer_num_batches=16` and `circular_buffer_iterations_per_batch=20` + self.circular_buffer_num_batches = 4 + self.circular_buffer_iterations_per_batch = 2 + + # Override some of IMPALAConfig's default values with APPO-specific values. 
+ self.num_env_runners = 2 + self.target_network_update_freq = 2 + self.broadcast_interval = 1 + self.grad_clip = 40.0 + # Note: Only when using enable_rl_module_and_learner=True can the clipping mode + # be configured by the user. On the old API stack, RLlib will always clip by + # global_norm, no matter the value of `grad_clip_by`. + self.grad_clip_by = "global_norm" + + self.opt_type = "adam" + self.lr = 0.0005 + self.decay = 0.99 + self.momentum = 0.0 + self.epsilon = 0.1 + self.vf_loss_coeff = 0.5 + self.entropy_coeff = 0.01 + self.tau = 1.0 + # __sphinx_doc_end__ + # fmt: on + + self.lr_schedule = None # @OldAPIStack + self.entropy_coeff_schedule = None # @OldAPIStack + self.num_gpus = 0 # @OldAPIStack + self.num_multi_gpu_tower_stacks = 1 # @OldAPIStack + self.minibatch_buffer_size = 1 # @OldAPIStack + self.replay_proportion = 0.0 # @OldAPIStack + self.replay_buffer_num_slots = 100 # @OldAPIStack + self.learner_queue_size = 16 # @OldAPIStack + self.learner_queue_timeout = 300 # @OldAPIStack + + # Deprecated keys. + self.target_update_frequency = DEPRECATED_VALUE + self.use_critic = DEPRECATED_VALUE + + @override(IMPALAConfig) + def training( + self, + *, + vtrace: Optional[bool] = NotProvided, + use_gae: Optional[bool] = NotProvided, + lambda_: Optional[float] = NotProvided, + clip_param: Optional[float] = NotProvided, + use_kl_loss: Optional[bool] = NotProvided, + kl_coeff: Optional[float] = NotProvided, + kl_target: Optional[float] = NotProvided, + target_network_update_freq: Optional[int] = NotProvided, + tau: Optional[float] = NotProvided, + target_worker_clipping: Optional[float] = NotProvided, + circular_buffer_num_batches: Optional[int] = NotProvided, + circular_buffer_iterations_per_batch: Optional[int] = NotProvided, + # Deprecated keys. + target_update_frequency=DEPRECATED_VALUE, + use_critic=DEPRECATED_VALUE, + **kwargs, + ) -> "APPOConfig": + """Sets the training related configuration. 
+ + Args: + vtrace: Whether to use V-trace weighted advantages. If false, PPO GAE + advantages will be used instead. + use_gae: If true, use the Generalized Advantage Estimator (GAE) + with a value function, see https://arxiv.org/pdf/1506.02438.pdf. + Only applies if vtrace=False. + lambda_: GAE (lambda) parameter. + clip_param: PPO surrogate slipping parameter. + use_kl_loss: Whether to use the KL-term in the loss function. + kl_coeff: Coefficient for weighting the KL-loss term. + kl_target: Target term for the KL-term to reach (via adjusting the + `kl_coeff` automatically). + target_network_update_freq: NOTE: This parameter is only applicable on + the new API stack. The frequency with which to update the target + policy network from the main trained policy network. The metric + used is `NUM_ENV_STEPS_TRAINED_LIFETIME` and the unit is `n` (see [1] + 4.1.1), where: `n = [circular_buffer_num_batches (N)] * + [circular_buffer_iterations_per_batch (K)] * [train batch size]` + For example, if you set `target_network_update_freq=2`, and N=4, K=2, + and `train_batch_size_per_learner=500`, then the target net is updated + every 2*4*2*500=8000 trained env steps (every 16 batch updates on each + learner). + The authors in [1] suggests that this setting is robust to a range of + choices (try values between 0.125 and 4). + target_network_update_freq: The frequency to update the target policy and + tune the kl loss coefficients that are used during training. After + setting this parameter, the algorithm waits for at least + `target_network_update_freq` number of environment samples to be trained + on before updating the target networks and tune the kl loss + coefficients. NOTE: This parameter is only applicable when using the + Learner API (enable_rl_module_and_learner=True). + tau: The factor by which to update the target policy network towards + the current policy network. Can range between 0 and 1. + e.g. 
updated_param = tau * current_param + (1 - tau) * target_param + target_worker_clipping: The maximum value for the target-worker-clipping + used for computing the IS ratio, described in [1] + IS = min(π(i) / π(target), ρ) * (π / π(i)) + circular_buffer_num_batches: The number of train batches that fit + into the circular buffer. Each such train batch can be sampled for + training max. `circular_buffer_iterations_per_batch` times. + circular_buffer_iterations_per_batch: The number of times any train + batch in the circular buffer can be sampled for training. A batch gets + evicted from the buffer either if it's the oldest batch in the buffer + and a new batch is added OR if the batch reaches this max. number of + being sampled. + + Returns: + This updated AlgorithmConfig object. + """ + if target_update_frequency != DEPRECATED_VALUE: + deprecation_warning( + old="target_update_frequency", + new="target_network_update_freq", + error=True, + ) + if use_critic != DEPRECATED_VALUE: + deprecation_warning( + old="use_critic", + help="`use_critic` no longer supported! APPO always uses a value " + "function (critic).", + error=True, + ) + + # Pass kwargs onto super's `training()` method. 
+ super().training(**kwargs) + + if vtrace is not NotProvided: + self.vtrace = vtrace + if use_gae is not NotProvided: + self.use_gae = use_gae + if lambda_ is not NotProvided: + self.lambda_ = lambda_ + if clip_param is not NotProvided: + self.clip_param = clip_param + if use_kl_loss is not NotProvided: + self.use_kl_loss = use_kl_loss + if kl_coeff is not NotProvided: + self.kl_coeff = kl_coeff + if kl_target is not NotProvided: + self.kl_target = kl_target + if target_network_update_freq is not NotProvided: + self.target_network_update_freq = target_network_update_freq + if tau is not NotProvided: + self.tau = tau + if target_worker_clipping is not NotProvided: + self.target_worker_clipping = target_worker_clipping + if circular_buffer_num_batches is not NotProvided: + self.circular_buffer_num_batches = circular_buffer_num_batches + if circular_buffer_iterations_per_batch is not NotProvided: + self.circular_buffer_iterations_per_batch = ( + circular_buffer_iterations_per_batch + ) + + return self + + @override(IMPALAConfig) + def validate(self) -> None: + super().validate() + + # On new API stack, circular buffer should be used, not `minibatch_buffer_size`. + if self.enable_rl_module_and_learner: + if self.minibatch_buffer_size != 1 or self.replay_proportion != 0.0: + self._value_error( + "`minibatch_buffer_size/replay_proportion` not valid on new API " + "stack with APPO! " + "Use `circular_buffer_num_batches` for the number of train batches " + "in the circular buffer. To change the maximum number of times " + "any batch may be sampled, set " + "`circular_buffer_iterations_per_batch`." + ) + if self.num_multi_gpu_tower_stacks != 1: + self._value_error( + "`num_multi_gpu_tower_stacks` not supported on new API stack with " + "APPO! In order to train on multi-GPU, use " + "`config.learners(num_learners=[number of GPUs], " + "num_gpus_per_learner=1)`. 
To scale the throughput of batch-to-GPU-" + "pre-loading on each of your `Learners`, set " + "`num_gpu_loader_threads` to a higher number (recommended values: " + "1-8)." + ) + if self.learner_queue_size != 16: + self._value_error( + "`learner_queue_size` not supported on new API stack with " + "APPO! In order set the size of the circular buffer (which acts as " + "a 'learner queue'), use " + "`config.training(circular_buffer_num_batches=..)`. To change the " + "maximum number of times any batch may be sampled, set " + "`config.training(circular_buffer_iterations_per_batch=..)`." + ) + + @override(IMPALAConfig) + def get_default_learner_class(self): + if self.framework_str == "torch": + from ray.rllib.algorithms.appo.torch.appo_torch_learner import ( + APPOTorchLearner, + ) + + return APPOTorchLearner + elif self.framework_str in ["tf2", "tf"]: + raise ValueError( + "TensorFlow is no longer supported on the new API stack! " + "Use `framework='torch'`." + ) + else: + raise ValueError( + f"The framework {self.framework_str} is not supported. " + "Use `framework='torch'`." + ) + + @override(IMPALAConfig) + def get_default_rl_module_spec(self) -> RLModuleSpec: + if self.framework_str == "torch": + from ray.rllib.algorithms.appo.torch.appo_torch_rl_module import ( + APPOTorchRLModule as RLModule, + ) + else: + raise ValueError( + f"The framework {self.framework_str} is not supported. " + "Use either 'torch' or 'tf2'." + ) + + return RLModuleSpec(module_class=RLModule) + + @property + @override(AlgorithmConfig) + def _model_config_auto_includes(self): + return super()._model_config_auto_includes | {"vf_share_layers": False} + + +class APPO(IMPALA): + def __init__(self, config, *args, **kwargs): + """Initializes an APPO instance.""" + super().__init__(config, *args, **kwargs) + + # After init: Initialize target net. + + # TODO(avnishn): Does this need to happen in __init__? 
I think we can move it + # to setup() + if not self.config.enable_rl_module_and_learner: + self.env_runner.foreach_policy_to_train(lambda p, _: p.update_target()) + + @override(IMPALA) + def training_step(self) -> None: + if self.config.enable_rl_module_and_learner: + return super().training_step() + + train_results = super().training_step() + # Update the target network and the KL coefficient for the APPO-loss. + # The target network update frequency is calculated automatically by the product + # of `num_epochs` setting (usually 1 for APPO) and `minibatch_buffer_size`. + last_update = self._counters[LAST_TARGET_UPDATE_TS] + cur_ts = self._counters[ + ( + NUM_AGENT_STEPS_SAMPLED + if self.config.count_steps_by == "agent_steps" + else NUM_ENV_STEPS_SAMPLED + ) + ] + target_update_freq = self.config.num_epochs * self.config.minibatch_buffer_size + if cur_ts - last_update > target_update_freq: + self._counters[NUM_TARGET_UPDATES] += 1 + self._counters[LAST_TARGET_UPDATE_TS] = cur_ts + + # Update our target network. + self.env_runner.foreach_policy_to_train(lambda p, _: p.update_target()) + + # Also update the KL-coefficient for the APPO loss, if necessary. + if self.config.use_kl_loss: + + def update(pi, pi_id): + assert LEARNER_STATS_KEY not in train_results, ( + "{} should be nested under policy id key".format( + LEARNER_STATS_KEY + ), + train_results, + ) + if pi_id in train_results: + kl = train_results[pi_id][LEARNER_STATS_KEY].get("kl") + assert kl is not None, (train_results, pi_id) + # Make the actual `Policy.update_kl()` call. + pi.update_kl(kl) + else: + logger.warning("No data for {}, not updating kl".format(pi_id)) + + # Update KL on all trainable policies within the local (trainer) + # Worker. 
+ self.env_runner.foreach_policy_to_train(update) + + return train_results + + @classmethod + @override(IMPALA) + def get_default_config(cls) -> AlgorithmConfig: + return APPOConfig() + + @classmethod + @override(IMPALA) + def get_default_policy_class( + cls, config: AlgorithmConfig + ) -> Optional[Type[Policy]]: + if config["framework"] == "torch": + from ray.rllib.algorithms.appo.appo_torch_policy import APPOTorchPolicy + + return APPOTorchPolicy + elif config["framework"] == "tf": + if config.enable_rl_module_and_learner: + raise ValueError( + "RLlib's RLModule and Learner API is not supported for" + " tf1. Use " + "framework='tf2' instead." + ) + from ray.rllib.algorithms.appo.appo_tf_policy import APPOTF1Policy + + return APPOTF1Policy + else: + from ray.rllib.algorithms.appo.appo_tf_policy import APPOTF2Policy + + return APPOTF2Policy diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/appo_learner.py b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/appo_learner.py new file mode 100644 index 0000000000000000000000000000000000000000..235bc823209f8a47c9fe189c0c9cd5b6b4804170 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/appo_learner.py @@ -0,0 +1,147 @@ +import abc +from typing import Any, Dict, Optional + +from ray.rllib.algorithms.appo.appo import APPOConfig +from ray.rllib.algorithms.appo.utils import CircularBuffer +from ray.rllib.algorithms.impala.impala_learner import IMPALALearner +from ray.rllib.core.learner.learner import Learner +from ray.rllib.core.learner.utils import update_target_network +from ray.rllib.core.rl_module.apis import TargetNetworkAPI, ValueFunctionAPI +from ray.rllib.core.rl_module.multi_rl_module import MultiRLModuleSpec +from ray.rllib.core.rl_module.rl_module import RLModuleSpec +from ray.rllib.utils.annotations import override +from ray.rllib.utils.lambda_defaultdict import LambdaDefaultDict +from ray.rllib.utils.metrics import ( + LAST_TARGET_UPDATE_TS, + 
NUM_ENV_STEPS_TRAINED_LIFETIME, + NUM_MODULE_STEPS_TRAINED, + NUM_TARGET_UPDATES, +) +from ray.rllib.utils.schedules.scheduler import Scheduler +from ray.rllib.utils.typing import ModuleID, ShouldModuleBeUpdatedFn + + +class APPOLearner(IMPALALearner): + """Adds KL coeff updates via `after_gradient_based_update()` to IMPALA logic. + + Framework-specific subclasses must override `_update_module_kl_coeff()`. + """ + + @override(IMPALALearner) + def build(self): + self._learner_thread_in_queue = CircularBuffer( + num_batches=self.config.circular_buffer_num_batches, + iterations_per_batch=self.config.circular_buffer_iterations_per_batch, + ) + + super().build() + + # Make target networks. + self.module.foreach_module( + lambda mid, mod: ( + mod.make_target_networks() + if isinstance(mod, TargetNetworkAPI) + else None + ) + ) + + # The current kl coefficients per module as (framework specific) tensor + # variables. + self.curr_kl_coeffs_per_module: LambdaDefaultDict[ + ModuleID, Scheduler + ] = LambdaDefaultDict( + lambda module_id: self._get_tensor_variable( + self.config.get_config_for_module(module_id).kl_coeff + ) + ) + + @override(Learner) + def add_module( + self, + *, + module_id: ModuleID, + module_spec: RLModuleSpec, + config_overrides: Optional[Dict] = None, + new_should_module_be_updated: Optional[ShouldModuleBeUpdatedFn] = None, + ) -> MultiRLModuleSpec: + marl_spec = super().add_module( + module_id=module_id, + module_spec=module_spec, + config_overrides=config_overrides, + new_should_module_be_updated=new_should_module_be_updated, + ) + # Create target networks for added Module, if applicable. 
+ if isinstance(self.module[module_id].unwrapped(), TargetNetworkAPI): + self.module[module_id].unwrapped().make_target_networks() + return marl_spec + + @override(IMPALALearner) + def remove_module(self, module_id: str) -> MultiRLModuleSpec: + marl_spec = super().remove_module(module_id) + self.curr_kl_coeffs_per_module.pop(module_id) + return marl_spec + + @override(Learner) + def after_gradient_based_update(self, *, timesteps: Dict[str, Any]) -> None: + """Updates the target Q Networks.""" + super().after_gradient_based_update(timesteps=timesteps) + + # TODO (sven): Maybe we should have a `after_gradient_based_update` + # method per module? + curr_timestep = timesteps.get(NUM_ENV_STEPS_TRAINED_LIFETIME, 0) + for module_id, module in self.module._rl_modules.items(): + config = self.config.get_config_for_module(module_id) + + last_update_ts_key = (module_id, LAST_TARGET_UPDATE_TS) + if isinstance(module.unwrapped(), TargetNetworkAPI) and ( + curr_timestep - self.metrics.peek(last_update_ts_key, default=0) + >= ( + config.target_network_update_freq + * config.circular_buffer_num_batches + * config.circular_buffer_iterations_per_batch + * config.train_batch_size_per_learner + ) + ): + for ( + main_net, + target_net, + ) in module.unwrapped().get_target_network_pairs(): + update_target_network( + main_net=main_net, + target_net=target_net, + tau=config.tau, + ) + # Increase lifetime target network update counter by one. + self.metrics.log_value((module_id, NUM_TARGET_UPDATES), 1, reduce="sum") + # Update the (single-value -> window=1) last updated timestep metric. 
+ self.metrics.log_value(last_update_ts_key, curr_timestep, window=1) + + if ( + config.use_kl_loss + and self.metrics.peek((module_id, NUM_MODULE_STEPS_TRAINED), default=0) + > 0 + ): + self._update_module_kl_coeff(module_id=module_id, config=config) + + @classmethod + @override(Learner) + def rl_module_required_apis(cls) -> list[type]: + # In order for a PPOLearner to update an RLModule, it must implement the + # following APIs: + return [TargetNetworkAPI, ValueFunctionAPI] + + @abc.abstractmethod + def _update_module_kl_coeff(self, module_id: ModuleID, config: APPOConfig) -> None: + """Dynamically update the KL loss coefficients of each module. + + The update is completed using the mean KL divergence between the action + distributions current policy and old policy of each module. That action + distribution is computed during the most recent update/call to `compute_loss`. + + Args: + module_id: The module whose KL loss coefficient to update. + config: The AlgorithmConfig specific to the given `module_id`. + """ + + +AppoLearner = APPOLearner diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/appo_rl_module.py b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/appo_rl_module.py new file mode 100644 index 0000000000000000000000000000000000000000..5a2f59f9f201bb2ca46e6136ec7e3882a30bd0bd --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/appo_rl_module.py @@ -0,0 +1,11 @@ +# Backward compat import. 
+from ray.rllib.algorithms.appo.default_appo_rl_module import ( # noqa + DefaultAPPORLModule as APPORLModule, +) +from ray.rllib.utils.deprecation import deprecation_warning + +deprecation_warning( + old="ray.rllib.algorithms.appo.appo_rl_module.APPORLModule", + new="ray.rllib.algorithms.appo.default_appo_rl_module.DefaultAPPORLModule", + error=False, +) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/appo_tf_policy.py b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/appo_tf_policy.py new file mode 100644 index 0000000000000000000000000000000000000000..4af36f099df9216e42e7c9cae039e6a12bddef6f --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/appo_tf_policy.py @@ -0,0 +1,393 @@ +""" +TensorFlow policy class used for APPO. + +Adapted from VTraceTFPolicy to use the PPO surrogate loss. +Keep in sync with changes to VTraceTFPolicy. +""" + +import numpy as np +import logging +import gymnasium as gym +from typing import Dict, List, Optional, Type, Union + +from ray.rllib.algorithms.appo.utils import make_appo_models +from ray.rllib.algorithms.impala import vtrace_tf as vtrace +from ray.rllib.algorithms.impala.impala_tf_policy import ( + _make_time_major, + VTraceClipGradients, + VTraceOptimizer, +) +from ray.rllib.evaluation.postprocessing import ( + compute_bootstrap_value, + compute_gae_for_sample_batch, + Postprocessing, +) +from ray.rllib.models.tf.tf_action_dist import Categorical +from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.policy.dynamic_tf_policy_v2 import DynamicTFPolicyV2 +from ray.rllib.policy.eager_tf_policy_v2 import EagerTFPolicyV2 +from ray.rllib.policy.tf_mixins import ( + EntropyCoeffSchedule, + LearningRateSchedule, + KLCoeffMixin, + ValueNetworkMixin, + GradStatsMixin, + TargetNetworkMixin, +) +from ray.rllib.models.modelv2 import ModelV2 +from ray.rllib.models.tf.tf_action_dist import TFActionDistribution +from ray.rllib.utils.annotations import ( + 
override, +) +from ray.rllib.utils.framework import try_import_tf +from ray.rllib.utils.tf_utils import explained_variance +from ray.rllib.utils.typing import TensorType + +tf1, tf, tfv = try_import_tf() + +logger = logging.getLogger(__name__) + + +# TODO (sven): Deprecate once APPO and IMPALA fully on RLModules/Learner APIs. +def get_appo_tf_policy(name: str, base: type) -> type: + """Construct an APPOTFPolicy inheriting either dynamic or eager base policies. + + Args: + base: Base class for this policy. DynamicTFPolicyV2 or EagerTFPolicyV2. + + Returns: + A TF Policy to be used with Impala. + """ + + class APPOTFPolicy( + VTraceClipGradients, + VTraceOptimizer, + LearningRateSchedule, + KLCoeffMixin, + EntropyCoeffSchedule, + ValueNetworkMixin, + TargetNetworkMixin, + GradStatsMixin, + base, + ): + def __init__( + self, + observation_space, + action_space, + config, + existing_model=None, + existing_inputs=None, + ): + # First thing first, enable eager execution if necessary. + base.enable_eager_execution_if_necessary() + + # Although this is a no-op, we call __init__ here to make it clear + # that base.__init__ will use the make_model() call. + VTraceClipGradients.__init__(self) + VTraceOptimizer.__init__(self) + + # Initialize base class. + base.__init__( + self, + observation_space, + action_space, + config, + existing_inputs=existing_inputs, + existing_model=existing_model, + ) + + # TF LearningRateSchedule depends on self.framework, so initialize + # after base.__init__() is called. + LearningRateSchedule.__init__(self, config["lr"], config["lr_schedule"]) + EntropyCoeffSchedule.__init__( + self, config["entropy_coeff"], config["entropy_coeff_schedule"] + ) + ValueNetworkMixin.__init__(self, config) + KLCoeffMixin.__init__(self, config) + + GradStatsMixin.__init__(self) + + # Note: this is a bit ugly, but loss and optimizer initialization must + # happen after all the MixIns are initialized. 
+ self.maybe_initialize_optimizer_and_loss() + + # Initiate TargetNetwork ops after loss initialization. + TargetNetworkMixin.__init__(self) + + @override(base) + def make_model(self) -> ModelV2: + return make_appo_models(self) + + @override(base) + def loss( + self, + model: Union[ModelV2, "tf.keras.Model"], + dist_class: Type[TFActionDistribution], + train_batch: SampleBatch, + ) -> Union[TensorType, List[TensorType]]: + model_out, _ = model(train_batch) + action_dist = dist_class(model_out, model) + + if isinstance(self.action_space, gym.spaces.Discrete): + is_multidiscrete = False + output_hidden_shape = [self.action_space.n] + elif isinstance(self.action_space, gym.spaces.multi_discrete.MultiDiscrete): + is_multidiscrete = True + output_hidden_shape = self.action_space.nvec.astype(np.int32) + else: + is_multidiscrete = False + output_hidden_shape = 1 + + def make_time_major(*args, **kw): + return _make_time_major( + self, train_batch.get(SampleBatch.SEQ_LENS), *args, **kw + ) + + actions = train_batch[SampleBatch.ACTIONS] + dones = train_batch[SampleBatch.TERMINATEDS] + rewards = train_batch[SampleBatch.REWARDS] + behaviour_logits = train_batch[SampleBatch.ACTION_DIST_INPUTS] + + target_model_out, _ = self.target_model(train_batch) + prev_action_dist = dist_class(behaviour_logits, self.model) + values = self.model.value_function() + values_time_major = make_time_major(values) + bootstrap_values_time_major = make_time_major( + train_batch[SampleBatch.VALUES_BOOTSTRAPPED] + ) + bootstrap_value = bootstrap_values_time_major[-1] + + if self.is_recurrent(): + max_seq_len = tf.reduce_max(train_batch[SampleBatch.SEQ_LENS]) + mask = tf.sequence_mask(train_batch[SampleBatch.SEQ_LENS], max_seq_len) + mask = tf.reshape(mask, [-1]) + mask = make_time_major(mask) + + def reduce_mean_valid(t): + return tf.reduce_mean(tf.boolean_mask(t, mask)) + + else: + reduce_mean_valid = tf.reduce_mean + + if self.config["vtrace"]: + logger.debug("Using V-Trace surrogate loss 
(vtrace=True)") + + # Prepare actions for loss. + loss_actions = ( + actions if is_multidiscrete else tf.expand_dims(actions, axis=1) + ) + + old_policy_behaviour_logits = tf.stop_gradient(target_model_out) + old_policy_action_dist = dist_class(old_policy_behaviour_logits, model) + + # Prepare KL for Loss + mean_kl = make_time_major(old_policy_action_dist.multi_kl(action_dist)) + + unpacked_behaviour_logits = tf.split( + behaviour_logits, output_hidden_shape, axis=1 + ) + unpacked_old_policy_behaviour_logits = tf.split( + old_policy_behaviour_logits, output_hidden_shape, axis=1 + ) + + # Compute vtrace on the CPU for better perf. + with tf.device("/cpu:0"): + vtrace_returns = vtrace.multi_from_logits( + behaviour_policy_logits=make_time_major( + unpacked_behaviour_logits + ), + target_policy_logits=make_time_major( + unpacked_old_policy_behaviour_logits + ), + actions=tf.unstack(make_time_major(loss_actions), axis=2), + discounts=tf.cast( + ~make_time_major(tf.cast(dones, tf.bool)), + tf.float32, + ) + * self.config["gamma"], + rewards=make_time_major(rewards), + values=values_time_major, + bootstrap_value=bootstrap_value, + dist_class=Categorical if is_multidiscrete else dist_class, + model=model, + clip_rho_threshold=tf.cast( + self.config["vtrace_clip_rho_threshold"], tf.float32 + ), + clip_pg_rho_threshold=tf.cast( + self.config["vtrace_clip_pg_rho_threshold"], tf.float32 + ), + ) + + actions_logp = make_time_major(action_dist.logp(actions)) + prev_actions_logp = make_time_major(prev_action_dist.logp(actions)) + old_policy_actions_logp = make_time_major( + old_policy_action_dist.logp(actions) + ) + + is_ratio = tf.clip_by_value( + tf.math.exp(prev_actions_logp - old_policy_actions_logp), 0.0, 2.0 + ) + logp_ratio = is_ratio * tf.exp(actions_logp - prev_actions_logp) + self._is_ratio = is_ratio + + advantages = vtrace_returns.pg_advantages + surrogate_loss = tf.minimum( + advantages * logp_ratio, + advantages + * tf.clip_by_value( + logp_ratio, + 1 - 
self.config["clip_param"], + 1 + self.config["clip_param"], + ), + ) + + action_kl = ( + tf.reduce_mean(mean_kl, axis=0) if is_multidiscrete else mean_kl + ) + mean_kl_loss = reduce_mean_valid(action_kl) + mean_policy_loss = -reduce_mean_valid(surrogate_loss) + + # The value function loss. + value_targets = vtrace_returns.vs + delta = values_time_major - value_targets + mean_vf_loss = 0.5 * reduce_mean_valid(tf.math.square(delta)) + + # The entropy loss. + actions_entropy = make_time_major(action_dist.multi_entropy()) + mean_entropy = reduce_mean_valid(actions_entropy) + + else: + logger.debug("Using PPO surrogate loss (vtrace=False)") + + # Prepare KL for Loss + mean_kl = make_time_major(prev_action_dist.multi_kl(action_dist)) + + logp_ratio = tf.math.exp( + make_time_major(action_dist.logp(actions)) + - make_time_major(prev_action_dist.logp(actions)) + ) + + advantages = make_time_major(train_batch[Postprocessing.ADVANTAGES]) + surrogate_loss = tf.minimum( + advantages * logp_ratio, + advantages + * tf.clip_by_value( + logp_ratio, + 1 - self.config["clip_param"], + 1 + self.config["clip_param"], + ), + ) + + action_kl = ( + tf.reduce_mean(mean_kl, axis=0) if is_multidiscrete else mean_kl + ) + mean_kl_loss = reduce_mean_valid(action_kl) + mean_policy_loss = -reduce_mean_valid(surrogate_loss) + + # The value function loss. + value_targets = make_time_major( + train_batch[Postprocessing.VALUE_TARGETS] + ) + delta = values_time_major - value_targets + mean_vf_loss = 0.5 * reduce_mean_valid(tf.math.square(delta)) + + # The entropy loss. + mean_entropy = reduce_mean_valid( + make_time_major(action_dist.multi_entropy()) + ) + + # The summed weighted loss. + total_loss = mean_policy_loss - mean_entropy * self.entropy_coeff + # Optional KL loss. + if self.config["use_kl_loss"]: + total_loss += self.kl_coeff * mean_kl_loss + # Optional vf loss (or in a separate term due to separate + # optimizers/networks). 
+ loss_wo_vf = total_loss + if not self.config["_separate_vf_optimizer"]: + total_loss += mean_vf_loss * self.config["vf_loss_coeff"] + + # Store stats in policy for stats_fn. + self._total_loss = total_loss + self._loss_wo_vf = loss_wo_vf + self._mean_policy_loss = mean_policy_loss + # Backward compatibility: Deprecate policy._mean_kl. + self._mean_kl_loss = self._mean_kl = mean_kl_loss + self._mean_vf_loss = mean_vf_loss + self._mean_entropy = mean_entropy + self._value_targets = value_targets + + # Return one total loss or two losses: vf vs rest (policy + kl). + if self.config["_separate_vf_optimizer"]: + return loss_wo_vf, mean_vf_loss + else: + return total_loss + + @override(base) + def stats_fn(self, train_batch: SampleBatch) -> Dict[str, TensorType]: + values_batched = _make_time_major( + self, + train_batch.get(SampleBatch.SEQ_LENS), + self.model.value_function(), + ) + + stats_dict = { + "cur_lr": tf.cast(self.cur_lr, tf.float64), + "total_loss": self._total_loss, + "policy_loss": self._mean_policy_loss, + "entropy": self._mean_entropy, + "var_gnorm": tf.linalg.global_norm(self.model.trainable_variables()), + "vf_loss": self._mean_vf_loss, + "vf_explained_var": explained_variance( + tf.reshape(self._value_targets, [-1]), + tf.reshape(values_batched, [-1]), + ), + "entropy_coeff": tf.cast(self.entropy_coeff, tf.float64), + } + + if self.config["vtrace"]: + is_stat_mean, is_stat_var = tf.nn.moments(self._is_ratio, [0, 1]) + stats_dict["mean_IS"] = is_stat_mean + stats_dict["var_IS"] = is_stat_var + + if self.config["use_kl_loss"]: + stats_dict["kl"] = self._mean_kl_loss + stats_dict["KL_Coeff"] = self.kl_coeff + + return stats_dict + + @override(base) + def postprocess_trajectory( + self, + sample_batch: SampleBatch, + other_agent_batches: Optional[SampleBatch] = None, + episode=None, + ): + # Call super's postprocess_trajectory first. 
+ # sample_batch = super().postprocess_trajectory( + # sample_batch, other_agent_batches, episode + # ) + + if not self.config["vtrace"]: + sample_batch = compute_gae_for_sample_batch( + self, sample_batch, other_agent_batches, episode + ) + else: + # Add the Columns.VALUES_BOOTSTRAPPED column, which we'll need + # inside the loss for vtrace calculations. + sample_batch = compute_bootstrap_value(sample_batch, self) + + return sample_batch + + @override(base) + def get_batch_divisibility_req(self) -> int: + return self.config["rollout_fragment_length"] + + APPOTFPolicy.__name__ = name + APPOTFPolicy.__qualname__ = name + + return APPOTFPolicy + + +APPOTF1Policy = get_appo_tf_policy("APPOTF1Policy", DynamicTFPolicyV2) +APPOTF2Policy = get_appo_tf_policy("APPOTF2Policy", EagerTFPolicyV2) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/appo_torch_policy.py b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/appo_torch_policy.py new file mode 100644 index 0000000000000000000000000000000000000000..1d28138c8c25d066fd3f701fae8c5493c2cc55d9 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/appo_torch_policy.py @@ -0,0 +1,412 @@ +""" +PyTorch policy class used for APPO. + +Adapted from VTraceTFPolicy to use the PPO surrogate loss. +Keep in sync with changes to VTraceTFPolicy. 
+""" + +import gymnasium as gym +import numpy as np +import logging +from typing import Any, Dict, List, Optional, Type, Union + +import ray +from ray.rllib.algorithms.appo.utils import make_appo_models +import ray.rllib.algorithms.impala.vtrace_torch as vtrace +from ray.rllib.algorithms.impala.impala_torch_policy import ( + make_time_major, + VTraceOptimizer, +) +from ray.rllib.evaluation.postprocessing import ( + compute_bootstrap_value, + compute_gae_for_sample_batch, + Postprocessing, +) +from ray.rllib.models.action_dist import ActionDistribution +from ray.rllib.models.modelv2 import ModelV2 +from ray.rllib.models.torch.torch_action_dist import ( + TorchDistributionWrapper, + TorchCategorical, +) +from ray.rllib.models.torch.torch_modelv2 import TorchModelV2 +from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.policy.torch_mixins import ( + EntropyCoeffSchedule, + LearningRateSchedule, + KLCoeffMixin, + ValueNetworkMixin, + TargetNetworkMixin, +) +from ray.rllib.policy.torch_policy_v2 import TorchPolicyV2 +from ray.rllib.utils.annotations import override +from ray.rllib.utils.framework import try_import_torch +from ray.rllib.utils.numpy import convert_to_numpy +from ray.rllib.utils.torch_utils import ( + apply_grad_clipping, + explained_variance, + global_norm, + sequence_mask, +) +from ray.rllib.utils.typing import TensorType + +torch, nn = try_import_torch() + +logger = logging.getLogger(__name__) + + +# TODO (sven): Deprecate once APPO and IMPALA fully on RLModules/Learner APIs. 
+class APPOTorchPolicy( + VTraceOptimizer, + LearningRateSchedule, + EntropyCoeffSchedule, + KLCoeffMixin, + ValueNetworkMixin, + TargetNetworkMixin, + TorchPolicyV2, +): + """PyTorch policy class used with APPO.""" + + def __init__(self, observation_space, action_space, config): + config = dict(ray.rllib.algorithms.appo.appo.APPOConfig().to_dict(), **config) + config["enable_rl_module_and_learner"] = False + config["enable_env_runner_and_connector_v2"] = False + + # Although this is a no-op, we call __init__ here to make it clear + # that base.__init__ will use the make_model() call. + VTraceOptimizer.__init__(self) + + lr_schedule_additional_args = [] + if config.get("_separate_vf_optimizer"): + lr_schedule_additional_args = ( + [config["_lr_vf"][0][1], config["_lr_vf"]] + if isinstance(config["_lr_vf"], (list, tuple)) + else [config["_lr_vf"], None] + ) + LearningRateSchedule.__init__( + self, config["lr"], config["lr_schedule"], *lr_schedule_additional_args + ) + + TorchPolicyV2.__init__( + self, + observation_space, + action_space, + config, + max_seq_len=config["model"]["max_seq_len"], + ) + + EntropyCoeffSchedule.__init__( + self, config["entropy_coeff"], config["entropy_coeff_schedule"] + ) + ValueNetworkMixin.__init__(self, config) + KLCoeffMixin.__init__(self, config) + + self._initialize_loss_from_dummy_batch() + + # Initiate TargetNetwork ops after loss initialization. + TargetNetworkMixin.__init__(self) + + @override(TorchPolicyV2) + def init_view_requirements(self): + self.view_requirements = self._get_default_view_requirements() + + @override(TorchPolicyV2) + def make_model(self) -> ModelV2: + return make_appo_models(self) + + @override(TorchPolicyV2) + def loss( + self, + model: ModelV2, + dist_class: Type[ActionDistribution], + train_batch: SampleBatch, + ) -> Union[TensorType, List[TensorType]]: + """Constructs the loss for APPO. + + With IS modifications and V-trace for Advantage Estimation. 
+ + Args: + model (ModelV2): The Model to calculate the loss for. + dist_class (Type[ActionDistribution]): The action distr. class. + train_batch: The training data. + + Returns: + Union[TensorType, List[TensorType]]: A single loss tensor or a list + of loss tensors. + """ + target_model = self.target_models[model] + + model_out, _ = model(train_batch) + action_dist = dist_class(model_out, model) + + if isinstance(self.action_space, gym.spaces.Discrete): + is_multidiscrete = False + output_hidden_shape = [self.action_space.n] + elif isinstance(self.action_space, gym.spaces.multi_discrete.MultiDiscrete): + is_multidiscrete = True + output_hidden_shape = self.action_space.nvec.astype(np.int32) + else: + is_multidiscrete = False + output_hidden_shape = 1 + + def _make_time_major(*args, **kwargs): + return make_time_major( + self, train_batch.get(SampleBatch.SEQ_LENS), *args, **kwargs + ) + + actions = train_batch[SampleBatch.ACTIONS] + dones = train_batch[SampleBatch.TERMINATEDS] + rewards = train_batch[SampleBatch.REWARDS] + behaviour_logits = train_batch[SampleBatch.ACTION_DIST_INPUTS] + + target_model_out, _ = target_model(train_batch) + + prev_action_dist = dist_class(behaviour_logits, model) + values = model.value_function() + values_time_major = _make_time_major(values) + bootstrap_values_time_major = _make_time_major( + train_batch[SampleBatch.VALUES_BOOTSTRAPPED] + ) + bootstrap_value = bootstrap_values_time_major[-1] + + if self.is_recurrent(): + max_seq_len = torch.max(train_batch[SampleBatch.SEQ_LENS]) + mask = sequence_mask(train_batch[SampleBatch.SEQ_LENS], max_seq_len) + mask = torch.reshape(mask, [-1]) + mask = _make_time_major(mask) + num_valid = torch.sum(mask) + + def reduce_mean_valid(t): + return torch.sum(t[mask]) / num_valid + + else: + reduce_mean_valid = torch.mean + + if self.config["vtrace"]: + logger.debug("Using V-Trace surrogate loss (vtrace=True)") + + old_policy_behaviour_logits = target_model_out.detach() + old_policy_action_dist = 
dist_class(old_policy_behaviour_logits, model) + + if isinstance(output_hidden_shape, (list, tuple, np.ndarray)): + unpacked_behaviour_logits = torch.split( + behaviour_logits, list(output_hidden_shape), dim=1 + ) + unpacked_old_policy_behaviour_logits = torch.split( + old_policy_behaviour_logits, list(output_hidden_shape), dim=1 + ) + else: + unpacked_behaviour_logits = torch.chunk( + behaviour_logits, output_hidden_shape, dim=1 + ) + unpacked_old_policy_behaviour_logits = torch.chunk( + old_policy_behaviour_logits, output_hidden_shape, dim=1 + ) + + # Prepare actions for loss. + loss_actions = ( + actions if is_multidiscrete else torch.unsqueeze(actions, dim=1) + ) + + # Prepare KL for loss. + action_kl = _make_time_major(old_policy_action_dist.kl(action_dist)) + + # Compute vtrace on the CPU for better perf. + vtrace_returns = vtrace.multi_from_logits( + behaviour_policy_logits=_make_time_major(unpacked_behaviour_logits), + target_policy_logits=_make_time_major( + unpacked_old_policy_behaviour_logits + ), + actions=torch.unbind(_make_time_major(loss_actions), dim=2), + discounts=(1.0 - _make_time_major(dones).float()) + * self.config["gamma"], + rewards=_make_time_major(rewards), + values=values_time_major, + bootstrap_value=bootstrap_value, + dist_class=TorchCategorical if is_multidiscrete else dist_class, + model=model, + clip_rho_threshold=self.config["vtrace_clip_rho_threshold"], + clip_pg_rho_threshold=self.config["vtrace_clip_pg_rho_threshold"], + ) + + actions_logp = _make_time_major(action_dist.logp(actions)) + prev_actions_logp = _make_time_major(prev_action_dist.logp(actions)) + old_policy_actions_logp = _make_time_major( + old_policy_action_dist.logp(actions) + ) + is_ratio = torch.clamp( + torch.exp(prev_actions_logp - old_policy_actions_logp), 0.0, 2.0 + ) + logp_ratio = is_ratio * torch.exp(actions_logp - prev_actions_logp) + self._is_ratio = is_ratio + + advantages = vtrace_returns.pg_advantages.to(logp_ratio.device) + surrogate_loss = torch.min( 
+ advantages * logp_ratio, + advantages + * torch.clamp( + logp_ratio, + 1 - self.config["clip_param"], + 1 + self.config["clip_param"], + ), + ) + + mean_kl_loss = reduce_mean_valid(action_kl) + mean_policy_loss = -reduce_mean_valid(surrogate_loss) + + # The value function loss. + value_targets = vtrace_returns.vs.to(values_time_major.device) + delta = values_time_major - value_targets + mean_vf_loss = 0.5 * reduce_mean_valid(torch.pow(delta, 2.0)) + + # The entropy loss. + mean_entropy = reduce_mean_valid(_make_time_major(action_dist.entropy())) + + else: + logger.debug("Using PPO surrogate loss (vtrace=False)") + + # Prepare KL for Loss + action_kl = _make_time_major(prev_action_dist.kl(action_dist)) + + actions_logp = _make_time_major(action_dist.logp(actions)) + prev_actions_logp = _make_time_major(prev_action_dist.logp(actions)) + logp_ratio = torch.exp(actions_logp - prev_actions_logp) + + advantages = _make_time_major(train_batch[Postprocessing.ADVANTAGES]) + surrogate_loss = torch.min( + advantages * logp_ratio, + advantages + * torch.clamp( + logp_ratio, + 1 - self.config["clip_param"], + 1 + self.config["clip_param"], + ), + ) + + mean_kl_loss = reduce_mean_valid(action_kl) + mean_policy_loss = -reduce_mean_valid(surrogate_loss) + + # The value function loss. + value_targets = _make_time_major(train_batch[Postprocessing.VALUE_TARGETS]) + delta = values_time_major - value_targets + mean_vf_loss = 0.5 * reduce_mean_valid(torch.pow(delta, 2.0)) + + # The entropy loss. + mean_entropy = reduce_mean_valid(_make_time_major(action_dist.entropy())) + + # The summed weighted loss. + total_loss = mean_policy_loss - mean_entropy * self.entropy_coeff + # Optional additional KL Loss + if self.config["use_kl_loss"]: + total_loss += self.kl_coeff * mean_kl_loss + + # Optional vf loss (or in a separate term due to separate + # optimizers/networks). 
+ loss_wo_vf = total_loss + if not self.config["_separate_vf_optimizer"]: + total_loss += mean_vf_loss * self.config["vf_loss_coeff"] + + # Store values for stats function in model (tower), such that for + # multi-GPU, we do not override them during the parallel loss phase. + model.tower_stats["total_loss"] = total_loss + model.tower_stats["mean_policy_loss"] = mean_policy_loss + model.tower_stats["mean_kl_loss"] = mean_kl_loss + model.tower_stats["mean_vf_loss"] = mean_vf_loss + model.tower_stats["mean_entropy"] = mean_entropy + model.tower_stats["value_targets"] = value_targets + model.tower_stats["vf_explained_var"] = explained_variance( + torch.reshape(value_targets, [-1]), + torch.reshape(values_time_major, [-1]), + ) + + # Return one total loss or two losses: vf vs rest (policy + kl). + if self.config["_separate_vf_optimizer"]: + return loss_wo_vf, mean_vf_loss + else: + return total_loss + + @override(TorchPolicyV2) + def stats_fn(self, train_batch: SampleBatch) -> Dict[str, TensorType]: + """Stats function for APPO. Returns a dict with important loss stats. + + Args: + policy: The Policy to generate stats for. + train_batch: The SampleBatch (already) used for training. + + Returns: + Dict[str, TensorType]: The stats dict. 
+ """ + stats_dict = { + "cur_lr": self.cur_lr, + "total_loss": torch.mean(torch.stack(self.get_tower_stats("total_loss"))), + "policy_loss": torch.mean( + torch.stack(self.get_tower_stats("mean_policy_loss")) + ), + "entropy": torch.mean(torch.stack(self.get_tower_stats("mean_entropy"))), + "entropy_coeff": self.entropy_coeff, + "var_gnorm": global_norm(self.model.trainable_variables()), + "vf_loss": torch.mean(torch.stack(self.get_tower_stats("mean_vf_loss"))), + "vf_explained_var": torch.mean( + torch.stack(self.get_tower_stats("vf_explained_var")) + ), + } + + if self.config["vtrace"]: + is_stat_mean = torch.mean(self._is_ratio, [0, 1]) + is_stat_var = torch.var(self._is_ratio, [0, 1]) + stats_dict["mean_IS"] = is_stat_mean + stats_dict["var_IS"] = is_stat_var + + if self.config["use_kl_loss"]: + stats_dict["kl"] = torch.mean( + torch.stack(self.get_tower_stats("mean_kl_loss")) + ) + stats_dict["KL_Coeff"] = self.kl_coeff + + return convert_to_numpy(stats_dict) + + @override(TorchPolicyV2) + def extra_action_out( + self, + input_dict: Dict[str, TensorType], + state_batches: List[TensorType], + model: TorchModelV2, + action_dist: TorchDistributionWrapper, + ) -> Dict[str, TensorType]: + return {SampleBatch.VF_PREDS: model.value_function()} + + @override(TorchPolicyV2) + def postprocess_trajectory( + self, + sample_batch: SampleBatch, + other_agent_batches: Optional[Dict[Any, SampleBatch]] = None, + episode=None, + ): + # Call super's postprocess_trajectory first. + # sample_batch = super().postprocess_trajectory( + # sample_batch, other_agent_batches, episode + # ) + + # Do all post-processing always with no_grad(). + # Not using this here will introduce a memory leak + # in torch (issue #6962). 
+ with torch.no_grad(): + if not self.config["vtrace"]: + sample_batch = compute_gae_for_sample_batch( + self, sample_batch, other_agent_batches, episode + ) + else: + # Add the SampleBatch.VALUES_BOOTSTRAPPED column, which we'll need + # inside the loss for vtrace calculations. + sample_batch = compute_bootstrap_value(sample_batch, self) + + return sample_batch + + @override(TorchPolicyV2) + def extra_grad_process( + self, optimizer: "torch.optim.Optimizer", loss: TensorType + ) -> Dict[str, TensorType]: + return apply_grad_clipping(self, optimizer, loss) + + @override(TorchPolicyV2) + def get_batch_divisibility_req(self) -> int: + return self.config["rollout_fragment_length"] diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/default_appo_rl_module.py b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/default_appo_rl_module.py new file mode 100644 index 0000000000000000000000000000000000000000..ec9bfa5ab0a36b5908826953b2d6010401a0541a --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/default_appo_rl_module.py @@ -0,0 +1,59 @@ +import abc +from typing import Any, Dict, List, Tuple + +from ray.rllib.algorithms.ppo.default_ppo_rl_module import DefaultPPORLModule +from ray.rllib.core.learner.utils import make_target_network +from ray.rllib.core.models.base import ACTOR +from ray.rllib.core.models.tf.encoder import ENCODER_OUT +from ray.rllib.core.rl_module.apis import ( + TARGET_NETWORK_ACTION_DIST_INPUTS, + TargetNetworkAPI, +) +from ray.rllib.utils.typing import NetworkType + +from ray.rllib.utils.annotations import ( + override, + OverrideToImplementCustomLogic_CallToSuperRecommended, +) +from ray.util.annotations import DeveloperAPI + + +@DeveloperAPI +class DefaultAPPORLModule(DefaultPPORLModule, TargetNetworkAPI, abc.ABC): + """Default RLModule used by APPO, if user does not specify a custom RLModule. 
+ + Users who want to train their RLModules with APPO may implement any RLModule + (or TorchRLModule) subclass as long as the custom class also implements the + `ValueFunctionAPI` (see ray.rllib.core.rl_module.apis.value_function_api.py) + and the `TargetNetworkAPI` (see + ray.rllib.core.rl_module.apis.target_network_api.py). + """ + + @override(TargetNetworkAPI) + def make_target_networks(self): + self._old_encoder = make_target_network(self.encoder) + self._old_pi = make_target_network(self.pi) + + @override(TargetNetworkAPI) + def get_target_network_pairs(self) -> List[Tuple[NetworkType, NetworkType]]: + return [ + (self.encoder, self._old_encoder), + (self.pi, self._old_pi), + ] + + @override(TargetNetworkAPI) + def forward_target(self, batch: Dict[str, Any]) -> Dict[str, Any]: + old_pi_inputs_encoded = self._old_encoder(batch)[ENCODER_OUT][ACTOR] + old_action_dist_logits = self._old_pi(old_pi_inputs_encoded) + return {TARGET_NETWORK_ACTION_DIST_INPUTS: old_action_dist_logits} + + @OverrideToImplementCustomLogic_CallToSuperRecommended + @override(DefaultPPORLModule) + def get_non_inference_attributes(self) -> List[str]: + # Get the NON inference-only attributes from the parent class + # `PPOTorchRLModule`. + ret = super().get_non_inference_attributes() + # Add the two (APPO) target networks to it (NOT needed in + # inference-only mode). 
+ ret += ["_old_encoder", "_old_pi"] + return ret diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/torch/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/torch/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/torch/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/torch/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a1e025a80609d8bff4fed2839677ae85e39b88f2 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/torch/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/torch/__pycache__/appo_torch_learner.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/torch/__pycache__/appo_torch_learner.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2cc24c6d9c3841fd7cc2be830470e4be2f45f806 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/torch/__pycache__/appo_torch_learner.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/torch/__pycache__/appo_torch_rl_module.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/torch/__pycache__/appo_torch_rl_module.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0f096b901f806cae8f32dac1c89ffb4bb36a9ea2 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/torch/__pycache__/appo_torch_rl_module.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/torch/__pycache__/default_appo_torch_rl_module.cpython-311.pyc 
b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/torch/__pycache__/default_appo_torch_rl_module.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..50bd57b116861cafb917e2b583ee5eeed1f69d54 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/torch/__pycache__/default_appo_torch_rl_module.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/torch/appo_torch_learner.py b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/torch/appo_torch_learner.py new file mode 100644 index 0000000000000000000000000000000000000000..62a4198952ecd3ee08947f57b2f5d18d9bd20232 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/appo/torch/appo_torch_learner.py @@ -0,0 +1,234 @@ +"""Asynchronous Proximal Policy Optimization (APPO) + +The algorithm is described in [1] (under the name of "IMPACT"): + +Detailed documentation: +https://docs.ray.io/en/master/rllib-algorithms.html#appo + +[1] IMPACT: Importance Weighted Asynchronous Architectures with Clipped Target Networks. +Luo et al. 
2020 +https://arxiv.org/pdf/1912.00167 +""" +from typing import Dict + +from ray.rllib.algorithms.appo.appo import ( + APPOConfig, + LEARNER_RESULTS_CURR_KL_COEFF_KEY, + LEARNER_RESULTS_KL_KEY, +) +from ray.rllib.algorithms.appo.appo_learner import APPOLearner +from ray.rllib.algorithms.impala.torch.impala_torch_learner import IMPALATorchLearner +from ray.rllib.algorithms.impala.torch.vtrace_torch_v2 import ( + make_time_major, + vtrace_torch, +) +from ray.rllib.core.columns import Columns +from ray.rllib.core.learner.learner import POLICY_LOSS_KEY, VF_LOSS_KEY, ENTROPY_KEY +from ray.rllib.core.rl_module.apis import ( + TARGET_NETWORK_ACTION_DIST_INPUTS, + TargetNetworkAPI, + ValueFunctionAPI, +) +from ray.rllib.utils.annotations import override +from ray.rllib.utils.framework import try_import_torch +from ray.rllib.utils.numpy import convert_to_numpy +from ray.rllib.utils.typing import ModuleID, TensorType + +torch, nn = try_import_torch() + + +class APPOTorchLearner(APPOLearner, IMPALATorchLearner): + """Implements APPO loss / update logic on top of IMPALATorchLearner.""" + + @override(IMPALATorchLearner) + def compute_loss_for_module( + self, + *, + module_id: ModuleID, + config: APPOConfig, + batch: Dict, + fwd_out: Dict[str, TensorType], + ) -> TensorType: + module = self.module[module_id].unwrapped() + assert isinstance(module, TargetNetworkAPI) + assert isinstance(module, ValueFunctionAPI) + + # TODO (sven): Now that we do the +1ts trick to be less vulnerable about + # bootstrap values at the end of rollouts in the new stack, we might make + # this a more flexible, configurable parameter for users, e.g. + # `v_trace_seq_len` (independent of `rollout_fragment_length`). Separation + # of concerns (sampling vs learning). 
+ rollout_frag_or_episode_len = config.get_rollout_fragment_length() + recurrent_seq_len = batch.get("seq_lens") + + loss_mask = batch[Columns.LOSS_MASK].float() + loss_mask_time_major = make_time_major( + loss_mask, + trajectory_len=rollout_frag_or_episode_len, + recurrent_seq_len=recurrent_seq_len, + ) + size_loss_mask = torch.sum(loss_mask) + + values = module.compute_values( + batch, embeddings=fwd_out.get(Columns.EMBEDDINGS) + ) + + action_dist_cls_train = module.get_train_action_dist_cls() + target_policy_dist = action_dist_cls_train.from_logits( + fwd_out[Columns.ACTION_DIST_INPUTS] + ) + + old_target_policy_dist = action_dist_cls_train.from_logits( + module.forward_target(batch)[TARGET_NETWORK_ACTION_DIST_INPUTS] + ) + old_target_policy_actions_logp = old_target_policy_dist.logp( + batch[Columns.ACTIONS] + ) + behaviour_actions_logp = batch[Columns.ACTION_LOGP] + target_actions_logp = target_policy_dist.logp(batch[Columns.ACTIONS]) + + behaviour_actions_logp_time_major = make_time_major( + behaviour_actions_logp, + trajectory_len=rollout_frag_or_episode_len, + recurrent_seq_len=recurrent_seq_len, + ) + target_actions_logp_time_major = make_time_major( + target_actions_logp, + trajectory_len=rollout_frag_or_episode_len, + recurrent_seq_len=recurrent_seq_len, + ) + old_actions_logp_time_major = make_time_major( + old_target_policy_actions_logp, + trajectory_len=rollout_frag_or_episode_len, + recurrent_seq_len=recurrent_seq_len, + ) + rewards_time_major = make_time_major( + batch[Columns.REWARDS], + trajectory_len=rollout_frag_or_episode_len, + recurrent_seq_len=recurrent_seq_len, + ) + values_time_major = make_time_major( + values, + trajectory_len=rollout_frag_or_episode_len, + recurrent_seq_len=recurrent_seq_len, + ) + assert Columns.VALUES_BOOTSTRAPPED not in batch + # Use as bootstrap values the vf-preds in the next "batch row", except + # for the very last row (which doesn't have a next row), for which the + # bootstrap value does not matter b/c it has a 
+1ts value at its end + # anyways. So we chose an arbitrary item (for simplicity of not having to + # move new data to the device). + bootstrap_values = torch.cat( + [ + values_time_major[0][1:], # 0th ts values from "next row" + values_time_major[0][0:1], # <- can use any arbitrary value here + ], + dim=0, + ) + + # The discount factor that is used should be gamma except for timesteps where + # the episode is terminated. In that case, the discount factor should be 0. + discounts_time_major = ( + 1.0 + - make_time_major( + batch[Columns.TERMINATEDS], + trajectory_len=rollout_frag_or_episode_len, + recurrent_seq_len=recurrent_seq_len, + ).float() + ) * config.gamma + + # Note that vtrace will compute the main loop on the CPU for better performance. + vtrace_adjusted_target_values, pg_advantages = vtrace_torch( + target_action_log_probs=old_actions_logp_time_major, + behaviour_action_log_probs=behaviour_actions_logp_time_major, + discounts=discounts_time_major, + rewards=rewards_time_major, + values=values_time_major, + bootstrap_values=bootstrap_values, + clip_pg_rho_threshold=config.vtrace_clip_pg_rho_threshold, + clip_rho_threshold=config.vtrace_clip_rho_threshold, + ) + pg_advantages = pg_advantages * loss_mask_time_major + + # The policy gradients loss. + is_ratio = torch.clip( + torch.exp(behaviour_actions_logp_time_major - old_actions_logp_time_major), + 0.0, + 2.0, + ) + logp_ratio = is_ratio * torch.exp( + target_actions_logp_time_major - behaviour_actions_logp_time_major + ) + + surrogate_loss = torch.minimum( + pg_advantages * logp_ratio, + pg_advantages + * torch.clip(logp_ratio, 1 - config.clip_param, 1 + config.clip_param), + ) + + if config.use_kl_loss: + action_kl = old_target_policy_dist.kl(target_policy_dist) * loss_mask + mean_kl_loss = torch.sum(action_kl) / size_loss_mask + else: + mean_kl_loss = 0.0 + mean_pi_loss = -(torch.sum(surrogate_loss) / size_loss_mask) + + # The baseline loss. 
    @override(APPOLearner)
    def _update_module_kl_coeff(self, module_id: ModuleID, config: APPOConfig) -> None:
        """Adapts the KL coefficient of `module_id` based on the last measured KL.

        Standard PPO-style adaptive-KL rule: if the most recently logged mean KL
        divergence is far above the configured target, increase the coefficient;
        if it is far below, decrease it. The (possibly updated) coefficient is
        then logged under `LEARNER_RESULTS_CURR_KL_COEFF_KEY`.

        Args:
            module_id: The RLModule whose KL coefficient should be updated.
            config: The APPOConfig for this module (provides `kl_target`).
        """
        # Update the current KL value based on the recently measured value.
        # Increase.
        kl = convert_to_numpy(self.metrics.peek((module_id, LEARNER_RESULTS_KL_KEY)))
        # NOTE(review): `kl_coeff_var` looks like a torch tensor/parameter
        # (uses `.data` and `.item()`) — confirm against the learner's build().
        kl_coeff_var = self.curr_kl_coeffs_per_module[module_id]

        if kl > 2.0 * config.kl_target:
            # TODO (Kourosh) why not *2.0?
            kl_coeff_var.data *= 1.5
        # Decrease.
        elif kl < 0.5 * config.kl_target:
            kl_coeff_var.data *= 0.5

        # Log the raw (un-smoothed) new coefficient value (window=1).
        self.metrics.log_value(
            (module_id, LEARNER_RESULTS_CURR_KL_COEFF_KEY),
            kl_coeff_var.item(),
            window=1,
        )
class CircularBuffer:
    """A circular batch-wise buffer as described in [1] for APPO.

    The buffer holds at most N batches, which are sampled at random (uniformly).
    If full and a new batch is added, the oldest batch is discarded. Also, each
    batch currently in the buffer can be sampled at most K times (after which it
    is also discarded).
    """

    def __init__(self, num_batches: int, iterations_per_batch: int):
        """Initializes a CircularBuffer instance.

        Args:
            num_batches: Maximum number of batches held at any time (N in [1]).
            iterations_per_batch: Maximum number of times each batch may be
                sampled before expiring (K, the "replay coefficient", in [1]).
        """
        # N from the paper (buffer size).
        self.num_batches = num_batches
        # K ("replay coefficient") from the paper.
        self.iterations_per_batch = iterations_per_batch

        # Each entry is a mutable list `[batch, k]` where k counts how often the
        # batch has been sampled so far. Expired entries become [None, None].
        self._buffer = deque(maxlen=self.num_batches)
        self._lock = threading.Lock()

        # The number of valid (not expired) entries in this buffer.
        self._num_valid_batches = 0

    def add(self, batch):
        """Adds a batch to the buffer, evicting the oldest one if full.

        Args:
            batch: The batch to add. Must expose an `env_steps()` method.

        Returns:
            The number of env timesteps dropped because a still-valid (not yet
            fully replayed) batch had to be evicted; 0 otherwise.
        """
        dropped_entry = None
        dropped_ts = 0

        # Add buffer and k=0 information to the deque.
        with self._lock:
            len_ = len(self._buffer)
            if len_ == self.num_batches:
                # Deque is full -> the append below evicts the leftmost entry.
                dropped_entry = self._buffer[0]
            self._buffer.append([batch, 0])
            self._num_valid_batches += 1

        # A valid entry (w/ a batch whose k had not reached K yet) was dropped.
        if dropped_entry is not None and dropped_entry[0] is not None:
            # Count the env steps that will now never be replayed again.
            dropped_ts += dropped_entry[0].env_steps() * (
                self.iterations_per_batch - dropped_entry[1]
            )
            self._num_valid_batches -= 1

        return dropped_ts

    def sample(self):
        """Samples a batch uniformly at random from the valid entries.

        Blocks (with a short sleep) while the buffer is empty. Every successful
        sample increments the batch's use-count k; once k reaches K
        (`iterations_per_batch`), the entry is invalidated in place.

        Returns:
            The sampled batch.
        """
        k = entry = batch = None

        while True:
            # Only initially, the buffer may be empty -> Just wait for some time.
            if len(self) == 0:
                time.sleep(0.001)
                continue
            # Sample a random buffer index.
            with self._lock:
                entry = self._buffer[random.randint(0, len(self._buffer) - 1)]
                batch, k = entry
            # Ignore batches that have already been invalidated.
            if batch is not None:
                break

        # Increase k += 1 for this batch.
        # NOTE(review): this increment happens outside the lock; concurrent
        # samplers could race on k — confirm single-sampler usage is intended.
        assert k is not None
        entry[1] += 1

        # This batch has been exhausted (k == K) -> Invalidate it in the buffer.
        if k == self.iterations_per_batch - 1:
            entry[0] = None
            entry[1] = None
            # BUGFIX: the entry just expired, so the number of valid batches
            # must DEcrease. The previous `+= 1` made `len()` grow without
            # bound and mis-report buffer fullness.
            self._num_valid_batches -= 1

        # Return the sampled batch.
        return batch

    def __len__(self) -> int:
        """Returns the number of actually valid (non-expired) batches in the buffer."""
        return self._num_valid_batches
+ return policy.model diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..db5a0179a8a748d588ca27bc66cb55f50c8bd08d Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/__pycache__/dreamerv3.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/__pycache__/dreamerv3.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..68cbdbd64d1308fe10f1f063014075998fe895cc Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/__pycache__/dreamerv3.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/__pycache__/dreamerv3_catalog.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/__pycache__/dreamerv3_catalog.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..079c5c126cf71aa5a06aef003e98a2af6d040709 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/__pycache__/dreamerv3_catalog.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/__pycache__/dreamerv3_learner.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/__pycache__/dreamerv3_learner.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c2b8a5ad22b5e8bea5e2fab3e44b0eba9060e722 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/__pycache__/dreamerv3_learner.cpython-311.pyc differ diff --git 
a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/__pycache__/dreamerv3_rl_module.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/__pycache__/dreamerv3_rl_module.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5c91f1e4dc3279bf2b480d86ef4e8e633135ac8c Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/__pycache__/dreamerv3_rl_module.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ba9e0c8d54b2804686807d3be42c62133ca5ed8c Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/__pycache__/actor_network.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/__pycache__/actor_network.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e1c3e3e919e8aa655c6f3685454476b8ab299eb7 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/__pycache__/actor_network.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/__pycache__/critic_network.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/__pycache__/critic_network.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ee5884634e4d2c4c187a4f34e8aa9202926f487b Binary files /dev/null and 
b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/__pycache__/critic_network.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/__pycache__/disagree_networks.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/__pycache__/disagree_networks.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..811f8b0071bdf62dae8fd1410049032b58aff8b9 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/__pycache__/disagree_networks.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/__pycache__/dreamer_model.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/__pycache__/dreamer_model.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ea5effa280b73ce346c0aaeda43caea6db343d89 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/__pycache__/dreamer_model.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/__pycache__/world_model.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/__pycache__/world_model.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d113329fa719fb91d3ec22c1eb24f30471faa65a Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/__pycache__/world_model.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__init__.py new file mode 100644 index 
0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..537d0c493d2b07e4cb85c7dd7ee1aaccb96572e0 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/cnn_atari.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/cnn_atari.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..da4d5c20cc76a49429b6677344efa919129d7b6b Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/cnn_atari.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/continue_predictor.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/continue_predictor.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ab985a244f2f365bb4170bc72856ac8434f56608 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/continue_predictor.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/conv_transpose_atari.cpython-311.pyc 
b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/conv_transpose_atari.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f0defad84e5a502bb2381f5c6dd6e0717b293961 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/conv_transpose_atari.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/dynamics_predictor.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/dynamics_predictor.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4e66ed73fb3301f87d29a69b49d698022540983d Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/dynamics_predictor.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/mlp.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/mlp.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f9ab1b129c590bbc7bc9f421bc1373d3c42f09c6 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/mlp.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/representation_layer.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/representation_layer.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..906d5cf8a0f720a330b484f1c4a2e573c973c35f Binary files /dev/null and 
b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/representation_layer.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/reward_predictor.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/reward_predictor.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..acdc5132fd7f65279b1d1611207eb25b8a10cf74 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/reward_predictor.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/reward_predictor_layer.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/reward_predictor_layer.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4c3c4e2905b0e2997b857685b25d1e6549034e33 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/reward_predictor_layer.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/sequence_model.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/sequence_model.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d60c5c39f813bf286717c64c48876b04a43b23e2 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/sequence_model.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/__pycache__/vector_decoder.cpython-311.pyc 
"""
[1] Mastering Diverse Domains through World Models - 2023
D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap
https://arxiv.org/pdf/2301.04104v1.pdf
"""
from ray.rllib.algorithms.dreamerv3.tf.models.components.mlp import MLP
from ray.rllib.algorithms.dreamerv3.utils import (
    get_gru_units,
    get_num_z_classes,
    get_num_z_categoricals,
)
from ray.rllib.utils.framework import try_import_tf, try_import_tfp

_, tf, _ = try_import_tf()
tfp = try_import_tfp()


class ContinuePredictor(tf.keras.Model):
    """The world-model network sub-component used to predict the `continue` flags.

    Predicted continue flags are used to produce "dream data" to learn the policy in.

    The continue flags are predicted via a linear output used to parameterize a
    Bernoulli distribution, from which simply the mode is used (no stochastic
    sampling!). In other words, if the sigmoid of the output of the linear layer is
    >0.5, we predict a continuation of the episode, otherwise we predict an episode
    terminal.
    """

    def __init__(self, *, model_size: str = "XS"):
        """Initializes a ContinuePredictor instance.

        Args:
            model_size: The "Model Size" used according to [1] Appendix B.
                Determines the exact size of the underlying MLP.
        """
        super().__init__(name="continue_predictor")
        self.model_size = model_size
        # Single-output MLP producing the Bernoulli logit.
        self.mlp = MLP(model_size=model_size, output_layer_size=1)

        # Trace self.call with a fixed input signature:
        # h: [B, gru_units], z: [B, num_categoricals, num_classes].
        dl_type = tf.keras.mixed_precision.global_policy().compute_dtype or tf.float32
        self.call = tf.function(
            input_signature=[
                tf.TensorSpec(shape=[None, get_gru_units(model_size)], dtype=dl_type),
                tf.TensorSpec(
                    shape=[
                        None,
                        get_num_z_categoricals(model_size),
                        get_num_z_classes(model_size),
                    ],
                    dtype=dl_type,
                ),
            ]
        )(self.call)

    def call(self, h, z):
        """Performs a forward pass through the continue predictor.

        Args:
            h: The deterministic hidden state of the sequence model. [B, dim(h)].
            z: The stochastic discrete representations of the original
                observation input. [B, num_categoricals, num_classes].

        Returns:
            Tuple of the distribution's mode (1.0 -> continue, 0.0 -> terminal;
            float32, shape [B]) and the Bernoulli distribution object itself.
        """
        # Flatten last two dims of z.
        assert len(z.shape) == 3
        z_shape = tf.shape(z)
        z = tf.reshape(z, shape=(z_shape[0], -1))
        assert len(z.shape) == 2
        out = tf.concat([h, z], axis=-1)
        # Pin the static (known at trace time) feature dimension of h-cat-z.
        out.set_shape(
            [
                None,
                (
                    get_num_z_categoricals(self.model_size)
                    * get_num_z_classes(self.model_size)
                    + get_gru_units(self.model_size)
                ),
            ]
        )
        # Send h-cat-z through MLP.
        out = self.mlp(out)
        # Remove the extra [B, 1] dimension at the end to get a proper Bernoulli
        # distribution. Otherwise, tfp will think that the batch dims are [B, 1]
        # where they should be just [B].
        logits = tf.cast(tf.squeeze(out, axis=-1), tf.float32)
        # Create the Bernoulli distribution object.
        bernoulli = tfp.distributions.Bernoulli(logits=logits, dtype=tf.float32)

        # Take the mode (greedy, deterministic "sample").
        continue_ = bernoulli.mode()

        # Return BOTH the greedy continue prediction AND the distribution object.
        return continue_, bernoulli
class DynamicsPredictor(tf.keras.Model):
    """The dynamics (or "prior") network described in [1], producing prior z-states.

    The dynamics net is used to:
    - compute the initial z-state (from the tanh'd initial h-state variable) at the
    beginning of a sequence.
    - compute prior-z-states during dream data generation. Note that during dreaming,
    no actual observations are available and thus no posterior z-states can be
    computed.
    """

    def __init__(
        self,
        *,
        model_size: Optional[str] = "XS",
        num_categoricals: Optional[int] = None,
        num_classes_per_categorical: Optional[int] = None,
    ):
        """Initializes a DynamicsPredictor instance.

        Args:
            model_size: The "Model Size" used according to [1] Appendix B.
                Use None for manually setting the different parameters.
            num_categoricals: Overrides the number of categoricals used in the
                z-states. In [1], 32 is used for any model size.
            num_classes_per_categorical: Overrides the number of classes within each
                categorical used for the z-states. In [1], 32 is used for any model
                dimension.
        """
        super().__init__(name="dynamics_predictor")

        self.mlp = MLP(
            # In author's original code, the Dynamics Net only has a single layer, no
            # matter the model size.
            num_dense_layers=1,
            model_size=model_size,
            output_layer_size=None,
        )
        # The (prior) z-state generating layer.
        self.representation_layer = RepresentationLayer(
            model_size=model_size,
            num_categoricals=num_categoricals,
            num_classes_per_categorical=num_classes_per_categorical,
        )

        # Trace self.call with a fixed input signature: h is [B, gru_units].
        dl_type = tf.keras.mixed_precision.global_policy().compute_dtype or tf.float32
        self.call = tf.function(
            input_signature=[
                tf.TensorSpec(shape=[None, get_gru_units(model_size)], dtype=dl_type),
            ]
        )(self.call)

    def call(self, h):
        """Performs a forward pass through the dynamics (or "prior") network.

        Args:
            h: The deterministic hidden state of the sequence model.

        Returns:
            Tuple consisting of a differentiable z-sample and the probabilities for
            the categorical distribution (in the shape of [B, num_categoricals,
            num_classes]) that created this sample.
        """
        # Send internal state through MLP.
        out = self.mlp(h)
        # Generate a z vector (stochastic, discrete sample).
        return self.representation_layer(out)
class MLP(tf.keras.Model):
    """An MLP primitive used by several DreamerV3 components and described in [1] Fig 5.

    MLP=multi-layer perceptron.

    See Appendix B in [1] for the MLP sizes depending on the given `model_size`.
    """

    def __init__(
        self,
        *,
        model_size: Optional[str] = "XS",
        num_dense_layers: Optional[int] = None,
        dense_hidden_units: Optional[int] = None,
        output_layer_size=None,
        trainable: bool = True,
        name: Optional[str] = None
    ):
        """Initializes an MLP instance.

        Args:
            model_size: The "Model Size" used according to [1] Appendix B.
                Use None for manually setting the different network sizes.
            num_dense_layers: The number of hidden layers in the MLP. If None,
                will use `model_size` and appendix B to figure out this value.
            dense_hidden_units: The number of nodes in each hidden layer. If None,
                will use `model_size` and appendix B to figure out this value.
            output_layer_size: The size of an optional linear (no activation) output
                layer. If None, no output layer will be added on top of the MLP dense
                stack.
            trainable: Whether the MLP is trainable (updated by an optimizer) or not.
            name: An optional name for the MLP keras model.
        """
        super().__init__(name=name or "mlp")

        n_layers = get_num_dense_layers(model_size, override=num_dense_layers)
        n_units = get_dense_hidden_units(model_size, override=dense_hidden_units)

        # Hidden dense layers carry neither bias nor activation of their own:
        # each is always followed by a LayerNormalization, after which the
        # activation (SiLU) is applied.
        self.dense_layers = [
            tf.keras.layers.Dense(
                n_units,
                trainable=trainable,
                activation=None,
                use_bias=False,
            )
            for _ in range(n_layers)
        ]
        # One LayerNormalization per hidden dense layer.
        self.layer_normalizations = [
            tf.keras.layers.LayerNormalization(trainable=trainable)
            for _ in range(n_layers)
        ]

        # Optional linear (no activation) output layer on top of the stack.
        self.output_layer = (
            tf.keras.layers.Dense(
                output_layer_size, activation=None, trainable=trainable
            )
            if output_layer_size
            else None
        )

    def call(self, input_):
        """Performs a forward pass through this MLP.

        Args:
            input_: The input tensor for the MLP dense stack.
        """
        out = input_
        # In this order per hidden layer: dense, normalization, activation.
        for dense, norm in zip(self.dense_layers, self.layer_normalizations):
            out = tf.nn.silu(norm(dense(out)))
        return out if self.output_layer is None else self.output_layer(out)
"""
[1] Mastering Diverse Domains through World Models - 2023
D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap
https://arxiv.org/pdf/2301.04104v1.pdf

[2] Mastering Atari with Discrete World Models - 2021
D. Hafner, T. Lillicrap, M. Norouzi, J. Ba
https://arxiv.org/pdf/2010.02193.pdf
"""
from typing import Optional

from ray.rllib.algorithms.dreamerv3.utils import (
    get_num_z_categoricals,
    get_num_z_classes,
)
from ray.rllib.utils.framework import try_import_tf, try_import_tfp

_, tf, _ = try_import_tf()
tfp = try_import_tfp()


class RepresentationLayer(tf.keras.layers.Layer):
    """A representation (z-state) generating layer.

    The value for z is the result of sampling from a categorical distribution with
    shape B x `num_classes`. So a computed z-state consists of `num_categoricals`
    one-hot vectors, each of size `num_classes_per_categorical`.
    """

    def __init__(
        self,
        *,
        model_size: Optional[str] = "XS",
        num_categoricals: Optional[int] = None,
        num_classes_per_categorical: Optional[int] = None,
    ):
        """Initializes a RepresentationLayer instance.

        Args:
            model_size: The "Model Size" used according to [1] Appendix B.
                Use None for manually setting the different parameters.
            num_categoricals: Overrides the number of categoricals used in the
                z-states. In [1], 32 is used for any model size.
            num_classes_per_categorical: Overrides the number of classes within each
                categorical used for the z-states. In [1], 32 is used for any model
                dimension.
        """
        # Resolve sizes before super().__init__ so they can be baked into the
        # layer name below.
        self.num_categoricals = get_num_z_categoricals(
            model_size, override=num_categoricals
        )
        self.num_classes_per_categorical = get_num_z_classes(
            model_size, override=num_classes_per_categorical
        )

        super().__init__(
            name=f"z{self.num_categoricals}x{self.num_classes_per_categorical}"
        )

        # Linear layer producing the flat logits for all categoricals at once.
        self.z_generating_layer = tf.keras.layers.Dense(
            self.num_categoricals * self.num_classes_per_categorical,
            activation=None,
        )

    def call(self, inputs):
        """Produces a discrete, differentiable z-sample from some 1D input tensor.

        Pushes the input_ tensor through our dense layer, which outputs
        32(B=num categoricals)*32(c=num classes) logits. Logits are used to:

        1) sample stochastically
        2) compute probs (via softmax)
        3) make sure the sampling step is differentiable (see [2] Algorithm 1):
            sample=one_hot(draw(logits))
            probs=softmax(logits)
            sample=sample + probs - stop_grad(probs)
            -> Now sample has the gradients of the probs.

        Args:
            inputs: The input to our z-generating layer. This might be a) the combined
                (concatenated) outputs of the (image?) encoder + the last hidden
                deterministic state, or b) the output of the dynamics predictor MLP
                network.

        Returns:
            Tuple consisting of a differentiable z-sample and the probabilities for the
            categorical distribution (in the shape of [B, num_categoricals,
            num_classes]) that created this sample.
        """
        # Compute the logits (no activation) for our `num_categoricals` Categorical
        # distributions (with `num_classes_per_categorical` classes each).
        logits = self.z_generating_layer(inputs)
        # Reshape the logits to [B, num_categoricals, num_classes]
        logits = tf.reshape(
            logits,
            shape=(-1, self.num_categoricals, self.num_classes_per_categorical),
        )
        # Compute the probs (based on logits) via softmax (in float32 for
        # numerical stability under mixed precision).
        probs = tf.nn.softmax(tf.cast(logits, tf.float32))
        # Add the unimix weighting (1% uniform) to the probs.
        # See [1]: "Unimix categoricals: We parameterize the categorical distributions
        # for the world model representations and dynamics, as well as for the actor
        # network, as mixtures of 1% uniform and 99% neural network output to ensure
        # a minimal amount of probability mass on every class and thus keep log
        # probabilities and KL divergences well behaved."
        probs = 0.99 * probs + 0.01 * (1.0 / self.num_classes_per_categorical)

        # Danijar's code does: distr = [Distr class](logits=tf.log(probs)).
        # Not sure why we don't directly use the already available probs instead.
        logits = tf.math.log(probs)

        # Create the distribution object using the unimix'd logits.
        # Independent(..., reinterpreted_batch_ndims=1) folds the categoricals
        # dim into the event shape, leaving batch shape [B].
        distribution = tfp.distributions.Independent(
            tfp.distributions.OneHotCategorical(logits=logits),
            reinterpreted_batch_ndims=1,
        )

        # Draw a one-hot sample (B, num_categoricals, num_classes).
        sample = tf.cast(distribution.sample(), tf.float32)
        # Make sure we can take gradients "straight-through" the sampling step
        # by adding the probs and subtracting the sg(probs). Note that `sample`
        # does not have any gradients as it's the result of a Categorical sample step,
        # which is non-differentiable (other than say a Gaussian sample step).
        # [1] "The representations are sampled from a vector of softmax distributions
        # and we take straight-through gradients through the sampling step."
        # [2] Algorithm 1.
        differentiable_sample = tf.cast(
            (tf.stop_gradient(sample) + probs - tf.stop_gradient(probs)),
            tf.keras.mixed_precision.global_policy().compute_dtype or tf.float32,
        )
        return differentiable_sample, probs
Lillicrap +https://arxiv.org/pdf/2301.04104v1.pdf +""" +from ray.rllib.algorithms.dreamerv3.tf.models.components.mlp import MLP +from ray.rllib.algorithms.dreamerv3.tf.models.components.reward_predictor_layer import ( + RewardPredictorLayer, +) +from ray.rllib.algorithms.dreamerv3.utils import ( + get_gru_units, + get_num_z_categoricals, + get_num_z_classes, +) +from ray.rllib.utils.framework import try_import_tf + +_, tf, _ = try_import_tf() + + +class RewardPredictor(tf.keras.Model): + """Wrapper of MLP and RewardPredictorLayer to predict rewards for the world model. + + Predicted rewards are used to produce "dream data" to learn the policy in. + """ + + def __init__( + self, + *, + model_size: str = "XS", + num_buckets: int = 255, + lower_bound: float = -20.0, + upper_bound: float = 20.0, + ): + """Initializes a RewardPredictor instance. + + Args: + model_size: The "Model Size" used according to [1] Appendinx B. + Determines the exact size of the underlying MLP. + num_buckets: The number of buckets to create. Note that the number of + possible symlog'd outcomes from the used distribution is + `num_buckets` + 1: + lower_bound --bucket-- o[1] --bucket-- o[2] ... --bucket-- upper_bound + o=outcomes + lower_bound=o[0] + upper_bound=o[num_buckets] + lower_bound: The symlog'd lower bound for a possible reward value. + Note that a value of -20.0 here already allows individual (actual env) + rewards to be as low as -400M. Buckets will be created between + `lower_bound` and `upper_bound`. + upper_bound: The symlog'd upper bound for a possible reward value. + Note that a value of +20.0 here already allows individual (actual env) + rewards to be as high as 400M. Buckets will be created between + `lower_bound` and `upper_bound`. 
+ """ + super().__init__(name="reward_predictor") + self.model_size = model_size + + self.mlp = MLP( + model_size=model_size, + output_layer_size=None, + ) + self.reward_layer = RewardPredictorLayer( + num_buckets=num_buckets, + lower_bound=lower_bound, + upper_bound=upper_bound, + ) + + # Trace self.call. + dl_type = tf.keras.mixed_precision.global_policy().compute_dtype or tf.float32 + self.call = tf.function( + input_signature=[ + tf.TensorSpec(shape=[None, get_gru_units(model_size)], dtype=dl_type), + tf.TensorSpec( + shape=[ + None, + get_num_z_categoricals(model_size), + get_num_z_classes(model_size), + ], + dtype=dl_type, + ), + ] + )(self.call) + + def call(self, h, z): + """Computes the expected reward using N equal sized buckets of possible values. + + Args: + h: The deterministic hidden state of the sequence model. [B, dim(h)]. + z: The stochastic discrete representations of the original + observation input. [B, num_categoricals, num_classes]. + """ + # Flatten last two dims of z. + assert len(z.shape) == 3 + z_shape = tf.shape(z) + z = tf.reshape(z, shape=(z_shape[0], -1)) + assert len(z.shape) == 2 + out = tf.concat([h, z], axis=-1) + out.set_shape( + [ + None, + ( + get_num_z_categoricals(self.model_size) + * get_num_z_classes(self.model_size) + + get_gru_units(self.model_size) + ), + ] + ) + # Send h-cat-z through MLP. + out = self.mlp(out) + # Return a) mean reward OR b) a tuple: (mean reward, logits over the reward + # buckets). 
+ return self.reward_layer(out) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/reward_predictor_layer.py b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/reward_predictor_layer.py new file mode 100644 index 0000000000000000000000000000000000000000..d68f62cb6780f2ef044bb8f091c727b49b16e390 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/reward_predictor_layer.py @@ -0,0 +1,110 @@ +""" +[1] Mastering Diverse Domains through World Models - 2023 +D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap +https://arxiv.org/pdf/2301.04104v1.pdf + +[2] Mastering Atari with Discrete World Models - 2021 +D. Hafner, T. Lillicrap, M. Norouzi, J. Ba +https://arxiv.org/pdf/2010.02193.pdf +""" +from ray.rllib.utils.framework import try_import_tf + +_, tf, _ = try_import_tf() + + +class RewardPredictorLayer(tf.keras.layers.Layer): + """A layer outputting reward predictions using K bins and two-hot encoding. + + This layer is used in two models in DreamerV3: The reward predictor of the world + model and the value function. K is 255 by default (see [1]) and doesn't change + with the model size. + + Possible predicted reward/values range from symexp(-20.0) to symexp(20.0), which + should cover any possible environment. Outputs of this layer are generated by + generating logits/probs via a single linear layer, then interpreting the probs + as weights for a weighted average of the different possible reward (binned) values. + """ + + def __init__( + self, + *, + num_buckets: int = 255, + lower_bound: float = -20.0, + upper_bound: float = 20.0, + trainable: bool = True, + ): + """Initializes a RewardPredictorLayer instance. + + Args: + num_buckets: The number of buckets to create. Note that the number of + possible symlog'd outcomes from the used distribution is + `num_buckets` + 1: + lower_bound --bucket-- o[1] --bucket-- o[2] ... 
--bucket-- upper_bound + o=outcomes + lower_bound=o[0] + upper_bound=o[num_buckets] + lower_bound: The symlog'd lower bound for a possible reward value. + Note that a value of -20.0 here already allows individual (actual env) + rewards to be as low as -400M. Buckets will be created between + `lower_bound` and `upper_bound`. + upper_bound: The symlog'd upper bound for a possible reward value. + Note that a value of +20.0 here already allows individual (actual env) + rewards to be as high as 400M. Buckets will be created between + `lower_bound` and `upper_bound`. + """ + self.num_buckets = num_buckets + super().__init__(name=f"reward_layer_{self.num_buckets}buckets") + + self.lower_bound = lower_bound + self.upper_bound = upper_bound + self.reward_buckets_layer = tf.keras.layers.Dense( + units=self.num_buckets, + activation=None, + # From [1]: + # "We further noticed that the randomly initialized reward predictor and + # critic networks at the start of training can result in large predicted + # rewards that can delay the onset of learning. We initialize the output + # weights of the reward predictor and critic to zeros, which effectively + # alleviates the problem and accelerates early learning." + kernel_initializer="zeros", + bias_initializer="zeros", # zero-bias is default anyways + trainable=trainable, + ) + + def call(self, inputs): + """Computes the expected reward using N equal sized buckets of possible values. + + Args: + inputs: The input tensor for the layer, which computes the reward bucket + weights (logits). [B, dim]. + + Returns: + A tuple consisting of the expected rewards and the logits that parameterize + the tfp `FiniteDiscrete` distribution object. To get the individual bucket + probs, do `[FiniteDiscrete object].probs`. + """ + # Compute the `num_buckets` weights. + assert len(inputs.shape) == 2 + logits = tf.cast(self.reward_buckets_layer(inputs), tf.float32) + # out=[B, `num_buckets`] + + # Compute the expected(!) 
reward using the formula: + # `softmax(Linear(x))` [vectordot] `possible_outcomes`, where + # `possible_outcomes` is the even-spaced (binned) encoding of all possible + # symexp'd reward/values. + # [2]: "The mean of the reward predictor pφ(ˆrt | zˆt) is used as reward + # sequence rˆ1:H." + probs = tf.nn.softmax(logits) + possible_outcomes = tf.linspace( + self.lower_bound, + self.upper_bound, + self.num_buckets, + ) + # probs=possible_outcomes=[B, `num_buckets`] + + # Simple vector dot product (over last dim) to get the mean reward + # weighted sum, where all weights sum to 1.0. + expected_rewards = tf.reduce_sum(probs * possible_outcomes, axis=-1) + # expected_rewards=[B] + + return expected_rewards, logits diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/sequence_model.py b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/sequence_model.py new file mode 100644 index 0000000000000000000000000000000000000000..fa9666029ce30de815b817e2c956a0bae97b816d --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/sequence_model.py @@ -0,0 +1,144 @@ +""" +[1] Mastering Diverse Domains through World Models - 2023 +D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap +https://arxiv.org/pdf/2301.04104v1.pdf +""" +from typing import Optional + +import gymnasium as gym +import numpy as np + +from ray.rllib.algorithms.dreamerv3.tf.models.components.mlp import MLP +from ray.rllib.algorithms.dreamerv3.utils import ( + get_gru_units, + get_num_z_classes, + get_num_z_categoricals, +) +from ray.rllib.utils.framework import try_import_tf + +_, tf, _ = try_import_tf() + + +class SequenceModel(tf.keras.Model): + """The "sequence model" of the RSSM, computing ht+1 given (ht, zt, at). 
+ + Note: The "internal state" always consists of: + The actions `a` (initially, this is a zeroed-out action), `h`-states (deterministic, + continuous), and `z`-states (stochastic, discrete). + There are two versions of z-states: "posterior" for world model training and "prior" + for creating the dream data. + + Initial internal state values (`a`, `h`, and `z`) are used where ever a new episode + starts within a batch row OR at the beginning of each train batch's B rows, + regardless of whether there was an actual episode boundary or not. Thus, internal + states are not required to be stored in or retrieved from the replay buffer AND + retrieved batches from the buffer must not be zero padded. + + Initial `a` is the zero "one hot" action, e.g. [0.0, 0.0] for Discrete(2), initial + `h` is a separate learned variable, and initial `z` are computed by the "dynamics" + (or "prior") net, using only the initial-h state as input. + + The GRU in this SequenceModel always produces the next h-state, then. + """ + + def __init__( + self, + *, + model_size: Optional[str] = "XS", + action_space: gym.Space, + num_gru_units: Optional[int] = None, + ): + """Initializes a SequenceModel instance. + + Args: + model_size: The "Model Size" used according to [1] Appendinx B. + Use None for manually setting the number of GRU units used. + action_space: The action space of the environment used. + num_gru_units: Overrides the number of GRU units (dimension of the h-state). + If None, use the value given through `model_size` + (see [1] Appendix B). + """ + super().__init__(name="sequence_model") + + self.model_size = model_size + self.action_space = action_space + num_gru_units = get_gru_units(self.model_size, override=num_gru_units) + + # In Danijar's code, there is an additional layer (units=[model_size]) + # prior to the GRU (but always only with 1 layer), which is not mentioned in + # the paper. 
+ self.pre_gru_layer = MLP( + num_dense_layers=1, + model_size=self.model_size, + output_layer_size=None, + ) + self.gru_unit = tf.keras.layers.GRU( + num_gru_units, + return_sequences=False, + return_state=False, + # Note: Changing these activations is most likely a bad idea! + # In experiments, setting one of both of them to silu deteriorated + # performance significantly. + # activation=tf.nn.silu, + # recurrent_activation=tf.nn.silu, + ) + + # Trace self.call. + dl_type = tf.keras.mixed_precision.global_policy().compute_dtype or tf.float32 + self.call = tf.function( + input_signature=[ + tf.TensorSpec( + shape=[None] + + ( + [action_space.n] + if isinstance(action_space, gym.spaces.Discrete) + else list(action_space.shape) + ), + dtype=dl_type, + ), + tf.TensorSpec(shape=[None, num_gru_units], dtype=dl_type), + tf.TensorSpec( + shape=[ + None, + get_num_z_categoricals(self.model_size), + get_num_z_classes(self.model_size), + ], + dtype=dl_type, + ), + ] + )(self.call) + + def call(self, a, h, z): + """ + + Args: + a: The previous action (already one-hot'd if applicable). (B, ...). + h: The previous deterministic hidden state of the sequence model. + (B, num_gru_units) + z: The previous stochastic discrete representations of the original + observation input. (B, num_categoricals, num_classes_per_categorical). + """ + # Flatten last two dims of z. + z_shape = tf.shape(z) + z = tf.reshape(z, shape=(z_shape[0], -1)) + out = tf.concat([z, a], axis=-1) + out.set_shape( + [ + None, + ( + get_num_z_categoricals(self.model_size) + * get_num_z_classes(self.model_size) + + ( + self.action_space.n + if isinstance(self.action_space, gym.spaces.Discrete) + else int(np.prod(self.action_space.shape)) + ) + ), + ] + ) + # Pass through pre-GRU layer. + out = self.pre_gru_layer(out) + # Pass through (batch-major) GRU (expand axis=1 as the time axis). + h_next = self.gru_unit(tf.expand_dims(out, axis=1), initial_state=h) + # Return the GRU's output (the next h-state). 
+ return h_next diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/utils/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..fe7b58cf515ee0c01b83dac8b515a07489c11c91 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/utils/__init__.py @@ -0,0 +1,168 @@ +""" +Utility functions for the DreamerV3 ([1]) algorithm. + +[1] Mastering Diverse Domains through World Models - 2023 +D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap +https://arxiv.org/pdf/2301.04104v1.pdf +""" + +_ALLOWED_MODEL_DIMS = [ + # RLlib debug sizes (not mentioned in [1]). + "nano", + "micro", + "mini", + "XXS", + # Regular sizes (listed in table B in [1]). + "XS", + "S", + "M", + "L", + "XL", +] + + +def get_cnn_multiplier(model_size, override=None): + if override is not None: + return override + + assert model_size in _ALLOWED_MODEL_DIMS + cnn_multipliers = { + "nano": 2, + "micro": 4, + "mini": 8, + "XXS": 16, + "XS": 24, + "S": 32, + "M": 48, + "L": 64, + "XL": 96, + } + return cnn_multipliers[model_size] + + +def get_dense_hidden_units(model_size, override=None): + if override is not None: + return override + + assert model_size in _ALLOWED_MODEL_DIMS + dense_units = { + "nano": 16, + "micro": 32, + "mini": 64, + "XXS": 128, + "XS": 256, + "S": 512, + "M": 640, + "L": 768, + "XL": 1024, + } + return dense_units[model_size] + + +def get_gru_units(model_size, override=None): + if override is not None: + return override + + assert model_size in _ALLOWED_MODEL_DIMS + gru_units = { + "nano": 16, + "micro": 32, + "mini": 64, + "XXS": 128, + "XS": 256, + "S": 512, + "M": 1024, + "L": 2048, + "XL": 4096, + } + return gru_units[model_size] + + +def get_num_z_categoricals(model_size, override=None): + if override is not None: + return override + + assert model_size in _ALLOWED_MODEL_DIMS + gru_units = { + "nano": 4, + "micro": 8, + 
"mini": 16, + "XXS": 32, + "XS": 32, + "S": 32, + "M": 32, + "L": 32, + "XL": 32, + } + return gru_units[model_size] + + +def get_num_z_classes(model_size, override=None): + if override is not None: + return override + + assert model_size in _ALLOWED_MODEL_DIMS + gru_units = { + "nano": 4, + "micro": 8, + "mini": 16, + "XXS": 32, + "XS": 32, + "S": 32, + "M": 32, + "L": 32, + "XL": 32, + } + return gru_units[model_size] + + +def get_num_curiosity_nets(model_size, override=None): + if override is not None: + return override + + assert model_size in _ALLOWED_MODEL_DIMS + num_curiosity_nets = { + "nano": 8, + "micro": 8, + "mini": 8, + "XXS": 8, + "XS": 8, + "S": 8, + "M": 8, + "L": 8, + "XL": 8, + } + return num_curiosity_nets[model_size] + + +def get_num_dense_layers(model_size, override=None): + if override is not None: + return override + + assert model_size in _ALLOWED_MODEL_DIMS + num_dense_layers = { + "nano": 1, + "micro": 1, + "mini": 1, + "XXS": 1, + "XS": 1, + "S": 2, + "M": 3, + "L": 4, + "XL": 5, + } + return num_dense_layers[model_size] + + +def do_symlog_obs(observation_space, symlog_obs_user_setting): + # If our symlog_obs setting is NOT set specifically (it's set to "auto"), return + # True if we don't have an image observation space, otherwise return False. + + # TODO (sven): Support mixed observation spaces. 
+ + is_image_space = len(observation_space.shape) in [2, 3] + return ( + not is_image_space + if symlog_obs_user_setting == "auto" + else symlog_obs_user_setting + ) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/utils/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/utils/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9393df4c3a8af2417004658b1d27dbc7b1f4a5fb Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/utils/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/utils/__pycache__/debugging.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/utils/__pycache__/debugging.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1a2f73d569ddc398c568080cdd0e4c6570181059 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/utils/__pycache__/debugging.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/utils/__pycache__/env_runner.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/utils/__pycache__/env_runner.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..179b333f6f91f763fc37fa02f71d96fddfe6db26 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/utils/__pycache__/env_runner.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/utils/__pycache__/summaries.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/utils/__pycache__/summaries.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e2309af589b96456fb235432d0bc68e60cd592b5 Binary files 
/dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/utils/__pycache__/summaries.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/utils/debugging.py b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/utils/debugging.py new file mode 100644 index 0000000000000000000000000000000000000000..7ddbd8341ddb883a8be02ec7db90733141800a89 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/utils/debugging.py @@ -0,0 +1,190 @@ +import gymnasium as gym +import numpy as np +from PIL import Image, ImageDraw + +from gymnasium.envs.classic_control.cartpole import CartPoleEnv + +from ray.rllib.utils.framework import try_import_tf + +_, tf, _ = try_import_tf() + + +class CartPoleDebug(CartPoleEnv): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + low = np.concatenate([np.array([0.0]), self.observation_space.low]) + high = np.concatenate([np.array([1000.0]), self.observation_space.high]) + + self.observation_space = gym.spaces.Box(low, high, shape=(5,), dtype=np.float32) + + self.timesteps_ = 0 + self._next_action = 0 + self._seed = 1 + + def reset(self, *, seed=None, options=None): + ret = super().reset(seed=self._seed) + self._seed += 1 + self.timesteps_ = 0 + self._next_action = 0 + obs = np.concatenate([np.array([self.timesteps_]), ret[0]]) + return obs, ret[1] + + def step(self, action): + ret = super().step(self._next_action) + + self.timesteps_ += 1 + self._next_action = 0 if self._next_action else 1 + + obs = np.concatenate([np.array([self.timesteps_]), ret[0]]) + reward = 0.1 * self.timesteps_ + return (obs, reward) + ret[2:] + + +gym.register("CartPoleDebug-v0", CartPoleDebug) +cartpole_env = gym.make("CartPoleDebug-v0", render_mode="rgb_array") +cartpole_env.reset() + +frozenlake_env = gym.make( + "FrozenLake-v1", render_mode="rgb_array", is_slippery=False, map_name="4x4" +) # desc=["SF", "HG"]) 
+frozenlake_env.reset() + + +def create_cartpole_dream_image( + dreamed_obs, # real space (not symlog'd) + dreamed_V, # real space (not symlog'd) + dreamed_a, + dreamed_r_tp1, # real space (not symlog'd) + dreamed_ri_tp1, # intrinsic reward + dreamed_c_tp1, # continue flag + value_target, # real space (not symlog'd) + initial_h, + as_tensor=False, +): + # CartPoleDebug + if dreamed_obs.shape == (5,): + # Set the state of our env to the given observation. + cartpole_env.unwrapped.state = np.array(dreamed_obs[1:], dtype=np.float32) + # Normal CartPole-v1 + else: + cartpole_env.unwrapped.state = np.array(dreamed_obs, dtype=np.float32) + + # Produce an RGB-image of the current state. + rgb_array = cartpole_env.render() + + # Add value-, action-, reward-, and continue-prediction information. + image = Image.fromarray(rgb_array) + draw_obj = ImageDraw.Draw(image) + + # fnt = ImageFont.load_default(size=40) + + draw_obj.text( + (5, 6), f"Vt={dreamed_V:.2f} (Rt={value_target:.2f})", fill=(0, 0, 0) + ) # , font=fnt.font, size=30) + draw_obj.text( + (5, 18), + f"at={'<--' if dreamed_a == 0 else '-->'} ({dreamed_a})", + fill=(0, 0, 0), + ) + draw_obj.text((5, 30), f"rt+1={dreamed_r_tp1:.2f}", fill=(0, 0, 0)) + if dreamed_ri_tp1 is not None: + draw_obj.text((5, 42), f"rit+1={dreamed_ri_tp1:.6f}", fill=(0, 0, 0)) + draw_obj.text((5, 54), f"ct+1={dreamed_c_tp1}", fill=(0, 0, 0)) + draw_obj.text((5, 66), f"|h|t={np.mean(np.abs(initial_h)):.5f}", fill=(0, 0, 0)) + + if dreamed_obs.shape == (5,): + draw_obj.text((20, 100), f"t={dreamed_obs[0]}", fill=(0, 0, 0)) + + # Return image. 
+ np_img = np.asarray(image) + if as_tensor: + return tf.convert_to_tensor(np_img, dtype=tf.uint8) + return np_img + + +def create_frozenlake_dream_image( + dreamed_obs, # real space (not symlog'd) + dreamed_V, # real space (not symlog'd) + dreamed_a, + dreamed_r_tp1, # real space (not symlog'd) + dreamed_ri_tp1, # intrinsic reward + dreamed_c_tp1, # continue flag + value_target, # real space (not symlog'd) + initial_h, + as_tensor=False, +): + frozenlake_env.unwrapped.s = np.argmax(dreamed_obs, axis=0) + + # Produce an RGB-image of the current state. + rgb_array = frozenlake_env.render() + + # Add value-, action-, reward-, and continue-prediction information. + image = Image.fromarray(rgb_array) + draw_obj = ImageDraw.Draw(image) + + draw_obj.text((5, 6), f"Vt={dreamed_V:.2f} (Rt={value_target:.2f})", fill=(0, 0, 0)) + action_arrow = ( + "<--" + if dreamed_a == 0 + else "v" + if dreamed_a == 1 + else "-->" + if dreamed_a == 2 + else "^" + ) + draw_obj.text((5, 18), f"at={action_arrow} ({dreamed_a})", fill=(0, 0, 0)) + draw_obj.text((5, 30), f"rt+1={dreamed_r_tp1:.2f}", fill=(0, 0, 0)) + if dreamed_ri_tp1 is not None: + draw_obj.text((5, 42), f"rit+1={dreamed_ri_tp1:.6f}", fill=(0, 0, 0)) + draw_obj.text((5, 54), f"ct+1={dreamed_c_tp1}", fill=(0, 0, 0)) + draw_obj.text((5, 66), f"|h|t={np.mean(np.abs(initial_h)):.5f}", fill=(0, 0, 0)) + + # Return image. + np_img = np.asarray(image) + if as_tensor: + return tf.convert_to_tensor(np_img, dtype=tf.uint8) + return np_img + + +if __name__ == "__main__": + # CartPole debug. + rgb_array = create_cartpole_dream_image( + dreamed_obs=np.array([100.0, 1.0, -0.01, 1.5, 0.02]), + dreamed_V=4.3, + dreamed_a=1, + dreamed_r_tp1=1.0, + dreamed_c_tp1=True, + initial_h=0.0, + value_target=8.0, + ) + # ImageFont.load("arial.pil") + image = Image.fromarray(rgb_array) + image.show() + + # Normal CartPole. 
+ rgb_array = create_cartpole_dream_image( + dreamed_obs=np.array([1.0, -0.01, 1.5, 0.02]), + dreamed_V=4.3, + dreamed_a=1, + dreamed_r_tp1=1.0, + dreamed_c_tp1=True, + initial_h=0.1, + value_target=8.0, + ) + # ImageFont.load("arial.pil") + image = Image.fromarray(rgb_array) + image.show() + + # Frozenlake + rgb_array = create_frozenlake_dream_image( + dreamed_obs=np.array([1.0] + [0.0] * (frozenlake_env.observation_space.n - 1)), + dreamed_V=4.3, + dreamed_a=1, + dreamed_r_tp1=1.0, + dreamed_c_tp1=True, + initial_h=0.1, + value_target=8.0, + ) + image = Image.fromarray(rgb_array) + image.show() diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/utils/env_runner.py b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/utils/env_runner.py new file mode 100644 index 0000000000000000000000000000000000000000..62932738fc1f8bc0ba033f57b8a79bd7cf267b77 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/utils/env_runner.py @@ -0,0 +1,694 @@ +""" +[1] Mastering Diverse Domains through World Models - 2023 +D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap +https://arxiv.org/pdf/2301.04104v1.pdf + +[2] Mastering Atari with Discrete World Models - 2021 +D. Hafner, T. Lillicrap, M. Norouzi, J. 
Ba +https://arxiv.org/pdf/2010.02193.pdf +""" +from collections import defaultdict +from functools import partial +from typing import Collection, List, Optional, Tuple, Union + +import gymnasium as gym +from gymnasium.wrappers.vector import DictInfoToList +import numpy as np +import tree # pip install dm_tree + +import ray +from ray.rllib.algorithms.algorithm_config import AlgorithmConfig +from ray.rllib.core import COMPONENT_RL_MODULE, DEFAULT_AGENT_ID, DEFAULT_MODULE_ID +from ray.rllib.core.columns import Columns +from ray.rllib.env import INPUT_ENV_SPACES +from ray.rllib.env.env_runner import EnvRunner +from ray.rllib.env.single_agent_episode import SingleAgentEpisode +from ray.rllib.env.wrappers.atari_wrappers import NoopResetEnv, MaxAndSkipEnv +from ray.rllib.env.wrappers.dm_control_wrapper import DMCEnv +from ray.rllib.env.utils import _gym_env_creator +from ray.rllib.utils.annotations import override +from ray.rllib.utils.deprecation import Deprecated +from ray.rllib.utils.framework import try_import_tf, try_import_torch +from ray.rllib.utils.metrics import ( + EPISODE_DURATION_SEC_MEAN, + EPISODE_LEN_MAX, + EPISODE_LEN_MEAN, + EPISODE_LEN_MIN, + EPISODE_RETURN_MAX, + EPISODE_RETURN_MEAN, + EPISODE_RETURN_MIN, + NUM_AGENT_STEPS_SAMPLED, + NUM_AGENT_STEPS_SAMPLED_LIFETIME, + NUM_EPISODES, + NUM_ENV_STEPS_SAMPLED, + NUM_ENV_STEPS_SAMPLED_LIFETIME, + NUM_MODULE_STEPS_SAMPLED, + NUM_MODULE_STEPS_SAMPLED_LIFETIME, +) +from ray.rllib.utils.metrics.metrics_logger import MetricsLogger +from ray.rllib.utils.numpy import convert_to_numpy, one_hot +from ray.rllib.utils.spaces.space_utils import batch, unbatch +from ray.rllib.utils.torch_utils import convert_to_torch_tensor +from ray.rllib.utils.typing import ResultDict, StateDict +from ray.tune.registry import ENV_CREATOR, _global_registry + +_, tf, _ = try_import_tf() +torch, _ = try_import_torch() + + +# TODO (sven): Use SingleAgentEnvRunner instead of this as soon as we have the new +# ConnectorV2 example classes to 
make Atari work properly with these (w/o requiring the +# classes at the bottom of this file here, e.g. `ActionClip`). +class DreamerV3EnvRunner(EnvRunner): + """An environment runner to collect data from vectorized gymnasium environments.""" + + def __init__( + self, + config: AlgorithmConfig, + **kwargs, + ): + """Initializes a DreamerV3EnvRunner instance. + + Args: + config: The config to use to setup this EnvRunner. + """ + super().__init__(config=config) + + # Create the gym.vector.Env object. + # Atari env. + if self.config.env.startswith("ale_py:ALE/"): + # TODO (sven): This import currently causes a Tune test to fail. Either way, + # we need to figure out how to properly setup the CI environment with + # the correct versions of all gymnasium-related packages. + from supersuit.generic_wrappers import resize_v1 + + # [2]: "We down-scale the 84 × 84 grayscale images to 64 × 64 pixels so that + # we can apply the convolutional architecture of DreamerV1." + # ... + # "We follow the evaluation protocol of Machado et al. (2018) with 200M + # environment steps, action repeat of 4, a time limit of 108,000 steps per + # episode that correspond to 30 minutes of game play, no access to life + # information, full action space, and sticky actions. Because the world + # model integrates information over time, DreamerV2 does not use frame + # stacking." + # However, in Danijar's repo, Atari100k experiments are configured as: + # noop=30, 64x64x3 (no grayscaling), sticky actions=False, + # full action space=False, + + def _entry_point(): + return gym.make( + self.config.env, + **dict( + self.config.env_config, + **{ + # "sticky actions" but not according to Danijar's 100k + # configs. + "repeat_action_probability": 0.0, + # "full action space" but not according to Danijar's 100k + # configs. + "full_action_space": False, + # Already done by MaxAndSkip wrapper: "action repeat" == 4. 
+ "frameskip": 1, + }, + ), + ) + + gym.register("rllib-single-agent-env-v0", entry_point=_entry_point) + + self.env = DictInfoToList( + gym.make_vec( + "rllib-single-agent-env-v0", + num_envs=self.config.num_envs_per_env_runner, + vectorization_mode=( + "async" if self.config.remote_worker_envs else "sync" + ), + wrappers=[ + partial(gym.wrappers.TimeLimit, max_episode_steps=108000), + partial(resize_v1, x_size=64, y_size=64), # resize to 64x64 + NormalizedImageEnv, + NoopResetEnv, + MaxAndSkipEnv, + ], + ) + ) + # DeepMind Control. + elif self.config.env.startswith("DMC/"): + parts = self.config.env.split("/") + assert len(parts) == 3, ( + "ERROR: DMC env must be formatted as 'DMC/[task]/[domain]', e.g. " + f"'DMC/cartpole/swingup'! You provided '{self.config.env}'." + ) + gym.register( + "dmc_env-v0", + lambda from_pixels=True: DMCEnv( + parts[1], parts[2], from_pixels=from_pixels, channels_first=False + ), + ) + self.env = DictInfoToList( + gym.make_vec( + "dmc_env-v0", + wrappers=[ActionClip], + num_envs=self.config.num_envs_per_env_runner, + vectorization_mode=( + "async" if self.config.remote_worker_envs else "sync" + ), + **dict(self.config.env_config), + ) + ) + # All other envs (gym or `tune.register_env()`'d by the user). + else: + # Register the env in this local context here. + gym.register( + "dreamerv3-custom-env-v0", + partial( + _global_registry.get(ENV_CREATOR, self.config.env), + self.config.env_config, + ) + if _global_registry.contains(ENV_CREATOR, self.config.env) + else partial( + _gym_env_creator, + env_context=self.config.env_config, + env_descriptor=self.config.env, + ), + ) + # Wrap into `DictInfoToList` wrapper to get infos as lists. 
+ self.env = DictInfoToList( + gym.make_vec( + "dreamerv3-custom-env-v0", + num_envs=self.config.num_envs_per_env_runner, + vectorization_mode=( + "async" if self.config.remote_worker_envs else "sync" + ), + ) + ) + self.num_envs = self.env.num_envs + assert self.num_envs == self.config.num_envs_per_env_runner + + # Create our RLModule to compute actions with. + policy_dict, _ = self.config.get_multi_agent_setup(env=self.env) + self.multi_rl_module_spec = self.config.get_multi_rl_module_spec( + policy_dict=policy_dict + ) + if self.config.share_module_between_env_runner_and_learner: + # DreamerV3 Algorithm will set this to the local Learner's module. + self.module = None + # Create our own instance of a DreamerV3RLModule (which then needs to be + # weight-synched each iteration). + else: + # TODO (sven): DreamerV3 is currently single-agent only. + self.module = self.multi_rl_module_spec.build()[DEFAULT_MODULE_ID] + + self._cached_to_module = None + + self.metrics = MetricsLogger() + + self._device = None + if ( + torch + and torch.cuda.is_available() + and self.config.framework_str == "torch" + and self.config.share_module_between_env_runner_and_learner + and self.config.num_gpus_per_learner > 0 + ): + gpu_ids = ray.get_gpu_ids() + self._device = f"cuda:{gpu_ids[0]}" + self.convert_to_tensor = ( + partial(convert_to_torch_tensor, device=self._device) + if self.config.framework_str == "torch" + else tf.convert_to_tensor + ) + + self._needs_initial_reset = True + self._episodes = [None for _ in range(self.num_envs)] + self._states = [None for _ in range(self.num_envs)] + + # TODO (sven): Move metrics temp storage and collection out of EnvRunner + # and RolloutWorkers. These classes should not continue tracking some data + # that they have already returned (in a call to `sample()`). Instead, the + # episode data should be analyzed where it was sent to (the Algorithm itself + # via its replay buffer, etc..). 
+ self._done_episodes_for_metrics = [] + self._ongoing_episodes_for_metrics = defaultdict(list) + + @override(EnvRunner) + def sample( + self, + *, + num_timesteps: int = None, + num_episodes: int = None, + explore: bool = True, + random_actions: bool = False, + ) -> Tuple[List[SingleAgentEpisode], List[SingleAgentEpisode]]: + """Runs and returns a sample (n timesteps or m episodes) on the environment(s). + + Timesteps or episodes are counted in total (across all vectorized + sub-environments). For example, if self.num_envs=2 and num_timesteps=10, each + sub-environment will be sampled for 5 steps. If self.num_envs=3 and + num_episodes=30, each sub-environment will be sampled for 10 episodes. + + Args: + num_timesteps: The number of timesteps to sample from the environment(s). + Note that only exactly one of `num_timesteps` or `num_episodes` must be + provided. + num_episodes: The number of full episodes to sample from the environment(s). + Note that only exactly one of `num_timesteps` or `num_episodes` must be + provided. + explore: Indicates whether to utilize exploration when picking actions. + random_actions: Whether to only use random actions. If True, the value of + `explore` is ignored. + force_reset: Whether to reset the environment(s) before starting to sample. + If False, will still reset the environment(s) if they were left in + a terminated or truncated state during previous sample calls. + + Returns: + A tuple consisting of a) list of Episode instances that are done and + b) list of Episode instances that are still ongoing. + """ + # If no execution details are provided, use self.config. + if num_timesteps is None and num_episodes is None: + if self.config.batch_mode == "truncate_episodes": + num_timesteps = self.config.rollout_fragment_length * self.num_envs + else: + num_episodes = self.num_envs + + # Sample n timesteps. 
+ if num_timesteps is not None: + return self._sample( + num_timesteps=num_timesteps, + explore=explore, + random_actions=random_actions, + force_reset=False, + ) + # Sample n episodes. + else: + # `_sample_episodes` returns only one list (with completed episodes) + # return empty list for incomplete ones. + return ( + self._sample( + num_episodes=num_episodes, + explore=explore, + random_actions=random_actions, + ), + [], + ) + + def _sample( + self, + *, + num_timesteps: Optional[int] = None, + num_episodes: Optional[int] = None, + explore: bool = True, + random_actions: bool = False, + force_reset: bool = False, + ) -> List[SingleAgentEpisode]: + """Helper method to sample n timesteps or m episodes.""" + + done_episodes_to_return: List[SingleAgentEpisode] = [] + + # Get initial states for all `batch_size_B` rows in the forward batch. + initial_states = tree.map_structure( + lambda s: np.repeat(s, self.num_envs, axis=0), + convert_to_numpy(self.module.get_initial_state()), + ) + + # Have to reset the env (on all vector sub-envs). + if force_reset or num_episodes is not None or self._needs_initial_reset: + episodes = self._episodes = [None for _ in range(self.num_envs)] + self._reset_envs(episodes, initial_states) + # We just reset the env. Don't have to force this again in the next + # call to `self._sample()`. + self._needs_initial_reset = False + + # Set initial obs and states in the episodes. + for i in range(self.num_envs): + self._states[i] = None + else: + episodes = self._episodes + + # Loop through `num_timesteps` timesteps or `num_episodes` episodes. + ts = 0 + eps = 0 + while ( + (ts < num_timesteps) if num_timesteps is not None else (eps < num_episodes) + ): + # Act randomly. + if random_actions: + actions = self.env.action_space.sample() + # Compute an action using the RLModule. + else: + # Env-to-module connector (already cached). 
+ to_module = self._cached_to_module + assert to_module is not None + self._cached_to_module = None + + # RLModule forward pass: Explore or not. + if explore: + to_env = self.module.forward_exploration(to_module) + else: + to_env = self.module.forward_inference(to_module) + + # Model outputs one-hot actions (if discrete). Convert to int actions + # as well. + actions = convert_to_numpy(to_env[Columns.ACTIONS]) + if isinstance(self.env.single_action_space, gym.spaces.Discrete): + actions = np.argmax(actions, axis=-1) + self._states = unbatch(convert_to_numpy(to_env[Columns.STATE_OUT])) + + observations, rewards, terminateds, truncateds, infos = self.env.step( + actions + ) + + call_on_episode_start = set() + for env_index in range(self.num_envs): + # Episode has no data in it yet -> Was just reset and needs to be called + # with its `add_env_reset()` method. + if not episodes[env_index].is_reset: + episodes[env_index].add_env_reset( + observation=observations[env_index], + infos=infos[env_index], + ) + call_on_episode_start.add(env_index) + self._states[env_index] = None + + # Call `add_env_step()` method on episode. + else: + # Only increase ts when we actually stepped (not reset'd as a reset + # does not count as a timestep). + ts += 1 + episodes[env_index].add_env_step( + observation=observations[env_index], + action=actions[env_index], + reward=rewards[env_index], + infos=infos[env_index], + terminated=terminateds[env_index], + truncated=truncateds[env_index], + ) + + # Cache results as we will do the RLModule forward pass only in the next + # `while`-iteration. 
+ if self.module is not None: + is_first = np.zeros((self.num_envs,)) + for env_index, episode in enumerate(episodes): + if self._states[env_index] is None: + is_first[env_index] = 1.0 + self._states[env_index] = { + k: s[env_index] for k, s in initial_states.items() + } + self._cached_to_module = { + Columns.STATE_IN: tree.map_structure( + lambda s: self.convert_to_tensor(s), batch(self._states) + ), + Columns.OBS: self.convert_to_tensor(observations), + "is_first": self.convert_to_tensor(is_first), + } + + for env_index in range(self.num_envs): + # Episode is not done. + if not episodes[env_index].is_done: + continue + + eps += 1 + + # Then numpy'ize the episode. + done_episodes_to_return.append(episodes[env_index].to_numpy()) + + # Also early-out if we reach the number of episodes within this + # for-loop. + if eps == num_episodes: + break + + # Create a new episode object with no data in it and execute + # `on_episode_created` callback (before the `env.reset()` call). + episodes[env_index] = SingleAgentEpisode( + observation_space=self.env.single_observation_space, + action_space=self.env.single_action_space, + ) + + # Return done episodes ... + # TODO (simon): Check, how much memory this attribute uses. + self._done_episodes_for_metrics.extend(done_episodes_to_return) + # ... and all ongoing episode chunks. + + # Also, make sure we start new episode chunks (continuing the ongoing episodes + # from the to-be-returned chunks). + ongoing_episodes_to_return = [] + # Only if we are doing individual timesteps: We have to maybe cut an ongoing + # episode and continue building it on the next call to `sample()`. + if num_timesteps is not None: + ongoing_episodes_continuations = [ + episode.cut(len_lookback_buffer=self.config.episode_lookback_horizon) + for episode in episodes + ] + + for episode in episodes: + # Just started Episodes do not have to be returned. There is no data + # in them anyway. 
+ if episode.t == 0: + continue + episode.validate() + self._ongoing_episodes_for_metrics[episode.id_].append(episode) + # Return numpy'ized Episodes. + ongoing_episodes_to_return.append(episode.to_numpy()) + + # Continue collecting into the cut Episode chunks. + self._episodes = ongoing_episodes_continuations + + self._increase_sampled_metrics(ts) + + # Return collected episode data. + return done_episodes_to_return + ongoing_episodes_to_return + + def get_spaces(self): + return { + INPUT_ENV_SPACES: (self.env.observation_space, self.env.action_space), + DEFAULT_MODULE_ID: ( + self.env.single_observation_space, + self.env.single_action_space, + ), + } + + def get_metrics(self) -> ResultDict: + # Compute per-episode metrics (only on already completed episodes). + for eps in self._done_episodes_for_metrics: + assert eps.is_done + + episode_length = len(eps) + episode_return = eps.get_return() + episode_duration_s = eps.get_duration_s() + + # Don't forget about the already returned chunks of this episode. + if eps.id_ in self._ongoing_episodes_for_metrics: + for eps2 in self._ongoing_episodes_for_metrics[eps.id_]: + episode_length += len(eps2) + episode_return += eps2.get_return() + del self._ongoing_episodes_for_metrics[eps.id_] + + self._log_episode_metrics( + episode_length, episode_return, episode_duration_s + ) + + # Log num episodes counter for this iteration. + self.metrics.log_value( + NUM_EPISODES, + len(self._done_episodes_for_metrics), + reduce="sum", + # Reset internal data on `reduce()` call below (not a lifetime count). + clear_on_reduce=True, + ) + + # Now that we have logged everything, clear cache of done episodes. + self._done_episodes_for_metrics.clear() + + # Return reduced metrics. 
+ return self.metrics.reduce() + + def get_state( + self, + components: Optional[Union[str, Collection[str]]] = None, + *, + not_components: Optional[Union[str, Collection[str]]] = None, + **kwargs, + ) -> StateDict: + """Returns the weights of our (single-agent) RLModule.""" + if self.module is None: + assert self.config.share_module_between_env_runner_and_learner + return {} + else: + return { + COMPONENT_RL_MODULE: { + DEFAULT_MODULE_ID: self.module.get_state(**kwargs), + }, + } + + def set_state(self, state: StateDict) -> None: + """Writes the weights of our (single-agent) RLModule.""" + if self.module is None: + assert self.config.share_module_between_env_runner_and_learner + else: + self.module.set_state(state[COMPONENT_RL_MODULE][DEFAULT_MODULE_ID]) + + @override(EnvRunner) + def assert_healthy(self): + # Make sure, we have built our gym.vector.Env and RLModule properly. + assert self.env and self.module + + @override(EnvRunner) + def stop(self): + # Close our env object via gymnasium's API. + self.env.close() + + def _reset_envs(self, episodes, initial_states): + # Create n new episodes and make the `on_episode_created` callbacks. + for env_index in range(self.num_envs): + self._new_episode(env_index, episodes) + + # Erase all cached ongoing episodes (these will never be completed and + # would thus never be returned/cleaned by `get_metrics` and cause a memory + # leak). + self._ongoing_episodes_for_metrics.clear() + + observations, infos = self.env.reset() + observations = unbatch(observations) + + # Set initial obs and infos in the episodes. + for env_index in range(self.num_envs): + episodes[env_index].add_env_reset( + observation=observations[env_index], + infos=infos[env_index], + ) + + # Run the env-to-module connector to make sure the reset-obs/infos have + # properly been processed (if applicable). 
+ self._cached_to_module = None + if self.module: + is_first = np.zeros((self.num_envs,)) + for i, eps in enumerate(self._episodes): + if self._states[i] is None: + is_first[i] = 1.0 + self._states[i] = {k: s[i] for k, s in initial_states.items()} + self._cached_to_module = { + Columns.STATE_IN: tree.map_structure( + lambda s: self.convert_to_tensor(s), batch(self._states) + ), + Columns.OBS: self.convert_to_tensor(observations), + "is_first": self.convert_to_tensor(is_first), + } + # self._cached_to_module = TODO!! + + def _new_episode(self, env_index, episodes=None): + episodes = episodes if episodes is not None else self._episodes + episodes[env_index] = SingleAgentEpisode( + observation_space=self.env.single_observation_space, + action_space=self.env.single_action_space, + ) + + def _increase_sampled_metrics(self, num_steps): + # Per sample cycle stats. + self.metrics.log_value( + NUM_ENV_STEPS_SAMPLED, num_steps, reduce="sum", clear_on_reduce=True + ) + self.metrics.log_value( + (NUM_AGENT_STEPS_SAMPLED, DEFAULT_AGENT_ID), + num_steps, + reduce="sum", + clear_on_reduce=True, + ) + self.metrics.log_value( + (NUM_MODULE_STEPS_SAMPLED, DEFAULT_MODULE_ID), + num_steps, + reduce="sum", + clear_on_reduce=True, + ) + # Lifetime stats. + self.metrics.log_value(NUM_ENV_STEPS_SAMPLED_LIFETIME, num_steps, reduce="sum") + self.metrics.log_value( + (NUM_AGENT_STEPS_SAMPLED_LIFETIME, DEFAULT_AGENT_ID), + num_steps, + reduce="sum", + ) + self.metrics.log_value( + (NUM_MODULE_STEPS_SAMPLED_LIFETIME, DEFAULT_MODULE_ID), + num_steps, + reduce="sum", + ) + return num_steps + + def _log_episode_metrics(self, length, ret, sec): + # Log general episode metrics. + # To mimick the old API stack behavior, we'll use `window` here for + # these particular stats (instead of the default EMA). 
+ win = self.config.metrics_num_episodes_for_smoothing + self.metrics.log_value(EPISODE_LEN_MEAN, length, window=win) + self.metrics.log_value(EPISODE_RETURN_MEAN, ret, window=win) + self.metrics.log_value(EPISODE_DURATION_SEC_MEAN, sec, window=win) + + # For some metrics, log min/max as well. + self.metrics.log_value(EPISODE_LEN_MIN, length, reduce="min") + self.metrics.log_value(EPISODE_RETURN_MIN, ret, reduce="min") + self.metrics.log_value(EPISODE_LEN_MAX, length, reduce="max") + self.metrics.log_value(EPISODE_RETURN_MAX, ret, reduce="max") + + @Deprecated( + new="DreamerV3EnvRunner.get_state(components='rl_module')", + error=True, + ) + def get_weights(self, *args, **kwargs): + pass + + @Deprecated( + new="DreamerV3EnvRunner.get_state()", + error=True, + ) + def set_weights(self, *args, **kwargs): + pass + + +class NormalizedImageEnv(gym.ObservationWrapper): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.observation_space = gym.spaces.Box( + -1.0, + 1.0, + shape=self.observation_space.shape, + dtype=np.float32, + ) + + # Divide by scale and center around 0.0, such that observations are in the range + # of -1.0 and 1.0. 
+ def observation(self, observation): + return (observation.astype(np.float32) / 128.0) - 1.0 + + +class OneHot(gym.ObservationWrapper): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.observation_space = gym.spaces.Box( + 0.0, 1.0, shape=(self.observation_space.n,), dtype=np.float32 + ) + + def reset(self, **kwargs): + ret = self.env.reset(**kwargs) + return self._get_obs(ret[0]), ret[1] + + def step(self, action): + ret = self.env.step(action) + return self._get_obs(ret[0]), ret[1], ret[2], ret[3], ret[4] + + def _get_obs(self, obs): + return one_hot(obs, depth=self.observation_space.shape[0]) + + +class ActionClip(gym.ActionWrapper): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._low = -1.0 + self._high = 1.0 + self.action_space = gym.spaces.Box( + self._low, + self._high, + self.action_space.shape, + self.action_space.dtype, + ) + + def action(self, action): + return np.clip(action, self._low, self._high) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/utils/summaries.py b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/utils/summaries.py new file mode 100644 index 0000000000000000000000000000000000000000..c8b0ea753d4d2c078a2bfd14d80590b5161e2469 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/utils/summaries.py @@ -0,0 +1,408 @@ +""" +[1] Mastering Diverse Domains through World Models - 2023 +D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap +https://arxiv.org/pdf/2301.04104v1.pdf + +[2] Mastering Atari with Discrete World Models - 2021 +D. Hafner, T. Lillicrap, M. Norouzi, J. 
Ba +https://arxiv.org/pdf/2010.02193.pdf +""" +import numpy as np + +from ray.rllib.algorithms.dreamerv3.utils.debugging import ( + create_cartpole_dream_image, + create_frozenlake_dream_image, +) +from ray.rllib.core import DEFAULT_MODULE_ID +from ray.rllib.core.columns import Columns +from ray.rllib.utils.framework import try_import_torch +from ray.rllib.utils.metrics import ( + LEARNER_RESULTS, + REPLAY_BUFFER_RESULTS, +) +from ray.rllib.utils.tf_utils import inverse_symlog + +torch, _ = try_import_torch() + + +def reconstruct_obs_from_h_and_z( + h_t0_to_H, + z_t0_to_H, + dreamer_model, + obs_dims_shape, + framework="torch", +): + """Returns""" + shape = h_t0_to_H.shape + T = shape[0] # inputs are time-major + B = shape[1] + # Compute actual observations using h and z and the decoder net. + # Note that the last h-state (T+1) is NOT used here as it's already part of + # a new trajectory. + # Use mean() of the Gaussian, no sample! -> No need to construct dist object here. + if framework == "torch": + device = next(iter(dreamer_model.world_model.decoder.parameters())).device + reconstructed_obs_distr_means_TxB = ( + dreamer_model.world_model.decoder( + # Fold time rank. + h=torch.from_numpy(h_t0_to_H).reshape((T * B, -1)).to(device), + z=torch.from_numpy(z_t0_to_H) + .reshape((T * B,) + z_t0_to_H.shape[2:]) + .to(device), + ) + .detach() + .cpu() + .numpy() + ) + else: + reconstructed_obs_distr_means_TxB = dreamer_model.world_model.decoder( + # Fold time rank. + h=h_t0_to_H.reshape((T * B, -1)), + z=z_t0_to_H.reshape((T * B,) + z_t0_to_H.shape[2:]), + ) + + # Unfold time rank again. + reconstructed_obs_T_B = np.reshape( + reconstructed_obs_distr_means_TxB, (T, B) + obs_dims_shape + ) + # Return inverse symlog'd (real env obs space) reconstructed observations. 
+ return reconstructed_obs_T_B + + +def report_dreamed_trajectory( + *, + results, + env, + dreamer_model, + obs_dims_shape, + batch_indices=(0,), + desc=None, + include_images=True, + framework="torch", +): + if not include_images: + return + + dream_data = results["dream_data"] + dreamed_obs_H_B = reconstruct_obs_from_h_and_z( + h_t0_to_H=dream_data["h_states_t0_to_H_BxT"], + z_t0_to_H=dream_data["z_states_prior_t0_to_H_BxT"], + dreamer_model=dreamer_model, + obs_dims_shape=obs_dims_shape, + framework=framework, + ) + func = ( + create_cartpole_dream_image + if env.startswith("CartPole") + else create_frozenlake_dream_image + ) + # Take 0th dreamed trajectory and produce series of images. + for b in batch_indices: + images = [] + for t in range(len(dreamed_obs_H_B) - 1): + images.append( + func( + dreamed_obs=dreamed_obs_H_B[t][b], + dreamed_V=dream_data["values_dreamed_t0_to_H_BxT"][t][b], + dreamed_a=(dream_data["actions_ints_dreamed_t0_to_H_BxT"][t][b]), + dreamed_r_tp1=(dream_data["rewards_dreamed_t0_to_H_BxT"][t + 1][b]), + # `DISAGREE_intrinsic_rewards_H_B` are shifted by 1 already + # (from t1 to H, not t0 to H like all other data here). + dreamed_ri_tp1=( + results["DISAGREE_intrinsic_rewards_H_BxT"][t][b] + if "DISAGREE_intrinsic_rewards_H_BxT" in results + else None + ), + dreamed_c_tp1=( + dream_data["continues_dreamed_t0_to_H_BxT"][t + 1][b] + ), + value_target=results["VALUE_TARGETS_H_BxT"][t][b], + initial_h=dream_data["h_states_t0_to_H_BxT"][t][b], + as_tensor=True, + ).numpy() + ) + # Concat images along width-axis (so they show as a "film sequence" next to each + # other). + results.update( + { + f"dreamed_trajectories{('_'+desc) if desc else ''}_B{b}": ( + np.concatenate(images, axis=1) + ), + } + ) + + +def report_predicted_vs_sampled_obs( + *, + metrics, + sample, + batch_size_B, + batch_length_T, + symlog_obs: bool = True, + do_report: bool = True, +): + """Summarizes sampled data (from the replay buffer) vs world-model predictions. 
+ + World model predictions are based on the posterior states (z computed from actual + observation encoder input + the current h-states). + + Observations: Computes MSE (sampled vs predicted/recreated) over all features. + For image observations, also creates direct image comparisons (sampled images + vs predicted (posterior) ones). + Rewards: Compute MSE (sampled vs predicted). + Continues: Compute MSE (sampled vs predicted). + + Args: + metrics: The MetricsLogger object of the DreamerV3 algo. + sample: The sampled data (dict) from the replay buffer. Already tf-tensor + converted. + batch_size_B: The batch size (B). This is the number of trajectories sampled + from the buffer. + batch_length_T: The batch length (T). This is the length of an individual + trajectory sampled from the buffer. + do_report: Whether to actually log the report (default). If this is set to + False, this function serves as a clean-up on the given metrics, making sure + they do NOT contain anymore any (spacious) data relevant for producing + the report/videos. + """ + fwd_output_key = ( + LEARNER_RESULTS, + DEFAULT_MODULE_ID, + "WORLD_MODEL_fwd_out_obs_distribution_means_b0xT", + ) + # logged as a non-reduced item (still a list) + predicted_observation_means_single_example = metrics.peek( + fwd_output_key, default=[None] + )[-1] + metrics.delete(fwd_output_key, key_error=False) + + final_result_key = ( + f"WORLD_MODEL_sampled_vs_predicted_posterior_b0x{batch_length_T}_videos" + ) + if not do_report: + metrics.delete(final_result_key, key_error=False) + return + + _report_obs( + metrics=metrics, + computed_float_obs_B_T_dims=np.reshape( + predicted_observation_means_single_example, + # WandB videos need to be channels first. 
+ (1, batch_length_T) + sample[Columns.OBS].shape[2:], + ), + sampled_obs_B_T_dims=sample[Columns.OBS][0:1], + metrics_key=final_result_key, + symlog_obs=symlog_obs, + ) + + +def report_dreamed_eval_trajectory_vs_samples( + *, + metrics, + sample, + burn_in_T, + dreamed_T, + dreamer_model, + symlog_obs: bool = True, + do_report: bool = True, + framework="torch", +) -> None: + """Logs dreamed observations, rewards, continues and compares them vs sampled data. + + For obs, we'll try to create videos (side-by-side comparison) of the dreamed, + recreated-from-prior obs vs the sampled ones (over dreamed_T timesteps). + + Args: + metrics: The MetricsLogger object of the DreamerV3 algo. + sample: The sampled data (dict) from the replay buffer. Already tf-tensor + converted. + burn_in_T: The number of burn-in timesteps (these will be skipped over in the + reported video comparisons and MSEs). + dreamed_T: The number of timesteps to produce dreamed data for. + dreamer_model: The DreamerModel to use to create observation vectors/images + from dreamed h- and (prior) z-states. + symlog_obs: Whether to inverse-symlog the computed observations or not. Set this + to True for environments, in which we should symlog the observations. + do_report: Whether to actually log the report (default). If this is set to + False, this function serves as a clean-up on the given metrics, making sure + they do NOT contain anymore any (spacious) data relevant for producing + the report/videos. 
+ """ + dream_data = metrics.peek( + (LEARNER_RESULTS, DEFAULT_MODULE_ID, "dream_data"), + default={}, + ) + metrics.delete(LEARNER_RESULTS, DEFAULT_MODULE_ID, "dream_data", key_error=False) + + final_result_key_obs = f"EVALUATION_sampled_vs_dreamed_prior_H{dreamed_T}_obs" + final_result_key_rew = ( + f"EVALUATION_sampled_vs_dreamed_prior_H{dreamed_T}_rewards_MSE" + ) + final_result_key_cont = ( + f"EVALUATION_sampled_vs_dreamed_prior_H{dreamed_T}_continues_MSE" + ) + if not do_report: + metrics.delete(final_result_key_obs, key_error=False) + metrics.delete(final_result_key_rew, key_error=False) + metrics.delete(final_result_key_cont, key_error=False) + return + + # Obs MSE. + dreamed_obs_H_B = reconstruct_obs_from_h_and_z( + h_t0_to_H=dream_data["h_states_t0_to_H_Bx1"][0], # [0] b/c reduce=None (list) + z_t0_to_H=dream_data["z_states_prior_t0_to_H_Bx1"][0], + dreamer_model=dreamer_model, + obs_dims_shape=sample[Columns.OBS].shape[2:], + framework=framework, + ) + t0 = burn_in_T + tH = t0 + dreamed_T + # Observation MSE and - if applicable - images comparisons. + _report_obs( + metrics=metrics, + # WandB videos need to be 5D (B, L, c, h, w) -> transpose/swap H and B axes. + computed_float_obs_B_T_dims=np.swapaxes(dreamed_obs_H_B, 0, 1)[ + 0:1 + ], # for now: only B=1 + sampled_obs_B_T_dims=sample[Columns.OBS][0:1, t0:tH], + metrics_key=final_result_key_obs, + symlog_obs=symlog_obs, + ) + + # Reward MSE. + _report_rewards( + metrics=metrics, + computed_rewards=dream_data["rewards_dreamed_t0_to_H_Bx1"][0], + sampled_rewards=sample[Columns.REWARDS][:, t0:tH], + metrics_key=final_result_key_rew, + ) + + # Continues MSE. 
+ _report_continues( + metrics=metrics, + computed_continues=dream_data["continues_dreamed_t0_to_H_Bx1"][0], + sampled_continues=(1.0 - sample["is_terminated"])[:, t0:tH], + metrics_key=final_result_key_cont, + ) + + +def report_sampling_and_replay_buffer(*, metrics, replay_buffer): + episodes_in_buffer = replay_buffer.get_num_episodes() + ts_in_buffer = replay_buffer.get_num_timesteps() + replayed_steps = replay_buffer.get_sampled_timesteps() + added_steps = replay_buffer.get_added_timesteps() + + # Summarize buffer, sampling, and train ratio stats. + metrics.log_dict( + { + "capacity": replay_buffer.capacity, + "size_num_episodes": episodes_in_buffer, + "size_timesteps": ts_in_buffer, + "replayed_steps": replayed_steps, + "added_steps": added_steps, + }, + key=REPLAY_BUFFER_RESULTS, + window=1, + ) # window=1 b/c these are current (total count/state) values. + + +def _report_obs( + *, + metrics, + computed_float_obs_B_T_dims, + sampled_obs_B_T_dims, + metrics_key, + symlog_obs, +): + """Summarizes computed- vs sampled observations: MSE and (if applicable) images. + + Args: + metrics: The MetricsLogger object of the DreamerV3 algo. + computed_float_obs_B_T_dims: Computed float observations + (not clipped, not cast'd). Shape=(B, T, [dims ...]). + sampled_obs_B_T_dims: Sampled observations (as-is from the environment, meaning + this could be uint8, 0-255 clipped images). Shape=(B, T, [dims ...]). + metrics_key: The metrics key (or key sequence) under which to log ths resulting + video sequence. + symlog_obs: Whether to inverse-symlog the computed observations or not. Set this + to True for environments, in which we should symlog the observations. + + """ + # Videos: Create summary, comparing computed images with actual sampled ones. + # 4=[B, T, w, h] grayscale image; 5=[B, T, w, h, C] RGB image. + if len(sampled_obs_B_T_dims.shape) in [4, 5]: + # WandB videos need to be channels first. 
+ transpose_axes = ( + (0, 1, 4, 2, 3) if len(sampled_obs_B_T_dims.shape) == 5 else (0, 3, 1, 2) + ) + + if symlog_obs: + computed_float_obs_B_T_dims = inverse_symlog(computed_float_obs_B_T_dims) + + # Restore image pixels from normalized (non-symlog'd) data. + if not symlog_obs: + computed_float_obs_B_T_dims = (computed_float_obs_B_T_dims + 1.0) * 128 + sampled_obs_B_T_dims = (sampled_obs_B_T_dims + 1.0) * 128 + sampled_obs_B_T_dims = np.clip(sampled_obs_B_T_dims, 0.0, 255.0).astype( + np.uint8 + ) + sampled_obs_B_T_dims = np.transpose(sampled_obs_B_T_dims, transpose_axes) + computed_images = np.clip(computed_float_obs_B_T_dims, 0.0, 255.0).astype( + np.uint8 + ) + computed_images = np.transpose(computed_images, transpose_axes) + # Concat sampled and computed images along the height axis (3) such that + # real images show below respective predicted ones. + # (B, T, C, h, w) + sampled_vs_computed_images = np.concatenate( + [computed_images, sampled_obs_B_T_dims], + axis=-1, # concat on width axis (looks nicer) + ) + # Add grayscale dim, if necessary. + if len(sampled_obs_B_T_dims.shape) == 2 + 2: + sampled_vs_computed_images = np.expand_dims(sampled_vs_computed_images, -1) + + metrics.log_value( + metrics_key, + sampled_vs_computed_images, + reduce=None, # No reduction, we want the obs tensor to stay in-tact. + window=1, + ) + + +def _report_rewards( + *, + metrics, + computed_rewards, + sampled_rewards, + metrics_key, +): + mse_sampled_vs_computed_rewards = np.mean( + np.square(computed_rewards - sampled_rewards) + ) + mse_sampled_vs_computed_rewards = np.mean(mse_sampled_vs_computed_rewards) + metrics.log_value( + metrics_key, + mse_sampled_vs_computed_rewards, + window=1, + ) + + +def _report_continues( + *, + metrics, + computed_continues, + sampled_continues, + metrics_key, +): + # Continue MSE. 
+ mse_sampled_vs_computed_continues = np.mean( + np.square( + computed_continues - sampled_continues.astype(computed_continues.dtype) + ) + ) + metrics.log_value( + metrics_key, + mse_sampled_vs_computed_continues, + window=1, + ) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..913c1b77198e0ee941198fdfefbd8bc29ef875cf --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/__init__.py @@ -0,0 +1,23 @@ +from ray.rllib.algorithms.impala.impala import ( + IMPALA, + IMPALAConfig, + Impala, + ImpalaConfig, +) +from ray.rllib.algorithms.impala.impala_tf_policy import ( + ImpalaTF1Policy, + ImpalaTF2Policy, +) +from ray.rllib.algorithms.impala.impala_torch_policy import ImpalaTorchPolicy + +__all__ = [ + "IMPALA", + "IMPALAConfig", + # @OldAPIStack + "ImpalaTF1Policy", + "ImpalaTF2Policy", + "ImpalaTorchPolicy", + # Deprecated names (lowercase) + "ImpalaConfig", + "Impala", +] diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/__pycache__/impala.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/__pycache__/impala.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4c1676ec6dd734f8c0375b14bb15504e9536b743 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/__pycache__/impala.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/__pycache__/impala_learner.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/__pycache__/impala_learner.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6a69353656329c85d7524dd5cd68b5e30537f94d Binary files /dev/null and 
b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/__pycache__/impala_learner.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/__pycache__/impala_tf_policy.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/__pycache__/impala_tf_policy.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e0a70eace69986b103bdfe025adc200952a75cf1 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/__pycache__/impala_tf_policy.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/__pycache__/impala_torch_policy.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/__pycache__/impala_torch_policy.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bb00d80a8aa8864f2f25fafbc1dfe38e3653f229 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/__pycache__/impala_torch_policy.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/__pycache__/vtrace_tf.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/__pycache__/vtrace_tf.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f42b72d297ff8c319eb74540c659864cf6f701e5 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/__pycache__/vtrace_tf.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/impala.py b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/impala.py new file mode 100644 index 0000000000000000000000000000000000000000..ad1436ac81ee75ca3776d671556aacdb5371c650 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/impala.py @@ -0,0 +1,1362 @@ +import copy +import functools +import logging +import 
queue +import time +from typing import Dict, List, Optional, Set, Tuple, Type, Union + +import numpy as np +import tree # pip install dm_tree + +import ray +from ray import ObjectRef +from ray.rllib import SampleBatch +from ray.rllib.algorithms.algorithm import Algorithm +from ray.rllib.algorithms.algorithm_config import AlgorithmConfig, NotProvided +from ray.rllib.connectors.learner import AddOneTsToEpisodesAndTruncate +from ray.rllib.core import ( + COMPONENT_ENV_TO_MODULE_CONNECTOR, + COMPONENT_MODULE_TO_ENV_CONNECTOR, +) +from ray.rllib.core.rl_module.rl_module import RLModuleSpec +from ray.rllib.execution.buffers.mixin_replay_buffer import MixInMultiAgentReplayBuffer +from ray.rllib.execution.learner_thread import LearnerThread +from ray.rllib.execution.multi_gpu_learner_thread import MultiGPULearnerThread +from ray.rllib.policy.policy import Policy +from ray.rllib.policy.sample_batch import concat_samples +from ray.rllib.utils.annotations import OldAPIStack, override +from ray.rllib.utils.deprecation import DEPRECATED_VALUE, deprecation_warning +from ray.rllib.utils.metrics import ( + ALL_MODULES, + ENV_RUNNER_RESULTS, + LEARNER_GROUP, + LEARNER_RESULTS, + LEARNER_UPDATE_TIMER, + MEAN_NUM_EPISODE_LISTS_RECEIVED, + MEAN_NUM_LEARNER_GROUP_RESULTS_RECEIVED, + MEAN_NUM_LEARNER_GROUP_UPDATE_CALLED, + NUM_AGENT_STEPS_SAMPLED, + NUM_AGENT_STEPS_TRAINED, + NUM_ENV_STEPS_SAMPLED, + NUM_ENV_STEPS_SAMPLED_LIFETIME, + NUM_ENV_STEPS_TRAINED, + NUM_ENV_STEPS_TRAINED_LIFETIME, + NUM_MODULE_STEPS_TRAINED, + NUM_SYNCH_WORKER_WEIGHTS, + NUM_TRAINING_STEP_CALLS_SINCE_LAST_SYNCH_WORKER_WEIGHTS, + SYNCH_WORKER_WEIGHTS_TIMER, + SAMPLE_TIMER, + TIMERS, +) +from ray.rllib.utils.metrics.learner_info import LearnerInfoBuilder +from ray.rllib.utils.replay_buffers.multi_agent_replay_buffer import ReplayMode +from ray.rllib.utils.replay_buffers.replay_buffer import _ALL_POLICIES +from ray.rllib.utils.schedules.scheduler import Scheduler +from ray.rllib.utils.typing import ( + 
LearningRateOrSchedule, + PartialAlgorithmConfigDict, + PolicyID, + ResultDict, + SampleBatchType, +) +from ray.tune.execution.placement_groups import PlacementGroupFactory + + +logger = logging.getLogger(__name__) + + +LEARNER_RESULTS_CURR_ENTROPY_COEFF_KEY = "curr_entropy_coeff" + + +class IMPALAConfig(AlgorithmConfig): + """Defines a configuration class from which an Impala can be built. + + .. testcode:: + + from ray.rllib.algorithms.impala import IMPALAConfig + + config = ( + IMPALAConfig() + .environment("CartPole-v1") + .env_runners(num_env_runners=1) + .training(lr=0.0003, train_batch_size_per_learner=512) + .learners(num_learners=1) + ) + # Build a Algorithm object from the config and run 1 training iteration. + algo = config.build() + algo.train() + del algo + + .. testcode:: + + from ray.rllib.algorithms.impala import IMPALAConfig + from ray import air + from ray import tune + + config = ( + IMPALAConfig() + .environment("CartPole-v1") + .env_runners(num_env_runners=1) + .training(lr=tune.grid_search([0.0001, 0.0002]), grad_clip=20.0) + .learners(num_learners=1) + ) + # Run with tune. + tune.Tuner( + "IMPALA", + param_space=config, + run_config=air.RunConfig(stop={"training_iteration": 1}), + ).fit() + """ + + def __init__(self, algo_class=None): + """Initializes a IMPALAConfig instance.""" + self.exploration_config = { # @OldAPIstack + # The Exploration class to use. In the simplest case, this is the name + # (str) of any class present in the `rllib.utils.exploration` package. + # You can also provide the python class directly or the full location + # of your class (e.g. "ray.rllib.utils.exploration.epsilon_greedy. + # EpsilonGreedy"). + "type": "StochasticSampling", + # Add constructor kwargs here (if any). 
+ } + + super().__init__(algo_class=algo_class or IMPALA) + + # fmt: off + # __sphinx_doc_begin__ + + # IMPALA specific settings: + self.vtrace = True + self.vtrace_clip_rho_threshold = 1.0 + self.vtrace_clip_pg_rho_threshold = 1.0 + self.learner_queue_size = 3 + self.timeout_s_sampler_manager = 0.0 + self.timeout_s_aggregator_manager = 0.0 + self.broadcast_interval = 1 + self.num_gpu_loader_threads = 8 + + self.grad_clip = 40.0 + # Note: Only when using enable_rl_module_and_learner=True can the clipping mode + # be configured by the user. On the old API stack, RLlib will always clip by + # global_norm, no matter the value of `grad_clip_by`. + self.grad_clip_by = "global_norm" + + self.vf_loss_coeff = 0.5 + self.entropy_coeff = 0.01 + + # Override some of AlgorithmConfig's default values with IMPALA-specific values. + self.num_learners = 1 + self.num_aggregator_actors_per_learner = 0 + self.rollout_fragment_length = 50 + self.train_batch_size = 500 # @OldAPIstack + self.num_env_runners = 2 + self.lr = 0.0005 + self.min_time_s_per_iteration = 10 + # __sphinx_doc_end__ + # fmt: on + + # IMPALA takes care of its own EnvRunner (weights, connector, metrics) synching. + self._dont_auto_sync_env_runner_states = True + + self.lr_schedule = None # @OldAPIStack + self.entropy_coeff_schedule = None # @OldAPIStack + self.num_multi_gpu_tower_stacks = 1 # @OldAPIstack + self.minibatch_buffer_size = 1 # @OldAPIstack + self.replay_proportion = 0.0 # @OldAPIstack + self.replay_buffer_num_slots = 0 # @OldAPIstack + self.learner_queue_timeout = 300 # @OldAPIstack + self.opt_type = "adam" # @OldAPIstack + self.decay = 0.99 # @OldAPIstack + self.momentum = 0.0 # @OldAPIstack + self.epsilon = 0.1 # @OldAPIstack + self._separate_vf_optimizer = False # @OldAPIstack + self._lr_vf = 0.0005 # @OldAPIstack + self.num_gpus = 1 # @OldAPIstack + self._tf_policy_handles_more_than_one_loss = True # @OldAPIstack + + # Deprecated settings. 
+ self.num_aggregation_workers = DEPRECATED_VALUE + self.max_requests_in_flight_per_aggregator_worker = DEPRECATED_VALUE + + @override(AlgorithmConfig) + def training( + self, + *, + vtrace: Optional[bool] = NotProvided, + vtrace_clip_rho_threshold: Optional[float] = NotProvided, + vtrace_clip_pg_rho_threshold: Optional[float] = NotProvided, + num_gpu_loader_threads: Optional[int] = NotProvided, + num_multi_gpu_tower_stacks: Optional[int] = NotProvided, + minibatch_buffer_size: Optional[int] = NotProvided, + replay_proportion: Optional[float] = NotProvided, + replay_buffer_num_slots: Optional[int] = NotProvided, + learner_queue_size: Optional[int] = NotProvided, + learner_queue_timeout: Optional[float] = NotProvided, + timeout_s_sampler_manager: Optional[float] = NotProvided, + timeout_s_aggregator_manager: Optional[float] = NotProvided, + broadcast_interval: Optional[int] = NotProvided, + grad_clip: Optional[float] = NotProvided, + opt_type: Optional[str] = NotProvided, + lr_schedule: Optional[List[List[Union[int, float]]]] = NotProvided, + decay: Optional[float] = NotProvided, + momentum: Optional[float] = NotProvided, + epsilon: Optional[float] = NotProvided, + vf_loss_coeff: Optional[float] = NotProvided, + entropy_coeff: Optional[LearningRateOrSchedule] = NotProvided, + entropy_coeff_schedule: Optional[List[List[Union[int, float]]]] = NotProvided, + _separate_vf_optimizer: Optional[bool] = NotProvided, + _lr_vf: Optional[float] = NotProvided, + # Deprecated args. + num_aggregation_workers=DEPRECATED_VALUE, + max_requests_in_flight_per_aggregator_worker=DEPRECATED_VALUE, + **kwargs, + ) -> "IMPALAConfig": + """Sets the training related configuration. + + Args: + vtrace: V-trace params (see vtrace_tf/torch.py). + vtrace_clip_rho_threshold: + vtrace_clip_pg_rho_threshold: + num_gpu_loader_threads: The number of GPU-loader threads (per Learner + worker), used to load incoming (CPU) batches to the GPU, if applicable. 
+ The incoming batches are produced by each Learner's LearnerConnector + pipeline. After loading the batches on the GPU, the threads place them + on yet another queue for the Learner thread (only one per Learner + worker) to pick up and perform `forward_train/loss` computations. + num_multi_gpu_tower_stacks: For each stack of multi-GPU towers, how many + slots should we reserve for parallel data loading? Set this to >1 to + load data into GPUs in parallel. This will increase GPU memory usage + proportionally with the number of stacks. + Example: + 2 GPUs and `num_multi_gpu_tower_stacks=3`: + - One tower stack consists of 2 GPUs, each with a copy of the + model/graph. + - Each of the stacks will create 3 slots for batch data on each of its + GPUs, increasing memory requirements on each GPU by 3x. + - This enables us to preload data into these stacks while another stack + is performing gradient calculations. + minibatch_buffer_size: How many train batches should be retained for + minibatching. This conf only has an effect if `num_epochs > 1`. + replay_proportion: Set >0 to enable experience replay. Saved samples will + be replayed with a p:1 proportion to new data samples. + replay_buffer_num_slots: Number of sample batches to store for replay. + The number of transitions saved total will be + (replay_buffer_num_slots * rollout_fragment_length). + learner_queue_size: Max queue size for train batches feeding into the + learner. + learner_queue_timeout: Wait for train batches to be available in minibatch + buffer queue this many seconds. This may need to be increased e.g. when + training with a slow environment. + timeout_s_sampler_manager: The timeout for waiting for sampling results + for workers -- typically if this is too low, the manager won't be able + to retrieve ready sampling results. 
+ timeout_s_aggregator_manager: The timeout for waiting for replay worker + results -- typically if this is too low, the manager won't be able to + retrieve ready replay requests. + broadcast_interval: Number of training step calls before weights are + broadcasted to rollout workers that are sampled during any iteration. + grad_clip: If specified, clip the global norm of gradients by this amount. + opt_type: Either "adam" or "rmsprop". + lr_schedule: Learning rate schedule. In the format of + [[timestep, lr-value], [timestep, lr-value], ...] + Intermediary timesteps will be assigned to interpolated learning rate + values. A schedule should normally start from timestep 0. + decay: Decay setting for the RMSProp optimizer, in case `opt_type=rmsprop`. + momentum: Momentum setting for the RMSProp optimizer, in case + `opt_type=rmsprop`. + epsilon: Epsilon setting for the RMSProp optimizer, in case + `opt_type=rmsprop`. + vf_loss_coeff: Coefficient for the value function term in the loss function. + entropy_coeff: Coefficient for the entropy regularizer term in the loss + function. + entropy_coeff_schedule: Decay schedule for the entropy regularizer. + _separate_vf_optimizer: Set this to true to have two separate optimizers + optimize the policy-and value networks. Only supported for some + algorithms (APPO, IMPALA) on the old API stack. + _lr_vf: If _separate_vf_optimizer is True, define separate learning rate + for the value network. + + Returns: + This updated AlgorithmConfig object. + """ + if num_aggregation_workers != DEPRECATED_VALUE: + deprecation_warning( + old="config.training(num_aggregation_workers=..)", + help="Aggregator workers are no longer supported on the old API " + "stack! To use aggregation (and GPU pre-loading) on the new API " + "stack, activate the new API stack, then set " + "`config.learners(num_aggregator_actors_per_learner=..)`. 
Good " + "choices are normally 1 or 2, but this depends on your overall " + "setup, especially your `EnvRunner` throughput.", + error=True, + ) + if max_requests_in_flight_per_aggregator_worker != DEPRECATED_VALUE: + deprecation_warning( + old="config.training(max_requests_in_flight_per_aggregator_worker=..)", + help="Aggregator workers are no longer supported on the old API " + "stack! To use aggregation (and GPU pre-loading) on the new API " + "stack, activate the new API stack and THEN set " + "`config.learners(max_requests_in_flight_per_aggregator_actor=..)" + "`.", + error=True, + ) + + # Pass kwargs onto super's `training()` method. + super().training(**kwargs) + + if vtrace is not NotProvided: + self.vtrace = vtrace + if vtrace_clip_rho_threshold is not NotProvided: + self.vtrace_clip_rho_threshold = vtrace_clip_rho_threshold + if vtrace_clip_pg_rho_threshold is not NotProvided: + self.vtrace_clip_pg_rho_threshold = vtrace_clip_pg_rho_threshold + if num_gpu_loader_threads is not NotProvided: + self.num_gpu_loader_threads = num_gpu_loader_threads + if num_multi_gpu_tower_stacks is not NotProvided: + self.num_multi_gpu_tower_stacks = num_multi_gpu_tower_stacks + if minibatch_buffer_size is not NotProvided: + self.minibatch_buffer_size = minibatch_buffer_size + if replay_proportion is not NotProvided: + self.replay_proportion = replay_proportion + if replay_buffer_num_slots is not NotProvided: + self.replay_buffer_num_slots = replay_buffer_num_slots + if learner_queue_size is not NotProvided: + self.learner_queue_size = learner_queue_size + if learner_queue_timeout is not NotProvided: + self.learner_queue_timeout = learner_queue_timeout + if broadcast_interval is not NotProvided: + self.broadcast_interval = broadcast_interval + if timeout_s_sampler_manager is not NotProvided: + self.timeout_s_sampler_manager = timeout_s_sampler_manager + if timeout_s_aggregator_manager is not NotProvided: + self.timeout_s_aggregator_manager = timeout_s_aggregator_manager + if 
grad_clip is not NotProvided: + self.grad_clip = grad_clip + if opt_type is not NotProvided: + self.opt_type = opt_type + if lr_schedule is not NotProvided: + self.lr_schedule = lr_schedule + if decay is not NotProvided: + self.decay = decay + if momentum is not NotProvided: + self.momentum = momentum + if epsilon is not NotProvided: + self.epsilon = epsilon + if vf_loss_coeff is not NotProvided: + self.vf_loss_coeff = vf_loss_coeff + if entropy_coeff is not NotProvided: + self.entropy_coeff = entropy_coeff + if entropy_coeff_schedule is not NotProvided: + self.entropy_coeff_schedule = entropy_coeff_schedule + if _separate_vf_optimizer is not NotProvided: + self._separate_vf_optimizer = _separate_vf_optimizer + if _lr_vf is not NotProvided: + self._lr_vf = _lr_vf + + return self + + @override(AlgorithmConfig) + def validate(self) -> None: + # Call the super class' validation method first. + super().validate() + + # IMPALA and APPO need vtrace (A3C Policies no longer exist). + if not self.vtrace: + self._value_error( + "IMPALA and APPO do NOT support vtrace=False anymore! Set " + "`config.training(vtrace=True)`." + ) + + # New API stack checks. + if self.enable_env_runner_and_connector_v2: + # Does NOT support aggregation workers yet or a mixin replay buffer. + if self.replay_ratio != 0.0: + self._value_error( + "The new API stack in combination with the new EnvRunner API " + "does NOT support a mixin replay buffer yet for " + f"{self} (set `config.replay_proportion` to 0.0)!" + ) + # `lr_schedule` checking. + if self.lr_schedule is not None: + self._value_error( + "`lr_schedule` is deprecated and must be None! Use the " + "`lr` setting to setup a schedule." + ) + # Entropy coeff schedule checking. + if self.entropy_coeff_schedule is not None: + self._value_error( + "`entropy_coeff_schedule` is deprecated and must be None! Use the " + "`entropy_coeff` setting to setup a schedule." 
+ ) + Scheduler.validate( + fixed_value_or_schedule=self.entropy_coeff, + setting_name="entropy_coeff", + description="entropy coefficient", + ) + # Learner API specific checks. + # GPU-bound single Learner must be local (faster than remote Learner, + # b/c GPU can update in parallel through the learner thread). + if self.num_gpus_per_learner > 0 and self.num_learners == 1: + self._value_error( + "When running with 1 GPU Learner, this Learner should be local! " + "Set `config.learners(num_learners=0)` to configure a local " + "Learner instance." + ) + # CPU-bound single Learner must be remote (faster than local Learner, + # b/c learner thread would compete with main thread for resources). + elif self.num_gpus_per_learner == 0 and self.num_learners == 0: + self._value_error( + "When running with a CPU Learner, this Learner should be remote! " + "Set `config.learners(num_learners=1)` to configure a single " + "remote Learner instance." + ) + + if self.minibatch_size is not None and not ( + (self.minibatch_size % self.rollout_fragment_length == 0) + and self.minibatch_size <= self.total_train_batch_size + ): + self._value_error( + f"`minibatch_size` ({self._minibatch_size}) must either be None " + "or a multiple of `rollout_fragment_length` " + f"({self.rollout_fragment_length}) while at the same time smaller " + "than or equal to `total_train_batch_size` " + f"({self.total_train_batch_size})!" + ) + # Old API stack checks. + else: + if isinstance(self.entropy_coeff, float) and self.entropy_coeff < 0.0: + self._value_error("`entropy_coeff` must be >= 0.0") + + # If two separate optimizers/loss terms used for tf, must also set + # `_tf_policy_handles_more_than_one_loss` to True. 
+ if ( + self.framework_str in ["tf", "tf2"] + and self._separate_vf_optimizer is True + and self._tf_policy_handles_more_than_one_loss is False + ): + self._value_error( + "`_tf_policy_handles_more_than_one_loss` must be set to True, for " + "TFPolicy to support more than one loss term/optimizer! Try setting " + "config.training(_tf_policy_handles_more_than_one_loss=True)." + ) + + @property + def replay_ratio(self) -> float: + """Returns replay ratio (between 0.0 and 1.0) based off self.replay_proportion. + + Formula: ratio = 1 / proportion + """ + return (1 / self.replay_proportion) if self.replay_proportion > 0 else 0.0 + + @override(AlgorithmConfig) + def get_default_learner_class(self): + if self.framework_str == "torch": + from ray.rllib.algorithms.impala.torch.impala_torch_learner import ( + IMPALATorchLearner, + ) + + return IMPALATorchLearner + elif self.framework_str in ["tf2", "tf"]: + raise ValueError( + "TensorFlow is no longer supported on the new API stack! " + "Use `framework='torch'`." + ) + else: + raise ValueError( + f"The framework {self.framework_str} is not supported. " + "Use `framework='torch'`." + ) + + @override(AlgorithmConfig) + def get_default_rl_module_spec(self) -> RLModuleSpec: + if self.framework_str == "torch": + from ray.rllib.algorithms.ppo.torch.default_ppo_torch_rl_module import ( + DefaultPPOTorchRLModule, + ) + + return RLModuleSpec(module_class=DefaultPPOTorchRLModule) + else: + raise ValueError( + f"The framework {self.framework_str} is not supported. " + "Use either 'torch' or 'tf2'." + ) + + @override(AlgorithmConfig) + def build_learner_connector( + self, + input_observation_space, + input_action_space, + device=None, + ): + connector = super().build_learner_connector( + input_observation_space, + input_action_space, + device, + ) + # Extend all episodes by one artificial timestep to allow the value function net + # to compute the bootstrap values (and add a mask to the batch to know, which + # slots to mask out). 
+ if self.add_default_connectors_to_learner_pipeline: + connector.prepend(AddOneTsToEpisodesAndTruncate()) + return connector + + +ImpalaConfig = IMPALAConfig + + +class IMPALA(Algorithm): + """Importance weighted actor/learner architecture (IMPALA) Algorithm + + == Overview of data flow in IMPALA == + 1. Policy evaluation in parallel across `num_env_runners` actors produces + batches of size `rollout_fragment_length * num_envs_per_env_runner`. + 2. If enabled, the replay buffer stores and produces batches of size + `rollout_fragment_length * num_envs_per_env_runner`. + 3. If enabled, the minibatch ring buffer stores and replays batches of + size `train_batch_size` up to `num_epochs` times per batch. + 4. The learner thread executes data parallel SGD across `num_gpus` GPUs + on batches of size `train_batch_size`. + """ + + @classmethod + @override(Algorithm) + def get_default_config(cls) -> AlgorithmConfig: + return IMPALAConfig() + + @classmethod + @override(Algorithm) + def get_default_policy_class( + cls, config: AlgorithmConfig + ) -> Optional[Type[Policy]]: + if config.framework_str == "torch": + from ray.rllib.algorithms.impala.impala_torch_policy import ( + ImpalaTorchPolicy, + ) + + return ImpalaTorchPolicy + + elif config.framework_str == "tf": + from ray.rllib.algorithms.impala.impala_tf_policy import ( + ImpalaTF1Policy, + ) + + return ImpalaTF1Policy + else: + from ray.rllib.algorithms.impala.impala_tf_policy import ( + ImpalaTF2Policy, + ) + + return ImpalaTF2Policy + + @override(Algorithm) + def setup(self, config: AlgorithmConfig): + super().setup(config) + + # Queue of data to be sent to the Learner. + self.data_to_place_on_learner = [] + self._batch_being_built = [] # @OldAPIStack + + # Create extra aggregation workers and assign each rollout worker to + # one of them. 
+ self._episode_packs_being_built = [] + self._ma_batches_being_built: Dict[int, list] = { + i: [] for i in range(self.config.num_learners or 1) + } + + # Create our local mixin buffer. + if not self.config.enable_rl_module_and_learner: + self.local_mixin_buffer = MixInMultiAgentReplayBuffer( + capacity=( + self.config.replay_buffer_num_slots + if self.config.replay_buffer_num_slots > 0 + else 1 + ), + replay_ratio=self.config.replay_ratio, + replay_mode=ReplayMode.LOCKSTEP, + ) + + # This variable is used to keep track of the statistics from the most recent + # update of the learner group + self._results = {} + + if not self.config.enable_rl_module_and_learner: + # Create and start the learner thread. + self._learner_thread = make_learner_thread(self.env_runner, self.config) + self._learner_thread.start() + + @override(Algorithm) + def training_step(self): + # Old API stack. + if not self.config.enable_rl_module_and_learner: + return self._training_step_old_api_stack() + + do_async_updates = self.config.num_learners > 0 + + # Asynchronously request all EnvRunners to sample and return their current + # (e.g. ConnectorV2) states and sampling metrics/stats. + # Note that each item in `episode_refs` is a reference to a list of Episodes. + with self.metrics.log_time((TIMERS, SAMPLE_TIMER)): + ( + episode_refs, + connector_states, + env_runner_metrics, + env_runner_indices_to_update, + ) = self._sample_and_get_connector_states() + # Reduce EnvRunner metrics over the n EnvRunners. + self.metrics.merge_and_log_n_dicts( + env_runner_metrics, + key=ENV_RUNNER_RESULTS, + ) + + # Log the average number of sample results (list of episodes) received. + self.metrics.log_value(MEAN_NUM_EPISODE_LISTS_RECEIVED, len(episode_refs)) + + time.sleep(0.01) + + # "Batch" collected episode refs into groups, such that exactly + # `total_train_batch_size` timesteps are sent to + # `LearnerGroup.update_from_episodes()`. 
+ if self.config.num_aggregator_actors_per_learner > 0: + data_packages_for_aggregators = self._pre_queue_episode_refs( + episode_refs, package_size=self.config.train_batch_size_per_learner + ) + ma_batches_refs_remote_results = ( + self._aggregator_actor_manager.fetch_ready_async_reqs( + timeout_seconds=0.0, + return_obj_refs=True, + tags="batches", + ) + ) + ma_batches_refs = [] + for call_result in ma_batches_refs_remote_results: + ma_batches_refs.append((call_result.actor_id, call_result.get())) + + while data_packages_for_aggregators: + + def _func(actor, p): + return actor.get_batch(p) + + num_agg = self.config.num_aggregator_actors_per_learner * ( + self.config.num_learners or 1 + ) + packs = data_packages_for_aggregators[:num_agg] + self._aggregator_actor_manager.foreach_actor_async( + func=[functools.partial(_func, p=p) for p in packs], + tag="batches", + ) + data_packages_for_aggregators = data_packages_for_aggregators[num_agg:] + + # Get n lists of m ObjRef[MABatch] (m=num_learners) to perform n calls to + # all learner workers with the already GPU-located batches. + data_packages_for_learner_group = self._pre_queue_batch_refs( + ma_batches_refs + ) + + else: + data_packages_for_learner_group = self._pre_queue_episode_refs( + episode_refs, package_size=self.config.total_train_batch_size + ) + + time.sleep(0.01) + + # Call the LearnerGroup's `update_from_episodes` method. 
+ with self.metrics.log_time((TIMERS, LEARNER_UPDATE_TIMER)): + self.metrics.log_value( + key=MEAN_NUM_LEARNER_GROUP_UPDATE_CALLED, + value=len(data_packages_for_learner_group), + ) + rl_module_state = None + num_learner_group_results_received = 0 + + for batch_ref_or_episode_list_ref in data_packages_for_learner_group: + return_state = ( + self.metrics.peek( + NUM_TRAINING_STEP_CALLS_SINCE_LAST_SYNCH_WORKER_WEIGHTS, + default=0, + ) + >= self.config.broadcast_interval + ) + timesteps = { + NUM_ENV_STEPS_SAMPLED_LIFETIME: self.metrics.peek( + (ENV_RUNNER_RESULTS, NUM_ENV_STEPS_SAMPLED_LIFETIME), default=0 + ), + NUM_ENV_STEPS_TRAINED_LIFETIME: self.metrics.peek( + (LEARNER_RESULTS, ALL_MODULES, NUM_ENV_STEPS_TRAINED_LIFETIME), + default=0, + ), + } + if self.config.num_aggregator_actors_per_learner > 0: + learner_results = self.learner_group.update_from_batch( + batch=batch_ref_or_episode_list_ref, + async_update=do_async_updates, + return_state=return_state, + timesteps=timesteps, + num_epochs=self.config.num_epochs, + minibatch_size=self.config.minibatch_size, + shuffle_batch_per_epoch=self.config.shuffle_batch_per_epoch, + ) + else: + learner_results = self.learner_group.update_from_episodes( + episodes=batch_ref_or_episode_list_ref, + async_update=do_async_updates, + return_state=return_state, + timesteps=timesteps, + num_epochs=self.config.num_epochs, + minibatch_size=self.config.minibatch_size, + shuffle_batch_per_epoch=self.config.shuffle_batch_per_epoch, + ) + # TODO (sven): Rename this metric into a more fitting name: ex. 
+ # `NUM_LEARNER_UPDATED_SINCE_LAST_WEIGHTS_SYNC` + self.metrics.log_value( + NUM_TRAINING_STEP_CALLS_SINCE_LAST_SYNCH_WORKER_WEIGHTS, + 1, + reduce="sum", + ) + if not do_async_updates: + learner_results = [learner_results] + + for results_from_n_learners in learner_results: + if not results_from_n_learners[0]: + continue + num_learner_group_results_received += 1 + for r in results_from_n_learners: + rl_module_state = r.pop( + "_rl_module_state_after_update", rl_module_state + ) + self.metrics.merge_and_log_n_dicts( + stats_dicts=results_from_n_learners, + key=LEARNER_RESULTS, + ) + self.metrics.log_value( + key=MEAN_NUM_LEARNER_GROUP_RESULTS_RECEIVED, + value=num_learner_group_results_received, + ) + + # Update LearnerGroup's own stats. + self.metrics.log_dict(self.learner_group.get_stats(), key=LEARNER_GROUP) + + time.sleep(0.01) + + # Figure out, whether we should sync/broadcast the (remote) EnvRunner states. + # Note: `learner_results` is a List of n (num async calls) Lists of m + # (num Learner workers) ResultDicts each. + if rl_module_state is not None: + self.metrics.set_value( + NUM_TRAINING_STEP_CALLS_SINCE_LAST_SYNCH_WORKER_WEIGHTS, 0 + ) + self.metrics.log_value(NUM_SYNCH_WORKER_WEIGHTS, 1, reduce="sum") + with self.metrics.log_time((TIMERS, SYNCH_WORKER_WEIGHTS_TIMER)): + self.env_runner_group.sync_env_runner_states( + config=self.config, + connector_states=connector_states, + rl_module_state=rl_module_state, + ) + + time.sleep(0.01) + + def _sample_and_get_connector_states(self): + def _remote_sample_get_state_and_metrics(_worker): + _episodes = _worker.sample() + # Get the EnvRunner's connector states. + _connector_states = _worker.get_state( + components=[ + COMPONENT_ENV_TO_MODULE_CONNECTOR, + COMPONENT_MODULE_TO_ENV_CONNECTOR, + ] + ) + _metrics = _worker.get_metrics() + # Return episode lists by reference so we don't have to send them to the + # main algo process, but to the Learner workers directly. 
+ return ray.put(_episodes), _connector_states, _metrics + + env_runner_indices_to_update = set() + episode_refs = [] + connector_states = [] + env_runner_metrics = [] + num_healthy_remote_workers = self.env_runner_group.num_healthy_remote_workers() + + # Perform asynchronous sampling on all (healthy) remote rollout workers. + if num_healthy_remote_workers > 0: + async_results: List[ + Tuple[int, ObjectRef] + ] = self.env_runner_group.fetch_ready_async_reqs( + timeout_seconds=self.config.timeout_s_sampler_manager, + return_obj_refs=False, + ) + self.env_runner_group.foreach_env_runner_async( + _remote_sample_get_state_and_metrics + ) + # Get results from the n different async calls and store those EnvRunner + # indices we should update. + results = [] + for r in async_results: + env_runner_indices_to_update.add(r[0]) + results.append(r[1]) + + for (episodes, states, metrics) in results: + episode_refs.append(episodes) + connector_states.append(states) + env_runner_metrics.append(metrics) + # Sample from the local EnvRunner. + else: + episodes = self.env_runner.sample() + env_runner_metrics = [self.env_runner.get_metrics()] + episode_refs = [ray.put(episodes)] + connector_states = [ + self.env_runner.get_state( + components=[ + COMPONENT_ENV_TO_MODULE_CONNECTOR, + COMPONENT_MODULE_TO_ENV_CONNECTOR, + ] + ) + ] + + return ( + episode_refs, + connector_states, + env_runner_metrics, + env_runner_indices_to_update, + ) + + def _pre_queue_episode_refs( + self, episode_refs: List[ObjectRef], package_size: int + ) -> List[List[ObjectRef]]: + # Each element in this list is itself a list of ObjRef[Episodes]. + # Each ObjRef was returned by one EnvRunner from a single sample() call. 
+ episodes: List[List[ObjectRef]] = [] + + for ref in episode_refs: + self._episode_packs_being_built.append(ref) + if ( + len(self._episode_packs_being_built) + * self.config.num_envs_per_env_runner + * self.config.get_rollout_fragment_length() + >= package_size + ): + episodes.append(self._episode_packs_being_built) + self._episode_packs_being_built = [] + + return episodes + + def _pre_queue_batch_refs( + self, batch_refs: List[Tuple[int, ObjectRef]] + ) -> List[List[ObjectRef]]: + # `batch_refs` is a list of tuple(aggregator_actor_id, ObjRef[MABatch]). + + # Each ObjRef[MABatch] was returned by one AggregatorActor from a single + # `get_batch()` call and the underlying MABatch is already located on a + # particular GPU (matching one particular Learner). + for agg_actor_id, ma_batch_ref in batch_refs: + learner_actor_id = self._aggregator_actor_to_learner[agg_actor_id] + self._ma_batches_being_built[learner_actor_id].append(ma_batch_ref) + + # Construct a n-group of batches (n=num_learners) as long as we still have + # at least one batch per learner in our queue. + batch_refs_for_learner_group: List[List[ObjectRef]] = [] + while all( + learner_list for learner_list in self._ma_batches_being_built.values() + ): + batch_refs_for_learner_group.append( + [ + learner_list.pop(0) + for learner_list in self._ma_batches_being_built.values() + ] + ) + + return batch_refs_for_learner_group + + @classmethod + @override(Algorithm) + def default_resource_request( + cls, + config: Union[AlgorithmConfig, PartialAlgorithmConfigDict], + ): + if isinstance(config, AlgorithmConfig): + cf: IMPALAConfig = config + else: + cf: IMPALAConfig = cls.get_default_config().update_from_dict(config) + + eval_config = cf.get_evaluation_config_object() + + bundles = [] + + # Main process (old API stack). + if not cf.enable_rl_module_and_learner: + bundles.append( + { + "CPU": cf.num_cpus_for_main_process, + "GPU": 0 if cf._fake_gpus else cf.num_gpus, + } + ) + # Main process (no local learner). 
+ elif cf.num_learners > 0: + bundles.append({"CPU": cf.num_cpus_for_main_process}) + # Main process (local learner). + else: + bundles.append( + { + "CPU": max( + cf.num_cpus_for_main_process, + cf.num_cpus_per_learner if cf.num_gpus_per_learner == 0 else 0, + ), + "GPU": max( + 0, + cf.num_gpus_per_learner + - 0.01 * cf.num_aggregator_actors_per_learner, + ), + } + ) + # Aggregation actors (for the local learner). + bundles += [ + {"CPU": 1, "GPU": 0.01 if cf.num_gpus_per_learner > 0 else 0} + for _ in range(cf.num_aggregator_actors_per_learner) + ] + + # EnvRunners. + bundles += [ + { + "CPU": cf.num_cpus_per_env_runner, + "GPU": cf.num_gpus_per_env_runner, + **cf.custom_resources_per_env_runner, + } + for _ in range(cf.num_env_runners) + ] + + # Evaluation (remote) workers. + bundles += ( + [ + { + # Note: The local eval worker is located on the driver + # CPU or not even created iff >0 eval workers. + "CPU": eval_config.num_cpus_per_env_runner, + "GPU": eval_config.num_gpus_per_env_runner, + **eval_config.custom_resources_per_env_runner, + } + for _ in range(cf.evaluation_num_env_runners) + ] + if cf.evaluation_interval + else [] + ) + # TODO (avnishn): Remove this once we have a way to extend placement group + # factories. + # Only if we have actual (remote) learner workers. In case of a local learner, + # the resource has already been taken care of above. + if cf.enable_rl_module_and_learner and cf.num_learners > 0: + bundles += cls._get_learner_bundles(cf) + + # Return PlacementGroupFactory containing all needed resources + # (already properly defined as device bundles). + return PlacementGroupFactory( + bundles=bundles, + strategy=cf.placement_strategy, + ) + + @OldAPIStack + def _training_step_old_api_stack(self): + # First, check, whether our learner thread is still healthy. 
+ if not self._learner_thread.is_alive(): + raise RuntimeError("The learner thread died while training!") + + # Get sampled SampleBatches from our workers (by ray references if we use + # tree-aggregation). + unprocessed_sample_batches = self._get_samples_from_workers_old_api_stack( + return_object_refs=False, + ) + # Tag workers that actually produced ready sample batches this iteration. + # Those workers will have to get updated at the end of the iteration. + workers_that_need_updates = { + worker_id for worker_id, _ in unprocessed_sample_batches + } + + # Resolve collected batches here on local process (using the mixin buffer). + batches = self._process_experiences_old_api_stack(unprocessed_sample_batches) + + # Increase sampling counters now that we have the actual SampleBatches on + # the local process (and can measure their sizes). + for batch in batches: + self._counters[NUM_ENV_STEPS_SAMPLED] += batch.count + self._counters[NUM_AGENT_STEPS_SAMPLED] += batch.agent_steps() + # Concatenate single batches into batches of size `total_train_batch_size`. + self._concatenate_batches_and_pre_queue(batches) + # Move train batches (of size `total_train_batch_size`) onto learner queue. + self._place_processed_samples_on_learner_thread_queue() + # Extract most recent train results from learner thread. + train_results = self._process_trained_results() + + # Sync worker weights (only those policies that were actually updated). + with self._timers[SYNCH_WORKER_WEIGHTS_TIMER]: + pids = list(train_results.keys()) + self._update_workers_old_api_stack( + workers_that_need_updates=workers_that_need_updates, + policy_ids=pids, + ) + + # With a training step done, try to bring any aggregators back to life + # if necessary. + # Aggregation workers are stateless, so we do not need to restore any + # state here. 
+ if self._aggregator_actor_manager: + self._aggregator_actor_manager.probe_unhealthy_actors( + timeout_seconds=self.config.env_runner_health_probe_timeout_s, + mark_healthy=True, + ) + + return train_results + + @OldAPIStack + def _get_samples_from_workers_old_api_stack( + self, + return_object_refs: Optional[bool] = False, + ) -> List[Tuple[int, Union[ObjectRef, SampleBatchType]]]: + """Get samples from rollout workers for training. + + Args: + return_object_refs: If True, return ObjectRefs instead of the samples + directly. This is useful when using aggregator workers so that data + collected on rollout workers is directly de referenced on the aggregator + workers instead of first in the driver and then on the aggregator + workers. + + Returns: + a list of tuples of (worker_index, sample batch or ObjectRef to a sample + batch) + + """ + with self._timers[SAMPLE_TIMER]: + # Sample from healthy remote workers by default. If there is no healthy + # worker (either because they have all died, or because there was none to + # begin) check if the local_worker exists. If the local worker has an + # env_instance (either because there are no remote workers or + # self.config.create_env_on_local_worker == True), then sample from the + # local worker. Otherwise just return an empty list. + if self.env_runner_group.num_healthy_remote_workers() > 0: + # Perform asynchronous sampling on all (remote) rollout workers. 
+ self.env_runner_group.foreach_env_runner_async( + lambda worker: worker.sample() + ) + sample_batches: List[ + Tuple[int, ObjectRef] + ] = self.env_runner_group.fetch_ready_async_reqs( + timeout_seconds=self.config.timeout_s_sampler_manager, + return_obj_refs=return_object_refs, + ) + elif self.config.num_env_runners == 0 or ( + self.env_runner and self.env_runner.async_env is not None + ): + # Sampling from the local worker + sample_batch = self.env_runner.sample() + if return_object_refs: + sample_batch = ray.put(sample_batch) + sample_batches = [(0, sample_batch)] + else: + # Not much we can do. Return empty list and wait. + sample_batches = [] + + return sample_batches + + @OldAPIStack + def _process_experiences_old_api_stack( + self, + worker_to_sample_batches: List[Tuple[int, SampleBatch]], + ) -> List[SampleBatchType]: + """Process sample batches directly on the driver, for training. + + Args: + worker_to_sample_batches: List of (worker_id, sample_batch) tuples. + + Returns: + Batches that have been processed by the mixin buffer. + + """ + batches = [b for _, b in worker_to_sample_batches] + processed_batches = [] + + for batch in batches: + assert not isinstance( + batch, ObjectRef + ), "`IMPALA._process_experiences_old_api_stack` can not handle ObjectRefs!" + batch = batch.decompress_if_needed() + # Only make a pass through the buffer, if replay proportion is > 0.0 (and + # we actually have one). + self.local_mixin_buffer.add(batch) + batch = self.local_mixin_buffer.replay(_ALL_POLICIES) + if batch: + processed_batches.append(batch) + + return processed_batches + + @OldAPIStack + def _concatenate_batches_and_pre_queue(self, batches: List[SampleBatch]) -> None: + """Concatenate batches that are being returned from rollout workers + + Args: + batches: List of batches of experiences from EnvRunners. 
+ """ + + def aggregate_into_larger_batch(): + if ( + sum(b.count for b in self._batch_being_built) + >= self.config.total_train_batch_size + ): + batch_to_add = concat_samples(self._batch_being_built) + self.data_to_place_on_learner.append(batch_to_add) + self._batch_being_built = [] + + for batch in batches: + # TODO (sven): Strange bug after a RolloutWorker crash and proper + # restart. The bug is related to (old, non-V2) connectors being used and + # seems to happen inside the AgentCollector's `add_action_reward_next_obs` + # method, at the end of which the number of vf_preds (and all other + # extra action outs) in the batch is one smaller than the number of obs/ + # actions/rewards, which then leads to a malformed train batch. + # IMPALA/APPO crash inside the loss function (during v-trace operations) + # b/c of the resulting shape mismatch. The following if-block prevents + # this from happening and it can be removed once we are on the new API + # stack for good (and use the new connectors and also no longer + # AgentCollectors, RolloutWorkers, Policies, TrajectoryView API, etc..): + if ( + self.config.batch_mode == "truncate_episodes" + and self.config.restart_failed_env_runners + ): + if any( + SampleBatch.VF_PREDS in pb + and ( + pb[SampleBatch.VF_PREDS].shape[0] + != pb[SampleBatch.REWARDS].shape[0] + ) + for pb in batch.policy_batches.values() + ): + continue + + self._batch_being_built.append(batch) + aggregate_into_larger_batch() + + @OldAPIStack + def _learn_on_processed_samples(self) -> ResultDict: + """Update the learner group with the latest batch of processed samples. + + Returns: + Aggregated results from the learner group after an update is completed. + + """ + # Nothing on the queue -> Don't send requests to learner group + # or no results ready (from previous `self.learner_group.update()` calls) for + # reducing. + if not self.data_to_place_on_learner: + return {} + + # There are batches on the queue -> Send them all to the learner group. 
+ batches = self.data_to_place_on_learner[:] + self.data_to_place_on_learner.clear() + + # If there are no learner workers and learning is directly on the driver + # Then we can't do async updates, so we need to block. + async_update = self.config.num_learners > 0 + results = [] + for batch in batches: + results = self.learner_group.update_from_batch( + batch=batch, + timesteps={ + NUM_ENV_STEPS_SAMPLED_LIFETIME: ( + self.metrics.peek(NUM_ENV_STEPS_SAMPLED_LIFETIME) + ), + }, + async_update=async_update, + num_epochs=self.config.num_epochs, + minibatch_size=self.config.minibatch_size, + ) + if not async_update: + results = [results] + + for r in results: + self._counters[NUM_ENV_STEPS_TRAINED] += r[ALL_MODULES].pop( + NUM_ENV_STEPS_TRAINED + ) + self._counters[NUM_AGENT_STEPS_TRAINED] += r[ALL_MODULES].pop( + NUM_MODULE_STEPS_TRAINED + ) + + self._counters.update(self.learner_group.get_stats()) + # If there are results, reduce-mean over each individual value and return. + if results: + return tree.map_structure(lambda *x: np.mean(x), *results) + + # Nothing on the queue -> Don't send requests to learner group + # or no results ready (from previous `self.learner_group.update_from_batch()` + # calls) for reducing. + return {} + + @OldAPIStack + def _place_processed_samples_on_learner_thread_queue(self) -> None: + """Place processed samples on the learner queue for training.""" + for i, batch in enumerate(self.data_to_place_on_learner): + try: + self._learner_thread.inqueue.put( + batch, + # Setting block = True for the very last item in our list prevents + # the learner thread, this main thread, and the GPU loader threads + # from thrashing when there are more samples than the learner can + # reasonably process. 
+ # see https://github.com/ray-project/ray/pull/26581#issuecomment-1187877674 # noqa + block=i == len(self.data_to_place_on_learner) - 1, + ) + self._counters["num_samples_added_to_queue"] += ( + batch.agent_steps() + if self.config.count_steps_by == "agent_steps" + else batch.count + ) + except queue.Full: + self._counters["num_times_learner_queue_full"] += 1 + + self.data_to_place_on_learner.clear() + + @OldAPIStack + def _process_trained_results(self) -> ResultDict: + """Process training results that are outputed by the learner thread. + + Returns: + Aggregated results from the learner thread after an update is completed. + + """ + # Get learner outputs/stats from output queue. + num_env_steps_trained = 0 + num_agent_steps_trained = 0 + learner_infos = [] + # Loop through output queue and update our counts. + for _ in range(self._learner_thread.outqueue.qsize()): + ( + env_steps, + agent_steps, + learner_results, + ) = self._learner_thread.outqueue.get(timeout=0.001) + num_env_steps_trained += env_steps + num_agent_steps_trained += agent_steps + if learner_results: + learner_infos.append(learner_results) + # Nothing new happened since last time, use the same learner stats. + if not learner_infos: + final_learner_info = copy.deepcopy(self._learner_thread.learner_info) + # Accumulate learner stats using the `LearnerInfoBuilder` utility. + else: + builder = LearnerInfoBuilder() + for info in learner_infos: + builder.add_learn_on_batch_results_multi_agent(info) + final_learner_info = builder.finalize() + + # Update the steps trained counters. + self._counters[NUM_ENV_STEPS_TRAINED] += num_env_steps_trained + self._counters[NUM_AGENT_STEPS_TRAINED] += num_agent_steps_trained + + return final_learner_info + + @OldAPIStack + def _update_workers_old_api_stack( + self, + workers_that_need_updates: Set[int], + policy_ids: Optional[List[PolicyID]] = None, + ) -> None: + """Updates all RolloutWorkers that require updating. 
+ + Updates only if NUM_TRAINING_STEP_CALLS_SINCE_LAST_SYNCH_WORKER_WEIGHTS has been + reached and the worker has sent samples in this iteration. Also only updates + those policies, whose IDs are given via `policies` (if None, update all + policies). + + Args: + workers_that_need_updates: Set of worker IDs that need to be updated. + policy_ids: Optional list of Policy IDs to update. If None, will update all + policies on the to-be-updated workers. + """ + # Update global vars of the local worker. + if self.config.policy_states_are_swappable: + self.env_runner.lock() + global_vars = { + "timestep": self._counters[NUM_AGENT_STEPS_TRAINED], + "num_grad_updates_per_policy": { + pid: self.env_runner.policy_map[pid].num_grad_updates + for pid in policy_ids or [] + }, + } + self.env_runner.set_global_vars(global_vars, policy_ids=policy_ids) + if self.config.policy_states_are_swappable: + self.env_runner.unlock() + + # Only need to update workers if there are remote workers. + self._counters[NUM_TRAINING_STEP_CALLS_SINCE_LAST_SYNCH_WORKER_WEIGHTS] += 1 + if ( + self.env_runner_group.num_remote_workers() > 0 + and self._counters[NUM_TRAINING_STEP_CALLS_SINCE_LAST_SYNCH_WORKER_WEIGHTS] + >= self.config.broadcast_interval + and workers_that_need_updates + ): + if self.config.policy_states_are_swappable: + self.env_runner.lock() + weights = self.env_runner.get_weights(policy_ids) + if self.config.policy_states_are_swappable: + self.env_runner.unlock() + weights_ref = ray.put(weights) + + self._learner_thread.policy_ids_updated.clear() + self._counters[NUM_TRAINING_STEP_CALLS_SINCE_LAST_SYNCH_WORKER_WEIGHTS] = 0 + self._counters[NUM_SYNCH_WORKER_WEIGHTS] += 1 + self.env_runner_group.foreach_env_runner( + func=lambda w: w.set_weights(ray.get(weights_ref), global_vars), + local_env_runner=False, + remote_worker_ids=list(workers_that_need_updates), + timeout_seconds=0, # Don't wait for the workers to finish. 
+ ) + + @override(Algorithm) + def _compile_iteration_results_old_api_stack(self, *args, **kwargs): + result = super()._compile_iteration_results_old_api_stack(*args, **kwargs) + if not self.config.enable_rl_module_and_learner: + result = self._learner_thread.add_learner_metrics( + result, overwrite_learner_info=False + ) + return result + + +Impala = IMPALA + + +@OldAPIStack +def make_learner_thread(local_worker, config): + if not config["simple_optimizer"]: + logger.info( + "Enabling multi-GPU mode, {} GPUs, {} parallel tower-stacks".format( + config["num_gpus"], config["num_multi_gpu_tower_stacks"] + ) + ) + num_stacks = config["num_multi_gpu_tower_stacks"] + buffer_size = config["minibatch_buffer_size"] + if num_stacks < buffer_size: + logger.warning( + "In multi-GPU mode you should have at least as many " + "multi-GPU tower stacks (to load data into on one device) as " + "you have stack-index slots in the buffer! You have " + f"configured {num_stacks} stacks and a buffer of size " + f"{buffer_size}. Setting " + f"`minibatch_buffer_size={num_stacks}`." 
+ ) + config["minibatch_buffer_size"] = num_stacks + + learner_thread = MultiGPULearnerThread( + local_worker, + num_gpus=config["num_gpus"], + lr=config["lr"], + train_batch_size=config["train_batch_size"], + num_multi_gpu_tower_stacks=config["num_multi_gpu_tower_stacks"], + num_sgd_iter=config["num_epochs"], + learner_queue_size=config["learner_queue_size"], + learner_queue_timeout=config["learner_queue_timeout"], + num_data_load_threads=config["num_gpu_loader_threads"], + ) + else: + learner_thread = LearnerThread( + local_worker, + minibatch_buffer_size=config["minibatch_buffer_size"], + num_sgd_iter=config["num_epochs"], + learner_queue_size=config["learner_queue_size"], + learner_queue_timeout=config["learner_queue_timeout"], + ) + return learner_thread diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/impala_learner.py b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/impala_learner.py new file mode 100644 index 0000000000000000000000000000000000000000..48e04636d003c9d92ed664f901bc9d692308aa33 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/impala_learner.py @@ -0,0 +1,211 @@ +from collections import deque +import threading +import time +from typing import Any, Dict, Union + +import ray +from ray.rllib.algorithms.appo.utils import CircularBuffer +from ray.rllib.algorithms.impala.impala import LEARNER_RESULTS_CURR_ENTROPY_COEFF_KEY +from ray.rllib.core.learner.learner import Learner +from ray.rllib.core.rl_module.apis import ValueFunctionAPI +from ray.rllib.utils.annotations import ( + override, + OverrideToImplementCustomLogic_CallToSuperRecommended, +) +from ray.rllib.utils.framework import try_import_torch +from ray.rllib.utils.lambda_defaultdict import LambdaDefaultDict +from ray.rllib.utils.metrics import ( + ALL_MODULES, + NUM_ENV_STEPS_SAMPLED_LIFETIME, +) +from ray.rllib.utils.metrics.metrics_logger import MetricsLogger +from ray.rllib.utils.schedules.scheduler import 
Scheduler +from ray.rllib.utils.typing import ModuleID, ResultDict + +torch, _ = try_import_torch() + +GPU_LOADER_QUEUE_WAIT_TIMER = "gpu_loader_queue_wait_timer" +GPU_LOADER_LOAD_TO_GPU_TIMER = "gpu_loader_load_to_gpu_timer" +LEARNER_THREAD_IN_QUEUE_WAIT_TIMER = "learner_thread_in_queue_wait_timer" +LEARNER_THREAD_ENV_STEPS_DROPPED = "learner_thread_env_steps_dropped" +LEARNER_THREAD_UPDATE_TIMER = "learner_thread_update_timer" +RAY_GET_EPISODES_TIMER = "ray_get_episodes_timer" + +QUEUE_SIZE_GPU_LOADER_QUEUE = "queue_size_gpu_loader_queue" +QUEUE_SIZE_LEARNER_THREAD_QUEUE = "queue_size_learner_thread_queue" +QUEUE_SIZE_RESULTS_QUEUE = "queue_size_results_queue" + +_CURRENT_GLOBAL_TIMESTEPS = None + + +class IMPALALearner(Learner): + @override(Learner) + def build(self) -> None: + super().build() + + # TODO (sven): We replace the dummy RLock here for APPO/IMPALA, b/c these algos + # require this for thread safety reasons. + # An RLock breaks our current OfflineData and OfflinePreLearner logic, in which + # the Learner (which contains a MetricsLogger) is serialized and deserialized. + # We will have to fix this offline RL logic first, then can remove this hack + # here and return to always using the RLock. + self.metrics._threading_lock = threading.RLock() + + # Dict mapping module IDs to the respective entropy Scheduler instance. + self.entropy_coeff_schedulers_per_module: Dict[ + ModuleID, Scheduler + ] = LambdaDefaultDict( + lambda module_id: Scheduler( + fixed_value_or_schedule=( + self.config.get_config_for_module(module_id).entropy_coeff + ), + framework=self.framework, + device=self._device, + ) + ) + + # Default is to have a learner thread. + if not hasattr(self, "_learner_thread_in_queue"): + self._learner_thread_in_queue = deque(maxlen=self.config.learner_queue_size) + + # Create and start the Learner thread. 
+ self._learner_thread = _LearnerThread( + update_method=self._update_from_batch_or_episodes, + in_queue=self._learner_thread_in_queue, + metrics_logger=self.metrics, + ) + self._learner_thread.start() + + @override(Learner) + def update_from_batch( + self, + batch: Any, + *, + timesteps: Dict[str, Any], + **kwargs, + ) -> ResultDict: + global _CURRENT_GLOBAL_TIMESTEPS + _CURRENT_GLOBAL_TIMESTEPS = timesteps or {} + + if isinstance(batch, ray.ObjectRef): + batch = ray.get(batch) + + self.before_gradient_based_update(timesteps=timesteps or {}) + + if isinstance(self._learner_thread_in_queue, CircularBuffer): + ts_dropped = self._learner_thread_in_queue.add(batch) + self.metrics.log_value( + (ALL_MODULES, LEARNER_THREAD_ENV_STEPS_DROPPED), + ts_dropped, + reduce="sum", + ) + # Enqueue to Learner thread's in-queue. + else: + _LearnerThread.enqueue(self._learner_thread_in_queue, batch, self.metrics) + + return self.metrics.reduce() + + @OverrideToImplementCustomLogic_CallToSuperRecommended + def before_gradient_based_update(self, *, timesteps: Dict[str, Any]) -> None: + super().before_gradient_based_update(timesteps=timesteps) + + for module_id in self.module.keys(): + # Update entropy coefficient via our Scheduler. 
+ new_entropy_coeff = self.entropy_coeff_schedulers_per_module[ + module_id + ].update(timestep=timesteps.get(NUM_ENV_STEPS_SAMPLED_LIFETIME, 0)) + self.metrics.log_value( + (module_id, LEARNER_RESULTS_CURR_ENTROPY_COEFF_KEY), + new_entropy_coeff, + window=1, + ) + + @override(Learner) + def remove_module(self, module_id: str): + super().remove_module(module_id) + self.entropy_coeff_schedulers_per_module.pop(module_id) + + @classmethod + @override(Learner) + def rl_module_required_apis(cls) -> list[type]: + # In order for a PPOLearner to update an RLModule, it must implement the + # following APIs: + return [ValueFunctionAPI] + + +ImpalaLearner = IMPALALearner + + +class _LearnerThread(threading.Thread): + def __init__( + self, + *, + update_method, + in_queue: deque, + metrics_logger, + ): + super().__init__() + self.daemon = True + self.metrics: MetricsLogger = metrics_logger + self.stopped = False + + self._update_method = update_method + self._in_queue: Union[deque, CircularBuffer] = in_queue + + def run(self) -> None: + while not self.stopped: + self.step() + + def step(self): + global _CURRENT_GLOBAL_TIMESTEPS + + # Get a new batch from the GPU-data (deque.pop -> newest item first). + with self.metrics.log_time((ALL_MODULES, LEARNER_THREAD_IN_QUEUE_WAIT_TIMER)): + # Get a new batch from the GPU-data (learner queue OR circular buffer). + if isinstance(self._in_queue, CircularBuffer): + ma_batch_on_gpu = self._in_queue.sample() + else: + # Queue is empty: Sleep a tiny bit to avoid CPU-thrashing. + if not self._in_queue: + time.sleep(0.001) + return + # Consume from the left (oldest batches first). + # If we consumed from the right, we would run into the danger of + # learning from newer batches (left side) most times, BUT sometimes + # grabbing older batches (right area of deque). + ma_batch_on_gpu = self._in_queue.popleft() + + # Call the update method on the batch. 
+ with self.metrics.log_time((ALL_MODULES, LEARNER_THREAD_UPDATE_TIMER)): + # TODO (sven): For multi-agent AND SGD iter > 1, we need to make sure + # this thread has the information about the min minibatches necessary + # (due to different agents taking different steps in the env, e.g. + # MA-CartPole). + self._update_method( + batch=ma_batch_on_gpu, + timesteps=_CURRENT_GLOBAL_TIMESTEPS, + ) + + @staticmethod + def enqueue(learner_queue: deque, batch, metrics): + # Right-append to learner queue (a deque). If full, drops the leftmost + # (oldest) item in the deque. + # Note that we consume from the left (oldest first), which is why the queue size + # should probably always be small'ish (<< 10), otherwise we run into the danger + # of training with very old samples. + # If we consumed from the right, we would run into the danger of learning + # from newer batches (left side) most times, BUT sometimes grabbing a + # really old batches (right area of deque). + if len(learner_queue) == learner_queue.maxlen: + metrics.log_value( + (ALL_MODULES, LEARNER_THREAD_ENV_STEPS_DROPPED), + learner_queue.popleft().env_steps(), + reduce="sum", + ) + learner_queue.append(batch) + + # Log current queue size. + metrics.log_value( + (ALL_MODULES, QUEUE_SIZE_LEARNER_THREAD_QUEUE), + len(learner_queue), + ) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/impala_tf_policy.py b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/impala_tf_policy.py new file mode 100644 index 0000000000000000000000000000000000000000..d06d0065b124c7e45b3b69194d34b23c9eddc66f --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/impala_tf_policy.py @@ -0,0 +1,443 @@ +"""Adapted from A3CTFPolicy to add V-trace. 
+ +Keep in sync with changes to A3CTFPolicy and VtraceSurrogatePolicy.""" + +import numpy as np +import logging +import gymnasium as gym +from typing import Dict, List, Optional, Type, Union + +from ray.rllib.algorithms.impala import vtrace_tf as vtrace +from ray.rllib.evaluation.postprocessing import compute_bootstrap_value +from ray.rllib.models.modelv2 import ModelV2 +from ray.rllib.models.tf.tf_action_dist import Categorical, TFActionDistribution +from ray.rllib.policy.dynamic_tf_policy_v2 import DynamicTFPolicyV2 +from ray.rllib.policy.eager_tf_policy_v2 import EagerTFPolicyV2 +from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.policy.tf_mixins import LearningRateSchedule, EntropyCoeffSchedule +from ray.rllib.utils import force_list +from ray.rllib.utils.annotations import override +from ray.rllib.utils.framework import try_import_tf +from ray.rllib.utils.tf_utils import explained_variance +from ray.rllib.policy.tf_mixins import GradStatsMixin, ValueNetworkMixin +from ray.rllib.utils.typing import ( + LocalOptimizer, + ModelGradients, + TensorType, + TFPolicyV2Type, +) + +tf1, tf, tfv = try_import_tf() + +logger = logging.getLogger(__name__) + + +class VTraceLoss: + def __init__( + self, + actions, + actions_logp, + actions_entropy, + dones, + behaviour_action_logp, + behaviour_logits, + target_logits, + discount, + rewards, + values, + bootstrap_value, + dist_class, + model, + valid_mask, + config, + vf_loss_coeff=0.5, + entropy_coeff=0.01, + clip_rho_threshold=1.0, + clip_pg_rho_threshold=1.0, + ): + """Policy gradient loss with vtrace importance weighting. + + VTraceLoss takes tensors of shape [T, B, ...], where `B` is the + batch_size. The reason we need to know `B` is for V-trace to properly + handle episode cut boundaries. + + Args: + actions: An int|float32 tensor of shape [T, B, ACTION_SPACE]. + actions_logp: A float32 tensor of shape [T, B]. + actions_entropy: A float32 tensor of shape [T, B]. + dones: A bool tensor of shape [T, B]. 
+ behaviour_action_logp: Tensor of shape [T, B]. + behaviour_logits: A list with length of ACTION_SPACE of float32 + tensors of shapes + [T, B, ACTION_SPACE[0]], + ..., + [T, B, ACTION_SPACE[-1]] + target_logits: A list with length of ACTION_SPACE of float32 + tensors of shapes + [T, B, ACTION_SPACE[0]], + ..., + [T, B, ACTION_SPACE[-1]] + discount: A float32 scalar. + rewards: A float32 tensor of shape [T, B]. + values: A float32 tensor of shape [T, B]. + bootstrap_value: A float32 tensor of shape [B]. + dist_class: action distribution class for logits. + valid_mask: A bool tensor of valid RNN input elements (#2992). + config: Algorithm config dict. + """ + + # Compute vtrace on the CPU for better performance. + with tf.device("/cpu:0"): + self.vtrace_returns = vtrace.multi_from_logits( + behaviour_action_log_probs=behaviour_action_logp, + behaviour_policy_logits=behaviour_logits, + target_policy_logits=target_logits, + actions=tf.unstack(actions, axis=2), + discounts=tf.cast(~tf.cast(dones, tf.bool), tf.float32) * discount, + rewards=rewards, + values=values, + bootstrap_value=bootstrap_value, + dist_class=dist_class, + model=model, + clip_rho_threshold=tf.cast(clip_rho_threshold, tf.float32), + clip_pg_rho_threshold=tf.cast(clip_pg_rho_threshold, tf.float32), + ) + self.value_targets = self.vtrace_returns.vs + + # The policy gradients loss. + masked_pi_loss = tf.boolean_mask( + actions_logp * self.vtrace_returns.pg_advantages, valid_mask + ) + self.pi_loss = -tf.reduce_sum(masked_pi_loss) + self.mean_pi_loss = -tf.reduce_mean(masked_pi_loss) + + # The baseline loss. + delta = tf.boolean_mask(values - self.vtrace_returns.vs, valid_mask) + delta_squarred = tf.math.square(delta) + self.vf_loss = 0.5 * tf.reduce_sum(delta_squarred) + self.mean_vf_loss = 0.5 * tf.reduce_mean(delta_squarred) + + # The entropy loss. 
+ masked_entropy = tf.boolean_mask(actions_entropy, valid_mask) + self.entropy = tf.reduce_sum(masked_entropy) + self.mean_entropy = tf.reduce_mean(masked_entropy) + + # The summed weighted loss. + self.total_loss = self.pi_loss - self.entropy * entropy_coeff + + # Optional vf loss (or in a separate term due to separate + # optimizers/networks). + self.loss_wo_vf = self.total_loss + if not config["_separate_vf_optimizer"]: + self.total_loss += self.vf_loss * vf_loss_coeff + + +def _make_time_major(policy, seq_lens, tensor): + """Swaps batch and trajectory axis. + + Args: + policy: Policy reference + seq_lens: Sequence lengths if recurrent or None + tensor: A tensor or list of tensors to reshape. + trajectory item. + + Returns: + res: A tensor with swapped axes or a list of tensors with + swapped axes. + """ + if isinstance(tensor, list): + return [_make_time_major(policy, seq_lens, t) for t in tensor] + + if policy.is_recurrent(): + B = tf.shape(seq_lens)[0] + T = tf.shape(tensor)[0] // B + else: + # Important: chop the tensor into batches at known episode cut + # boundaries. + # TODO: (sven) this is kind of a hack and won't work for + # batch_mode=complete_episodes. + T = policy.config["rollout_fragment_length"] + B = tf.shape(tensor)[0] // T + rs = tf.reshape(tensor, tf.concat([[B, T], tf.shape(tensor)[1:]], axis=0)) + + # swap B and T axes + res = tf.transpose(rs, [1, 0] + list(range(2, 1 + int(tf.shape(tensor).shape[0])))) + + return res + + +class VTraceClipGradients: + """VTrace version of gradient computation logic.""" + + def __init__(self): + """No special initialization required.""" + pass + + def compute_gradients_fn( + self, optimizer: LocalOptimizer, loss: TensorType + ) -> ModelGradients: + # Supporting more than one loss/optimizer. 
+ trainable_variables = self.model.trainable_variables() + if self.config["_tf_policy_handles_more_than_one_loss"]: + optimizers = force_list(optimizer) + losses = force_list(loss) + assert len(optimizers) == len(losses) + clipped_grads_and_vars = [] + for optim, loss_ in zip(optimizers, losses): + grads_and_vars = optim.compute_gradients(loss_, trainable_variables) + clipped_g_and_v = [] + for g, v in grads_and_vars: + if g is not None: + clipped_g, _ = tf.clip_by_global_norm( + [g], self.config["grad_clip"] + ) + clipped_g_and_v.append((clipped_g[0], v)) + clipped_grads_and_vars.append(clipped_g_and_v) + + self.grads = [g for g_and_v in clipped_grads_and_vars for (g, v) in g_and_v] + # Only one optimizer and and loss term. + else: + grads_and_vars = optimizer.compute_gradients( + loss, self.model.trainable_variables() + ) + grads = [g for (g, v) in grads_and_vars] + self.grads, _ = tf.clip_by_global_norm(grads, self.config["grad_clip"]) + clipped_grads_and_vars = list(zip(self.grads, trainable_variables)) + + return clipped_grads_and_vars + + +class VTraceOptimizer: + """Optimizer function for VTrace policies.""" + + def __init__(self): + pass + + # TODO: maybe standardize this function, so the choice of optimizers are more + # predictable for common algorithms. + def optimizer( + self, + ) -> Union["tf.keras.optimizers.Optimizer", List["tf.keras.optimizers.Optimizer"]]: + config = self.config + if config["opt_type"] == "adam": + if config["framework"] == "tf2": + optim = tf.keras.optimizers.Adam(self.cur_lr) + if config["_separate_vf_optimizer"]: + return optim, tf.keras.optimizers.Adam(config["_lr_vf"]) + else: + optim = tf1.train.AdamOptimizer(self.cur_lr) + if config["_separate_vf_optimizer"]: + return optim, tf1.train.AdamOptimizer(config["_lr_vf"]) + else: + if config["_separate_vf_optimizer"]: + raise ValueError( + "RMSProp optimizer not supported for separate" + "vf- and policy losses yet! 
# We need this builder function because we want to share the same
# custom logics between TF1 dynamic and TF2 eager policies.
def get_impala_tf_policy(name: str, base: TFPolicyV2Type) -> TFPolicyV2Type:
    """Construct an ImpalaTFPolicy inheriting either dynamic or eager base policies.

    Args:
        name: The `__name__` / `__qualname__` to assign to the freshly built
            policy class (e.g. "ImpalaTF1Policy").
        base: Base class for this policy. DynamicTFPolicyV2 or EagerTFPolicyV2.

    Returns:
        A TF Policy to be used with Impala.
    """
    # VTrace mixins are placed in front of more general mixins to make sure
    # their functions like optimizer() overrides all the other implementations
    # (e.g., LearningRateSchedule.optimizer())
    class ImpalaTFPolicy(
        VTraceClipGradients,
        VTraceOptimizer,
        LearningRateSchedule,
        EntropyCoeffSchedule,
        GradStatsMixin,
        ValueNetworkMixin,
        base,
    ):
        def __init__(
            self,
            observation_space,
            action_space,
            config,
            existing_model=None,
            existing_inputs=None,
        ):
            # First thing first, enable eager execution if necessary.
            base.enable_eager_execution_if_necessary()

            # Initialize base class.
            base.__init__(
                self,
                observation_space,
                action_space,
                config,
                existing_inputs=existing_inputs,
                existing_model=existing_model,
            )
            ValueNetworkMixin.__init__(self, config)

            # If Learner API is used, we don't need any loss-specific mixins.
            # However, we also would like to avoid creating special Policy-subclasses
            # for this as the entire Policy concept will soon not be used anymore with
            # the new Learner- and RLModule APIs.
            # NOTE: Mixin init order matters: schedules must be set up before
            # `maybe_initialize_optimizer_and_loss()` below reads `self.cur_lr` /
            # `self.entropy_coeff`.
            GradStatsMixin.__init__(self)
            VTraceClipGradients.__init__(self)
            VTraceOptimizer.__init__(self)
            LearningRateSchedule.__init__(self, config["lr"], config["lr_schedule"])
            EntropyCoeffSchedule.__init__(
                self, config["entropy_coeff"], config["entropy_coeff_schedule"]
            )

            # Note: this is a bit ugly, but loss and optimizer initialization must
            # happen after all the MixIns are initialized.
            self.maybe_initialize_optimizer_and_loss()

        @override(base)
        def loss(
            self,
            model: Union[ModelV2, "tf.keras.Model"],
            dist_class: Type[TFActionDistribution],
            train_batch: SampleBatch,
        ) -> Union[TensorType, List[TensorType]]:
            """Builds the IMPALA V-trace loss for one train batch.

            Returns either the total loss, or a (policy_loss, vf_loss) tuple
            when a separate value-function optimizer is configured.
            """
            model_out, _ = model(train_batch)
            action_dist = dist_class(model_out, model)

            # Determine how the flat action-dist inputs split per sub-action.
            if isinstance(self.action_space, gym.spaces.Discrete):
                is_multidiscrete = False
                output_hidden_shape = [self.action_space.n]
            elif isinstance(self.action_space, gym.spaces.MultiDiscrete):
                is_multidiscrete = True
                output_hidden_shape = self.action_space.nvec.astype(np.int32)
            else:
                is_multidiscrete = False
                output_hidden_shape = 1

            def make_time_major(*args, **kw):
                # Fold flat [B*T] tensors into time-major [T, B] layout.
                return _make_time_major(
                    self, train_batch.get(SampleBatch.SEQ_LENS), *args, **kw
                )

            actions = train_batch[SampleBatch.ACTIONS]
            dones = train_batch[SampleBatch.TERMINATEDS]
            rewards = train_batch[SampleBatch.REWARDS]
            behaviour_action_logp = train_batch[SampleBatch.ACTION_LOGP]
            behaviour_logits = train_batch[SampleBatch.ACTION_DIST_INPUTS]
            unpacked_behaviour_logits = tf.split(
                behaviour_logits, output_hidden_shape, axis=1
            )
            unpacked_outputs = tf.split(model_out, output_hidden_shape, axis=1)
            values = model.value_function()
            values_time_major = make_time_major(values)
            bootstrap_values_time_major = make_time_major(
                train_batch[SampleBatch.VALUES_BOOTSTRAPPED]
            )
            # Last timestep's bootstrapped value estimate -> [B].
            bootstrap_value = bootstrap_values_time_major[-1]

            if self.is_recurrent():
                # Mask out padded timesteps of the RNN sequences.
                max_seq_len = tf.reduce_max(train_batch[SampleBatch.SEQ_LENS])
                mask = tf.sequence_mask(train_batch[SampleBatch.SEQ_LENS], max_seq_len)
                mask = tf.reshape(mask, [-1])
            else:
                mask = tf.ones_like(rewards)

            # Prepare actions for loss
            loss_actions = (
                actions if is_multidiscrete else tf.expand_dims(actions, axis=1)
            )

            # Inputs are reshaped from [B * T] => [(T|T-1), B] for V-trace calc.
            self.vtrace_loss = VTraceLoss(
                actions=make_time_major(loss_actions),
                actions_logp=make_time_major(action_dist.logp(actions)),
                actions_entropy=make_time_major(action_dist.multi_entropy()),
                dones=make_time_major(dones),
                behaviour_action_logp=make_time_major(behaviour_action_logp),
                behaviour_logits=make_time_major(unpacked_behaviour_logits),
                target_logits=make_time_major(unpacked_outputs),
                discount=self.config["gamma"],
                rewards=make_time_major(rewards),
                values=values_time_major,
                bootstrap_value=bootstrap_value,
                dist_class=Categorical if is_multidiscrete else dist_class,
                model=model,
                valid_mask=make_time_major(mask),
                config=self.config,
                vf_loss_coeff=self.config["vf_loss_coeff"],
                entropy_coeff=self.entropy_coeff,
                clip_rho_threshold=self.config["vtrace_clip_rho_threshold"],
                clip_pg_rho_threshold=self.config["vtrace_clip_pg_rho_threshold"],
            )

            if self.config.get("_separate_vf_optimizer"):
                return self.vtrace_loss.loss_wo_vf, self.vtrace_loss.vf_loss
            else:
                return self.vtrace_loss.total_loss

        @override(base)
        def stats_fn(self, train_batch: SampleBatch) -> Dict[str, TensorType]:
            """Returns learner stats computed from the last `loss()` call."""
            values_batched = _make_time_major(
                self,
                train_batch.get(SampleBatch.SEQ_LENS),
                self.model.value_function(),
            )

            return {
                "cur_lr": tf.cast(self.cur_lr, tf.float64),
                "policy_loss": self.vtrace_loss.mean_pi_loss,
                "entropy": self.vtrace_loss.mean_entropy,
                "entropy_coeff": tf.cast(self.entropy_coeff, tf.float64),
                "var_gnorm": tf.linalg.global_norm(self.model.trainable_variables()),
                "vf_loss": self.vtrace_loss.mean_vf_loss,
                "vf_explained_var": explained_variance(
                    tf.reshape(self.vtrace_loss.value_targets, [-1]),
                    tf.reshape(values_batched, [-1]),
                ),
            }

        @override(base)
        def postprocess_trajectory(
            self,
            sample_batch: SampleBatch,
            other_agent_batches: Optional[SampleBatch] = None,
            episode=None,
        ):
            """Adds bootstrapped value estimates needed by the V-trace loss."""
            # Call super's postprocess_trajectory first.
            # sample_batch = super().postprocess_trajectory(
            #     sample_batch, other_agent_batches, episode
            # )

            if self.config["vtrace"]:
                # Add the SampleBatch.VALUES_BOOTSTRAPPED column, which we'll need
                # inside the loss for vtrace calculations.
                sample_batch = compute_bootstrap_value(sample_batch, self)

            return sample_batch

        @override(base)
        def get_batch_divisibility_req(self) -> int:
            # Batches must be divisible by the fragment length so they can be
            # folded into [T, B] without remainder.
            return self.config["rollout_fragment_length"]

    ImpalaTFPolicy.__name__ = name
    ImpalaTFPolicy.__qualname__ = name

    return ImpalaTFPolicy


ImpalaTF1Policy = get_impala_tf_policy("ImpalaTF1Policy", DynamicTFPolicyV2)
ImpalaTF2Policy = get_impala_tf_policy("ImpalaTF2Policy", EagerTFPolicyV2)
class VTraceLoss:
    # Assembles the IMPALA V-trace actor-critic loss (policy-gradient, value,
    # and entropy terms) from time-major [T, B] inputs.
    def __init__(
        self,
        actions,
        actions_logp,
        actions_entropy,
        dones,
        behaviour_action_logp,
        behaviour_logits,
        target_logits,
        discount,
        rewards,
        values,
        bootstrap_value,
        dist_class,
        model,
        valid_mask,
        config,
        vf_loss_coeff=0.5,
        entropy_coeff=0.01,
        clip_rho_threshold=1.0,
        clip_pg_rho_threshold=1.0,
    ):
        """Policy gradient loss with vtrace importance weighting.

        VTraceLoss takes tensors of shape [T, B, ...], where `B` is the
        batch_size. The reason we need to know `B` is for V-trace to properly
        handle episode cut boundaries.

        Args:
            actions: An int|float32 tensor of shape [T, B, ACTION_SPACE].
            actions_logp: A float32 tensor of shape [T, B].
            actions_entropy: A float32 tensor of shape [T, B].
            dones: A bool tensor of shape [T, B].
            behaviour_action_logp: Tensor of shape [T, B].
            behaviour_logits: A list with length of ACTION_SPACE of float32
                tensors of shapes
                [T, B, ACTION_SPACE[0]],
                ...,
                [T, B, ACTION_SPACE[-1]]
            target_logits: A list with length of ACTION_SPACE of float32
                tensors of shapes
                [T, B, ACTION_SPACE[0]],
                ...,
                [T, B, ACTION_SPACE[-1]]
            discount: A float32 scalar.
            rewards: A float32 tensor of shape [T, B].
            values: A float32 tensor of shape [T, B].
            bootstrap_value: A float32 tensor of shape [B].
            dist_class: action distribution class for logits.
            valid_mask: A bool tensor of valid RNN input elements (#2992).
            config: Algorithm config dict.
        """
        # Imported lazily to avoid a circular import at module load time.
        import ray.rllib.algorithms.impala.vtrace_torch as vtrace

        if valid_mask is None:
            valid_mask = torch.ones_like(actions_logp)

        # Remember the incoming device so V-trace results can be moved back
        # onto it below (`vtrace.multi_from_logits` handles device placement
        # of its own intermediates).
        device = behaviour_action_logp[0].device
        self.vtrace_returns = vtrace.multi_from_logits(
            behaviour_action_log_probs=behaviour_action_logp,
            behaviour_policy_logits=behaviour_logits,
            target_policy_logits=target_logits,
            actions=torch.unbind(actions, dim=2),
            discounts=(1.0 - dones.float()) * discount,
            rewards=rewards,
            values=values,
            bootstrap_value=bootstrap_value,
            dist_class=dist_class,
            model=model,
            clip_rho_threshold=clip_rho_threshold,
            clip_pg_rho_threshold=clip_pg_rho_threshold,
        )
        # Move v-trace results back to GPU for actual loss computing.
        self.value_targets = self.vtrace_returns.vs.to(device)

        # The policy gradients loss.
        self.pi_loss = -torch.sum(
            actions_logp * self.vtrace_returns.pg_advantages.to(device) * valid_mask
        )

        # The baseline loss.
        delta = (values - self.value_targets) * valid_mask
        self.vf_loss = 0.5 * torch.sum(torch.pow(delta, 2.0))

        # The entropy loss.
        self.entropy = torch.sum(actions_entropy * valid_mask)
        self.mean_entropy = self.entropy / torch.sum(valid_mask)

        # The summed weighted loss.
        self.total_loss = self.pi_loss - self.entropy * entropy_coeff

        # Optional vf loss (or in a separate term due to separate
        # optimizers/networks). `loss_wo_vf` intentionally excludes the value
        # term so the caller can feed it to a separate optimizer.
        self.loss_wo_vf = self.total_loss
        if not config["_separate_vf_optimizer"]:
            self.total_loss += self.vf_loss * vf_loss_coeff
+ """ + if isinstance(tensor, (list, tuple)): + return [make_time_major(policy, seq_lens, t) for t in tensor] + + if policy.is_recurrent(): + B = seq_lens.shape[0] + T = tensor.shape[0] // B + else: + # Important: chop the tensor into batches at known episode cut + # boundaries. + # TODO: (sven) this is kind of a hack and won't work for + # batch_mode=complete_episodes. + T = policy.config["rollout_fragment_length"] + B = tensor.shape[0] // T + rs = torch.reshape(tensor, [B, T] + list(tensor.shape[1:])) + + # Swap B and T axes. + res = torch.transpose(rs, 1, 0) + + return res + + +class VTraceOptimizer: + """Optimizer function for VTrace torch policies.""" + + def __init__(self): + pass + + def optimizer( + self, + ) -> Union[List["torch.optim.Optimizer"], "torch.optim.Optimizer"]: + + if self.config["_separate_vf_optimizer"]: + # Figure out, which parameters of the model belong to the value + # function (and which to the policy net). + dummy_batch = self._lazy_tensor_dict( + self._get_dummy_batch_from_view_requirements() + ) + # Zero out all gradients (set to None) + for param in self.model.parameters(): + param.grad = None + # Perform a dummy forward pass (through the policy net, which should be + # separated from the value function in this particular user setup). + out = self.model(dummy_batch) + # Perform a (dummy) backward pass to be able to see, which params have + # gradients and are therefore used for the policy computations (vs vf + # computations). + torch.sum(out[0]).backward() # [0] -> Model returns out and state-outs. + # Collect policy vs value function params separately. 
class VTraceOptimizer:
    """Optimizer function for VTrace torch policies."""

    def __init__(self):
        pass

    def optimizer(
        self,
    ) -> Union[List["torch.optim.Optimizer"], "torch.optim.Optimizer"]:
        # Returns one optimizer for the whole model, or a
        # (policy_optimizer, value_optimizer) pair when
        # `_separate_vf_optimizer` is configured.

        if self.config["_separate_vf_optimizer"]:
            # Figure out, which parameters of the model belong to the value
            # function (and which to the policy net).
            dummy_batch = self._lazy_tensor_dict(
                self._get_dummy_batch_from_view_requirements()
            )
            # Zero out all gradients (set to None)
            for param in self.model.parameters():
                param.grad = None
            # Perform a dummy forward pass (through the policy net, which should be
            # separated from the value function in this particular user setup).
            out = self.model(dummy_batch)
            # Perform a (dummy) backward pass to be able to see, which params have
            # gradients and are therefore used for the policy computations (vs vf
            # computations).
            torch.sum(out[0]).backward()  # [0] -> Model returns out and state-outs.
            # Collect policy vs value function params separately: params that
            # received a gradient from the policy-output backward pass belong
            # to the policy net; the rest are assumed to be value-net params.
            policy_params = []
            value_params = []
            for param in self.model.parameters():
                if param.grad is None:
                    value_params.append(param)
                else:
                    policy_params.append(param)
            if self.config["opt_type"] == "adam":
                return (
                    torch.optim.Adam(params=policy_params, lr=self.cur_lr),
                    torch.optim.Adam(params=value_params, lr=self.cur_lr2),
                )
            else:
                # Only Adam supports the two-optimizer setup.
                raise NotImplementedError

        if self.config["opt_type"] == "adam":
            return torch.optim.Adam(params=self.model.parameters(), lr=self.cur_lr)
        else:
            return torch.optim.RMSprop(
                params=self.model.parameters(),
                lr=self.cur_lr,
                weight_decay=self.config["decay"],
                momentum=self.config["momentum"],
                eps=self.config["epsilon"],
            )


# VTrace mixins are placed in front of more general mixins to make sure
# their functions like optimizer() overrides all the other implementations
# (e.g., LearningRateSchedule.optimizer())
class ImpalaTorchPolicy(
    VTraceOptimizer,
    LearningRateSchedule,
    EntropyCoeffSchedule,
    ValueNetworkMixin,
    TorchPolicyV2,
):
    """PyTorch policy class used with IMPALA."""

    def __init__(self, observation_space, action_space, config):
        # Merge user config on top of the algorithm's defaults.
        config = dict(
            ray.rllib.algorithms.impala.impala.IMPALAConfig().to_dict(), **config
        )
        # This (old API stack) policy is incompatible with the new stack.
        config["enable_rl_module_and_learner"] = False
        config["enable_env_runner_and_connector_v2"] = False

        # If Learner API is used, we don't need any loss-specific mixins.
        # However, we also would like to avoid creating special Policy-subclasses
        # for this as the entire Policy concept will soon not be used anymore with
        # the new Learner- and RLModule APIs.
        VTraceOptimizer.__init__(self)
        # Need to initialize learning rate variable before calling
        # TorchPolicyV2.__init__.
        lr_schedule_additional_args = []
        if config.get("_separate_vf_optimizer"):
            # `_lr_vf` may be a schedule (list of [ts, lr] pairs) or a fixed
            # value; pass (initial_lr, schedule) accordingly.
            lr_schedule_additional_args = (
                [config["_lr_vf"][0][1], config["_lr_vf"]]
                if isinstance(config["_lr_vf"], (list, tuple))
                else [config["_lr_vf"], None]
            )
        LearningRateSchedule.__init__(
            self, config["lr"], config["lr_schedule"], *lr_schedule_additional_args
        )
        EntropyCoeffSchedule.__init__(
            self, config["entropy_coeff"], config["entropy_coeff_schedule"]
        )

        TorchPolicyV2.__init__(
            self,
            observation_space,
            action_space,
            config,
            max_seq_len=config["model"]["max_seq_len"],
        )

        ValueNetworkMixin.__init__(self, config)

        self._initialize_loss_from_dummy_batch()

    @override(TorchPolicyV2)
    def loss(
        self,
        model: ModelV2,
        dist_class: Type[ActionDistribution],
        train_batch: SampleBatch,
    ) -> Union[TensorType, List[TensorType]]:
        """Builds the IMPALA V-trace loss for one train batch."""
        model_out, _ = model(train_batch)
        action_dist = dist_class(model_out, model)

        # Determine how the flat action-dist inputs split per sub-action.
        if isinstance(self.action_space, gym.spaces.Discrete):
            is_multidiscrete = False
            output_hidden_shape = [self.action_space.n]
        elif isinstance(self.action_space, gym.spaces.MultiDiscrete):
            is_multidiscrete = True
            output_hidden_shape = self.action_space.nvec.astype(np.int32)
        else:
            is_multidiscrete = False
            output_hidden_shape = 1

        def _make_time_major(*args, **kw):
            # Fold flat [B*T] tensors into time-major [T, B] layout.
            return make_time_major(
                self, train_batch.get(SampleBatch.SEQ_LENS), *args, **kw
            )

        actions = train_batch[SampleBatch.ACTIONS]
        dones = train_batch[SampleBatch.TERMINATEDS]
        rewards = train_batch[SampleBatch.REWARDS]
        behaviour_action_logp = train_batch[SampleBatch.ACTION_LOGP]
        behaviour_logits = train_batch[SampleBatch.ACTION_DIST_INPUTS]
        if isinstance(output_hidden_shape, (list, tuple, np.ndarray)):
            unpacked_behaviour_logits = torch.split(
                behaviour_logits, list(output_hidden_shape), dim=1
            )
            unpacked_outputs = torch.split(model_out, list(output_hidden_shape), dim=1)
        else:
            unpacked_behaviour_logits = torch.chunk(
                behaviour_logits, output_hidden_shape, dim=1
            )
            unpacked_outputs = torch.chunk(model_out, output_hidden_shape, dim=1)
        values = model.value_function()
        values_time_major = _make_time_major(values)
        bootstrap_values_time_major = _make_time_major(
            train_batch[SampleBatch.VALUES_BOOTSTRAPPED]
        )
        # Last timestep's bootstrapped value estimate -> [B].
        bootstrap_value = bootstrap_values_time_major[-1]

        if self.is_recurrent():
            # Mask out padded timesteps of the RNN sequences.
            max_seq_len = torch.max(train_batch[SampleBatch.SEQ_LENS])
            mask_orig = sequence_mask(train_batch[SampleBatch.SEQ_LENS], max_seq_len)
            mask = torch.reshape(mask_orig, [-1])
        else:
            mask = torch.ones_like(rewards)

        # Prepare actions for loss.
        loss_actions = actions if is_multidiscrete else torch.unsqueeze(actions, dim=1)

        # Inputs are reshaped from [B * T] => [(T|T-1), B] for V-trace calc.
        loss = VTraceLoss(
            actions=_make_time_major(loss_actions),
            actions_logp=_make_time_major(action_dist.logp(actions)),
            actions_entropy=_make_time_major(action_dist.entropy()),
            dones=_make_time_major(dones),
            behaviour_action_logp=_make_time_major(behaviour_action_logp),
            behaviour_logits=_make_time_major(unpacked_behaviour_logits),
            target_logits=_make_time_major(unpacked_outputs),
            discount=self.config["gamma"],
            rewards=_make_time_major(rewards),
            values=values_time_major,
            bootstrap_value=bootstrap_value,
            dist_class=TorchCategorical if is_multidiscrete else dist_class,
            model=model,
            valid_mask=_make_time_major(mask),
            config=self.config,
            vf_loss_coeff=self.config["vf_loss_coeff"],
            entropy_coeff=self.entropy_coeff,
            clip_rho_threshold=self.config["vtrace_clip_rho_threshold"],
            clip_pg_rho_threshold=self.config["vtrace_clip_pg_rho_threshold"],
        )

        # Store values for stats function in model (tower), such that for
        # multi-GPU, we do not override them during the parallel loss phase.
        model.tower_stats["pi_loss"] = loss.pi_loss
        model.tower_stats["vf_loss"] = loss.vf_loss
        model.tower_stats["entropy"] = loss.entropy
        model.tower_stats["mean_entropy"] = loss.mean_entropy
        model.tower_stats["total_loss"] = loss.total_loss

        values_batched = make_time_major(
            self,
            train_batch.get(SampleBatch.SEQ_LENS),
            values,
        )
        model.tower_stats["vf_explained_var"] = explained_variance(
            torch.reshape(loss.value_targets, [-1]), torch.reshape(values_batched, [-1])
        )

        if self.config.get("_separate_vf_optimizer"):
            return loss.loss_wo_vf, loss.vf_loss
        else:
            return loss.total_loss

    @override(TorchPolicyV2)
    def stats_fn(self, train_batch: SampleBatch) -> Dict[str, TensorType]:
        """Returns learner stats averaged over all model towers."""
        return convert_to_numpy(
            {
                "cur_lr": self.cur_lr,
                "total_loss": torch.mean(
                    torch.stack(self.get_tower_stats("total_loss"))
                ),
                "policy_loss": torch.mean(torch.stack(self.get_tower_stats("pi_loss"))),
                "entropy": torch.mean(
                    torch.stack(self.get_tower_stats("mean_entropy"))
                ),
                "entropy_coeff": self.entropy_coeff,
                "var_gnorm": global_norm(self.model.trainable_variables()),
                "vf_loss": torch.mean(torch.stack(self.get_tower_stats("vf_loss"))),
                "vf_explained_var": torch.mean(
                    torch.stack(self.get_tower_stats("vf_explained_var"))
                ),
            }
        )

    @override(TorchPolicyV2)
    def postprocess_trajectory(
        self,
        sample_batch: SampleBatch,
        other_agent_batches: Optional[SampleBatch] = None,
        episode=None,
    ):
        """Adds bootstrapped value estimates needed by the V-trace loss."""
        # Call super's postprocess_trajectory first.
        # sample_batch = super().postprocess_trajectory(
        #     sample_batch, other_agent_batches, episode
        # )

        if self.config["vtrace"]:
            # Add the SampleBatch.VALUES_BOOTSTRAPPED column, which we'll need
            # inside the loss for vtrace calculations.
            sample_batch = compute_bootstrap_value(sample_batch, self)

        return sample_batch

    @override(TorchPolicyV2)
    def extra_grad_process(
        self, optimizer: "torch.optim.Optimizer", loss: TensorType
    ) -> Dict[str, TensorType]:
        # Apply (config-driven) gradient clipping before the optimizer step.
        return apply_grad_clipping(self, optimizer, loss)

    @override(TorchPolicyV2)
    def get_batch_divisibility_req(self) -> int:
        # Batches must be divisible by the fragment length so they can be
        # folded into [T, B] without remainder.
        return self.config["rollout_fragment_length"]
+ sample_batch = compute_bootstrap_value(sample_batch, self) + + return sample_batch + + @override(TorchPolicyV2) + def extra_grad_process( + self, optimizer: "torch.optim.Optimizer", loss: TensorType + ) -> Dict[str, TensorType]: + return apply_grad_clipping(self, optimizer, loss) + + @override(TorchPolicyV2) + def get_batch_divisibility_req(self) -> int: + return self.config["rollout_fragment_length"] diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/utils.py b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..f65b76a7f53de6fb3f436b8063341c057529f25b --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/utils.py @@ -0,0 +1,96 @@ +from collections import defaultdict, deque + +import numpy as np + + +class _SleepTimeController: + def __init__(self): + self.L = 0.0 + self.H = 0.4 + + self._recompute_candidates() + + # Defaultdict mapping. + self.results = defaultdict(lambda: deque(maxlen=3)) + + self.iteration = 0 + + def _recompute_candidates(self): + self.center = (self.L + self.H) / 2 + self.low = (self.L + self.center) / 2 + self.high = (self.H + self.center) / 2 + + # Expand a little if range becomes too narrow to avoid + # overoptimization. + if self.H - self.L < 0.00001: + self.L = max(self.center - 0.1, 0.0) + self.H = min(self.center + 0.1, 1.0) + self._recompute_candidates() + # Reduce results, just in case it has grown too much. 
+ c, l, h = ( + self.results[self.center], + self.results[self.low], + self.results[self.high], + ) + self.results = defaultdict(lambda: deque(maxlen=3)) + self.results[self.center] = c + self.results[self.low] = l + self.results[self.high] = h + + @property + def current(self): + if len(self.results[self.center]) < 3: + return self.center + elif len(self.results[self.low]) < 3: + return self.low + else: + return self.high + + def log_result(self, performance): + self.iteration += 1 + + # Skip first 2 iterations for ignoring warm-up effect. + if self.iteration < 2: + return + + self.results[self.current].append(performance) + + # If all candidates have at least 3 results logged, re-evaluate + # and compute new L and H. + center, low, high = self.center, self.low, self.high + if ( + len(self.results[center]) == 3 + and len(self.results[low]) == 3 + and len(self.results[high]) == 3 + ): + perf_center = np.mean(self.results[center]) + perf_low = np.mean(self.results[low]) + perf_high = np.mean(self.results[high]) + # Case: `center` is best. + if perf_center > perf_low and perf_center > perf_high: + self.L = low + self.H = high + # Erase low/high results: We'll not use these again. + self.results.pop(low, None) + self.results.pop(high, None) + # Case: `low` is best. + elif perf_low > perf_center and perf_low > perf_high: + self.H = center + # Erase center/high results: We'll not use these again. + self.results.pop(center, None) + self.results.pop(high, None) + # Case: `high` is best. + else: + self.L = center + # Erase center/low results: We'll not use these again. 
+ self.results.pop(center, None) + self.results.pop(low, None) + + self._recompute_candidates() + + +if __name__ == "__main__": + controller = _SleepTimeController() + for _ in range(1000): + performance = np.random.random() + controller.log_result(performance) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/vtrace_tf.py b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/vtrace_tf.py new file mode 100644 index 0000000000000000000000000000000000000000..61c6a7e366d52e20a0d2c225749461c81c51b04d --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/vtrace_tf.py @@ -0,0 +1,425 @@ +# Copyright 2018 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Functions to compute V-trace off-policy actor critic targets. + +For details and theory see: + +"IMPALA: Scalable Distributed Deep-RL with +Importance Weighted Actor-Learner Architectures" +by Espeholt, Soyer, Munos et al. + +See https://arxiv.org/abs/1802.01561 for the full paper. + +In addition to the original paper's code, changes have been made +to support MultiDiscrete action spaces. behaviour_policy_logits, +target_policy_logits and actions parameters in the entry point +multi_from_logits method accepts lists of tensors instead of just +tensors. 
+""" + +import collections + +from ray.rllib.models.tf.tf_action_dist import Categorical +from ray.rllib.utils.framework import try_import_tf + +tf1, tf, tfv = try_import_tf() + +VTraceFromLogitsReturns = collections.namedtuple( + "VTraceFromLogitsReturns", + [ + "vs", + "pg_advantages", + "log_rhos", + "behaviour_action_log_probs", + "target_action_log_probs", + ], +) + +VTraceReturns = collections.namedtuple("VTraceReturns", "vs pg_advantages") + + +def log_probs_from_logits_and_actions( + policy_logits, actions, dist_class=Categorical, model=None +): + return multi_log_probs_from_logits_and_actions( + [policy_logits], [actions], dist_class, model + )[0] + + +def multi_log_probs_from_logits_and_actions(policy_logits, actions, dist_class, model): + """Computes action log-probs from policy logits and actions. + + In the notation used throughout documentation and comments, T refers to the + time dimension ranging from 0 to T-1. B refers to the batch size and + ACTION_SPACE refers to the list of numbers each representing a number of + actions. + + Args: + policy_logits: A list with length of ACTION_SPACE of float32 + tensors of shapes [T, B, ACTION_SPACE[0]], ..., + [T, B, ACTION_SPACE[-1]] with un-normalized log-probabilities + parameterizing a softmax policy. + actions: A list with length of ACTION_SPACE of tensors of shapes + [T, B, ...], ..., [T, B, ...] + with actions. + dist_class: Python class of the action distribution. + + Returns: + A list with length of ACTION_SPACE of float32 tensors of shapes + [T, B], ..., [T, B] corresponding to the sampling log probability + of the chosen action w.r.t. the policy. 
+ """ + log_probs = [] + for i in range(len(policy_logits)): + p_shape = tf.shape(policy_logits[i]) + a_shape = tf.shape(actions[i]) + policy_logits_flat = tf.reshape( + policy_logits[i], tf.concat([[-1], p_shape[2:]], axis=0) + ) + actions_flat = tf.reshape(actions[i], tf.concat([[-1], a_shape[2:]], axis=0)) + log_probs.append( + tf.reshape( + dist_class(policy_logits_flat, model).logp(actions_flat), a_shape[:2] + ) + ) + + return log_probs + + +def from_logits( + behaviour_policy_logits, + target_policy_logits, + actions, + discounts, + rewards, + values, + bootstrap_value, + dist_class=Categorical, + model=None, + clip_rho_threshold=1.0, + clip_pg_rho_threshold=1.0, + name="vtrace_from_logits", +): + """multi_from_logits wrapper used only for tests""" + + res = multi_from_logits( + [behaviour_policy_logits], + [target_policy_logits], + [actions], + discounts, + rewards, + values, + bootstrap_value, + dist_class, + model, + clip_rho_threshold=clip_rho_threshold, + clip_pg_rho_threshold=clip_pg_rho_threshold, + name=name, + ) + + return VTraceFromLogitsReturns( + vs=res.vs, + pg_advantages=res.pg_advantages, + log_rhos=res.log_rhos, + behaviour_action_log_probs=tf.squeeze(res.behaviour_action_log_probs, axis=0), + target_action_log_probs=tf.squeeze(res.target_action_log_probs, axis=0), + ) + + +def multi_from_logits( + behaviour_policy_logits, + target_policy_logits, + actions, + discounts, + rewards, + values, + bootstrap_value, + dist_class, + model, + behaviour_action_log_probs=None, + clip_rho_threshold=1.0, + clip_pg_rho_threshold=1.0, + name="vtrace_from_logits", +): + r"""V-trace for softmax policies. + + Calculates V-trace actor critic targets for softmax polices as described in + + "IMPALA: Scalable Distributed Deep-RL with + Importance Weighted Actor-Learner Architectures" + by Espeholt, Soyer, Munos et al. 
+ + Target policy refers to the policy we are interested in improving and + behaviour policy refers to the policy that generated the given + rewards and actions. + + In the notation used throughout documentation and comments, T refers to the + time dimension ranging from 0 to T-1. B refers to the batch size and + ACTION_SPACE refers to the list of numbers each representing a number of + actions. + + Args: + behaviour_policy_logits: A list with length of ACTION_SPACE of float32 + tensors of shapes + [T, B, ACTION_SPACE[0]], + ..., + [T, B, ACTION_SPACE[-1]] + with un-normalized log-probabilities parameterizing the softmax behaviour + policy. + target_policy_logits: A list with length of ACTION_SPACE of float32 + tensors of shapes + [T, B, ACTION_SPACE[0]], + ..., + [T, B, ACTION_SPACE[-1]] + with un-normalized log-probabilities parameterizing the softmax target + policy. + actions: A list with length of ACTION_SPACE of + tensors of shapes + [T, B, ...], + ..., + [T, B, ...] + with actions sampled from the behaviour policy. + discounts: A float32 tensor of shape [T, B] with the discount encountered + when following the behaviour policy. + rewards: A float32 tensor of shape [T, B] with the rewards generated by + following the behaviour policy. + values: A float32 tensor of shape [T, B] with the value function estimates + wrt. the target policy. + bootstrap_value: A float32 of shape [B] with the value function estimate at + time T. + dist_class: action distribution class for the logits. + model: backing ModelV2 instance + behaviour_action_log_probs: precalculated values of the behaviour actions + clip_rho_threshold: A scalar float32 tensor with the clipping threshold for + importance weights (rho) when calculating the baseline targets (vs). + rho^bar in the paper. + clip_pg_rho_threshold: A scalar float32 tensor with the clipping threshold + on rho_s in \rho_s \delta log \pi(a|x) (r + \gamma v_{s+1} - V(x_s)). 
+ name: The name scope that all V-trace operations will be created in. + + Returns: + A `VTraceFromLogitsReturns` namedtuple with the following fields: + vs: A float32 tensor of shape [T, B]. Can be used as target to train a + baseline (V(x_t) - vs_t)^2. + pg_advantages: A float 32 tensor of shape [T, B]. Can be used as an + estimate of the advantage in the calculation of policy gradients. + log_rhos: A float32 tensor of shape [T, B] containing the log importance + sampling weights (log rhos). + behaviour_action_log_probs: A float32 tensor of shape [T, B] containing + behaviour policy action log probabilities (log \mu(a_t)). + target_action_log_probs: A float32 tensor of shape [T, B] containing + target policy action probabilities (log \pi(a_t)). + """ + + for i in range(len(behaviour_policy_logits)): + behaviour_policy_logits[i] = tf.convert_to_tensor( + behaviour_policy_logits[i], dtype=tf.float32 + ) + target_policy_logits[i] = tf.convert_to_tensor( + target_policy_logits[i], dtype=tf.float32 + ) + + # Make sure tensor ranks are as expected. + # The rest will be checked by from_action_log_probs. + behaviour_policy_logits[i].shape.assert_has_rank(3) + target_policy_logits[i].shape.assert_has_rank(3) + + with tf1.name_scope( + name, + values=[ + behaviour_policy_logits, + target_policy_logits, + actions, + discounts, + rewards, + values, + bootstrap_value, + ], + ): + target_action_log_probs = multi_log_probs_from_logits_and_actions( + target_policy_logits, actions, dist_class, model + ) + + if len(behaviour_policy_logits) > 1 or behaviour_action_log_probs is None: + # can't use precalculated values, recompute them. 
def from_importance_weights(
    log_rhos,
    discounts,
    rewards,
    values,
    bootstrap_value,
    clip_rho_threshold=1.0,
    clip_pg_rho_threshold=1.0,
    name="vtrace_from_importance_weights",
):
    r"""V-trace from log importance weights.

    Calculates V-trace actor critic targets as described in

    "IMPALA: Scalable Distributed Deep-RL with
    Importance Weighted Actor-Learner Architectures"
    by Espeholt, Soyer, Munos et al.

    In the notation used throughout documentation and comments, T refers to the
    time dimension ranging from 0 to T-1. B refers to the batch size. This code
    also supports the case where all tensors have the same number of additional
    dimensions, e.g., `rewards` is [T, B, C], `values` is [T, B, C],
    `bootstrap_value` is [B, C].

    Args:
        log_rhos: A float32 tensor of shape [T, B] representing the
            log importance sampling weights, i.e.
            log(target_policy(a) / behaviour_policy(a)). V-trace performs operations
            on rhos in log-space for numerical stability.
        discounts: A float32 tensor of shape [T, B] with discounts encountered when
            following the behaviour policy.
        rewards: A float32 tensor of shape [T, B] containing rewards generated by
            following the behaviour policy.
        values: A float32 tensor of shape [T, B] with the value function estimates
            wrt. the target policy.
        bootstrap_value: A float32 of shape [B] with the value function estimate at
            time T.
        clip_rho_threshold: A scalar float32 tensor with the clipping threshold for
            importance weights (rho) when calculating the baseline targets (vs).
            rho^bar in the paper. If None, no clipping is applied.
        clip_pg_rho_threshold: A scalar float32 tensor with the clipping threshold
            on rho_s in \rho_s \delta log \pi(a|x) (r + \gamma v_{s+1} - V(x_s)). If
            None, no clipping is applied.
        name: The name scope that all V-trace operations will be created in.

    Returns:
        A VTraceReturns namedtuple (vs, pg_advantages) where:
            vs: A float32 tensor of shape [T, B]. Can be used as target to
                train a baseline (V(x_t) - vs_t)^2.
            pg_advantages: A float32 tensor of shape [T, B]. Can be used as the
                advantage in the calculation of policy gradients.
    """
    log_rhos = tf.convert_to_tensor(log_rhos, dtype=tf.float32)
    discounts = tf.convert_to_tensor(discounts, dtype=tf.float32)
    rewards = tf.convert_to_tensor(rewards, dtype=tf.float32)
    values = tf.convert_to_tensor(values, dtype=tf.float32)
    bootstrap_value = tf.convert_to_tensor(bootstrap_value, dtype=tf.float32)
    if clip_rho_threshold is not None:
        clip_rho_threshold = tf.convert_to_tensor(clip_rho_threshold, dtype=tf.float32)
    if clip_pg_rho_threshold is not None:
        clip_pg_rho_threshold = tf.convert_to_tensor(
            clip_pg_rho_threshold, dtype=tf.float32
        )

    # Make sure tensor ranks are consistent.
    rho_rank = log_rhos.shape.ndims  # Usually 2.
    values.shape.assert_has_rank(rho_rank)
    bootstrap_value.shape.assert_has_rank(rho_rank - 1)
    discounts.shape.assert_has_rank(rho_rank)
    rewards.shape.assert_has_rank(rho_rank)
    if clip_rho_threshold is not None:
        clip_rho_threshold.shape.assert_has_rank(0)
    if clip_pg_rho_threshold is not None:
        clip_pg_rho_threshold.shape.assert_has_rank(0)

    with tf1.name_scope(
        name, values=[log_rhos, discounts, rewards, values, bootstrap_value]
    ):
        # rho_t = pi(a_t)/mu(a_t); truncated at rho^bar for the vs targets.
        rhos = tf.math.exp(log_rhos)
        if clip_rho_threshold is not None:
            clipped_rhos = tf.minimum(clip_rho_threshold, rhos, name="clipped_rhos")
        else:
            clipped_rhos = rhos

        # c_t ("trace cutting") coefficients, truncated at 1.
        cs = tf.minimum(1.0, rhos, name="cs")
        # Append bootstrapped value to get [v1, ..., v_t+1]
        values_t_plus_1 = tf.concat(
            [values[1:], tf.expand_dims(bootstrap_value, 0)], axis=0
        )
        # Temporal-difference terms delta_t weighted by the clipped rhos.
        deltas = clipped_rhos * (rewards + discounts * values_t_plus_1 - values)

        # All sequences are reversed, computation starts from the back.
        sequences = (
            tf.reverse(discounts, axis=[0]),
            tf.reverse(cs, axis=[0]),
            tf.reverse(deltas, axis=[0]),
        )

        # V-trace vs are calculated through a scan from the back to the
        # beginning of the given trajectory.
        def scanfunc(acc, sequence_item):
            discount_t, c_t, delta_t = sequence_item
            return delta_t + discount_t * c_t * acc

        initial_values = tf.zeros_like(bootstrap_value)
        vs_minus_v_xs = tf.nest.map_structure(
            tf.stop_gradient,
            tf.scan(
                fn=scanfunc,
                elems=sequences,
                initializer=initial_values,
                parallel_iterations=1,
                name="scan",
            ),
        )
        # Reverse the results back to original order.
        vs_minus_v_xs = tf.reverse(vs_minus_v_xs, [0], name="vs_minus_v_xs")

        # Add V(x_s) to get v_s.
        vs = tf.add(vs_minus_v_xs, values, name="vs")

        # Advantage for policy gradient.
        vs_t_plus_1 = tf.concat([vs[1:], tf.expand_dims(bootstrap_value, 0)], axis=0)
        if clip_pg_rho_threshold is not None:
            clipped_pg_rhos = tf.minimum(
                clip_pg_rho_threshold, rhos, name="clipped_pg_rhos"
            )
        else:
            clipped_pg_rhos = rhos
        pg_advantages = clipped_pg_rhos * (rewards + discounts * vs_t_plus_1 - values)

        # Make sure no gradients backpropagated through the returned values.
        return VTraceReturns(
            vs=tf.stop_gradient(vs), pg_advantages=tf.stop_gradient(pg_advantages)
        )


def get_log_rhos(target_action_log_probs, behaviour_action_log_probs):
    """With the selected log_probs for multi-discrete actions of behaviour
    and target policies we compute the log_rhos for calculating the vtrace."""
    t = tf.stack(target_action_log_probs)
    b = tf.stack(behaviour_action_log_probs)
    # Sum over the sub-action-space axis: log(pi/mu) = sum_i (log pi_i - log mu_i).
    log_rhos = tf.reduce_sum(t - b, axis=0)
    return log_rhos
+ +For details and theory see: + +"IMPALA: Scalable Distributed Deep-RL with +Importance Weighted Actor-Learner Architectures" +by Espeholt, Soyer, Munos et al. + +See https://arxiv.org/abs/1802.01561 for the full paper. + +In addition to the original paper's code, changes have been made +to support MultiDiscrete action spaces. behaviour_policy_logits, +target_policy_logits and actions parameters in the entry point +multi_from_logits method accepts lists of tensors instead of just +tensors. +""" + +from ray.rllib.algorithms.impala.vtrace_tf import VTraceFromLogitsReturns, VTraceReturns +from ray.rllib.models.torch.torch_action_dist import TorchCategorical +from ray.rllib.utils import force_list +from ray.rllib.utils.framework import try_import_torch +from ray.rllib.utils.torch_utils import convert_to_torch_tensor + +torch, nn = try_import_torch() + + +def log_probs_from_logits_and_actions( + policy_logits, actions, dist_class=TorchCategorical, model=None +): + return multi_log_probs_from_logits_and_actions( + [policy_logits], [actions], dist_class, model + )[0] + + +def multi_log_probs_from_logits_and_actions(policy_logits, actions, dist_class, model): + """Computes action log-probs from policy logits and actions. + + In the notation used throughout documentation and comments, T refers to the + time dimension ranging from 0 to T-1. B refers to the batch size and + ACTION_SPACE refers to the list of numbers each representing a number of + actions. + + Args: + policy_logits: A list with length of ACTION_SPACE of float32 + tensors of shapes [T, B, ACTION_SPACE[0]], ..., + [T, B, ACTION_SPACE[-1]] with un-normalized log-probabilities + parameterizing a softmax policy. + actions: A list with length of ACTION_SPACE of tensors of shapes + [T, B, ...], ..., [T, B, ...] + with actions. + dist_class: Python class of the action distribution. 
+ + Returns: + A list with length of ACTION_SPACE of float32 tensors of shapes + [T, B], ..., [T, B] corresponding to the sampling log probability + of the chosen action w.r.t. the policy. + """ + log_probs = [] + for i in range(len(policy_logits)): + p_shape = policy_logits[i].shape + a_shape = actions[i].shape + policy_logits_flat = torch.reshape(policy_logits[i], (-1,) + tuple(p_shape[2:])) + actions_flat = torch.reshape(actions[i], (-1,) + tuple(a_shape[2:])) + log_probs.append( + torch.reshape( + dist_class(policy_logits_flat, model).logp(actions_flat), a_shape[:2] + ) + ) + + return log_probs + + +def from_logits( + behaviour_policy_logits, + target_policy_logits, + actions, + discounts, + rewards, + values, + bootstrap_value, + dist_class=TorchCategorical, + model=None, + clip_rho_threshold=1.0, + clip_pg_rho_threshold=1.0, +): + """multi_from_logits wrapper used only for tests""" + + res = multi_from_logits( + [behaviour_policy_logits], + [target_policy_logits], + [actions], + discounts, + rewards, + values, + bootstrap_value, + dist_class, + model, + clip_rho_threshold=clip_rho_threshold, + clip_pg_rho_threshold=clip_pg_rho_threshold, + ) + + assert len(res.behaviour_action_log_probs) == 1 + assert len(res.target_action_log_probs) == 1 + return VTraceFromLogitsReturns( + vs=res.vs, + pg_advantages=res.pg_advantages, + log_rhos=res.log_rhos, + behaviour_action_log_probs=res.behaviour_action_log_probs[0], + target_action_log_probs=res.target_action_log_probs[0], + ) + + +def multi_from_logits( + behaviour_policy_logits, + target_policy_logits, + actions, + discounts, + rewards, + values, + bootstrap_value, + dist_class, + model, + behaviour_action_log_probs=None, + clip_rho_threshold=1.0, + clip_pg_rho_threshold=1.0, +): + r"""V-trace for softmax policies. 
+ + Calculates V-trace actor critic targets for softmax polices as described in + + "IMPALA: Scalable Distributed Deep-RL with + Importance Weighted Actor-Learner Architectures" + by Espeholt, Soyer, Munos et al. + + Target policy refers to the policy we are interested in improving and + behaviour policy refers to the policy that generated the given + rewards and actions. + + In the notation used throughout documentation and comments, T refers to the + time dimension ranging from 0 to T-1. B refers to the batch size and + ACTION_SPACE refers to the list of numbers each representing a number of + actions. + + Args: + behaviour_policy_logits: A list with length of ACTION_SPACE of float32 + tensors of shapes [T, B, ACTION_SPACE[0]], ..., + [T, B, ACTION_SPACE[-1]] with un-normalized log-probabilities + parameterizing the softmax behavior policy. + target_policy_logits: A list with length of ACTION_SPACE of float32 + tensors of shapes [T, B, ACTION_SPACE[0]], ..., + [T, B, ACTION_SPACE[-1]] with un-normalized log-probabilities + parameterizing the softmax target policy. + actions: A list with length of ACTION_SPACE of tensors of shapes + [T, B, ...], ..., [T, B, ...] + with actions sampled from the behavior policy. + discounts: A float32 tensor of shape [T, B] with the discount + encountered when following the behavior policy. + rewards: A float32 tensor of shape [T, B] with the rewards generated by + following the behavior policy. + values: A float32 tensor of shape [T, B] with the value function + estimates wrt. the target policy. + bootstrap_value: A float32 of shape [B] with the value function + estimate at time T. + dist_class: action distribution class for the logits. + model: backing ModelV2 instance + behaviour_action_log_probs: Precalculated values of the behavior + actions. + clip_rho_threshold: A scalar float32 tensor with the clipping threshold + for importance weights (rho) when calculating the baseline targets + (vs). rho^bar in the paper. 
+ clip_pg_rho_threshold: A scalar float32 tensor with the clipping + threshold on rho_s in: + \rho_s \delta log \pi(a|x) (r + \gamma v_{s+1} - V(x_s)). + + Returns: + A `VTraceFromLogitsReturns` namedtuple with the following fields: + vs: A float32 tensor of shape [T, B]. Can be used as target to train a + baseline (V(x_t) - vs_t)^2. + pg_advantages: A float 32 tensor of shape [T, B]. Can be used as an + estimate of the advantage in the calculation of policy gradients. + log_rhos: A float32 tensor of shape [T, B] containing the log + importance sampling weights (log rhos). + behaviour_action_log_probs: A float32 tensor of shape [T, B] containing + behaviour policy action log probabilities (log \mu(a_t)). + target_action_log_probs: A float32 tensor of shape [T, B] containing + target policy action probabilities (log \pi(a_t)). + """ + + behaviour_policy_logits = convert_to_torch_tensor( + behaviour_policy_logits, device="cpu" + ) + target_policy_logits = convert_to_torch_tensor(target_policy_logits, device="cpu") + actions = convert_to_torch_tensor(actions, device="cpu") + + # Make sure tensor ranks are as expected. + # The rest will be checked by from_action_log_probs. + for i in range(len(behaviour_policy_logits)): + assert len(behaviour_policy_logits[i].size()) == 3 + assert len(target_policy_logits[i].size()) == 3 + + target_action_log_probs = multi_log_probs_from_logits_and_actions( + target_policy_logits, actions, dist_class, model + ) + + if len(behaviour_policy_logits) > 1 or behaviour_action_log_probs is None: + # can't use precalculated values, recompute them. 
Note that + # recomputing won't work well for autoregressive action dists + # which may have variables not captured by 'logits' + behaviour_action_log_probs = multi_log_probs_from_logits_and_actions( + behaviour_policy_logits, actions, dist_class, model + ) + + behaviour_action_log_probs = convert_to_torch_tensor( + behaviour_action_log_probs, device="cpu" + ) + behaviour_action_log_probs = force_list(behaviour_action_log_probs) + # log_rhos = target_logp - behavior_logp + log_rhos = get_log_rhos(target_action_log_probs, behaviour_action_log_probs) + + vtrace_returns = from_importance_weights( + log_rhos=log_rhos, + discounts=discounts, + rewards=rewards, + values=values, + bootstrap_value=bootstrap_value, + clip_rho_threshold=clip_rho_threshold, + clip_pg_rho_threshold=clip_pg_rho_threshold, + ) + + return VTraceFromLogitsReturns( + log_rhos=log_rhos, + behaviour_action_log_probs=behaviour_action_log_probs, + target_action_log_probs=target_action_log_probs, + **vtrace_returns._asdict() + ) + + +def from_importance_weights( + log_rhos, + discounts, + rewards, + values, + bootstrap_value, + clip_rho_threshold=1.0, + clip_pg_rho_threshold=1.0, +): + r"""V-trace from log importance weights. + + Calculates V-trace actor critic targets as described in + + "IMPALA: Scalable Distributed Deep-RL with + Importance Weighted Actor-Learner Architectures" + by Espeholt, Soyer, Munos et al. + + In the notation used throughout documentation and comments, T refers to the + time dimension ranging from 0 to T-1. B refers to the batch size. This code + also supports the case where all tensors have the same number of additional + dimensions, e.g., `rewards` is [T, B, C], `values` is [T, B, C], + `bootstrap_value` is [B, C]. + + Args: + log_rhos: A float32 tensor of shape [T, B] representing the log + importance sampling weights, i.e. + log(target_policy(a) / behaviour_policy(a)). V-trace performs + operations on rhos in log-space for numerical stability. 
+ discounts: A float32 tensor of shape [T, B] with discounts encountered + when following the behaviour policy. + rewards: A float32 tensor of shape [T, B] containing rewards generated + by following the behaviour policy. + values: A float32 tensor of shape [T, B] with the value function + estimates wrt. the target policy. + bootstrap_value: A float32 of shape [B] with the value function + estimate at time T. + clip_rho_threshold: A scalar float32 tensor with the clipping threshold + for importance weights (rho) when calculating the baseline targets + (vs). rho^bar in the paper. If None, no clipping is applied. + clip_pg_rho_threshold: A scalar float32 tensor with the clipping + threshold on rho_s in + \rho_s \delta log \pi(a|x) (r + \gamma v_{s+1} - V(x_s)). + If None, no clipping is applied. + + Returns: + A VTraceReturns namedtuple (vs, pg_advantages) where: + vs: A float32 tensor of shape [T, B]. Can be used as target to + train a baseline (V(x_t) - vs_t)^2. + pg_advantages: A float32 tensor of shape [T, B]. Can be used as the + advantage in the calculation of policy gradients. + """ + log_rhos = convert_to_torch_tensor(log_rhos, device="cpu") + discounts = convert_to_torch_tensor(discounts, device="cpu") + rewards = convert_to_torch_tensor(rewards, device="cpu") + values = convert_to_torch_tensor(values, device="cpu") + bootstrap_value = convert_to_torch_tensor(bootstrap_value, device="cpu") + + # Make sure tensor ranks are consistent. + rho_rank = len(log_rhos.size()) # Usually 2. 
+ assert rho_rank == len(values.size()) + assert rho_rank - 1 == len(bootstrap_value.size()), "must have rank {}".format( + rho_rank - 1 + ) + assert rho_rank == len(discounts.size()) + assert rho_rank == len(rewards.size()) + + rhos = torch.exp(log_rhos) + if clip_rho_threshold is not None: + clipped_rhos = torch.clamp_max(rhos, clip_rho_threshold) + else: + clipped_rhos = rhos + + cs = torch.clamp_max(rhos, 1.0) + # Append bootstrapped value to get [v1, ..., v_t+1] + values_t_plus_1 = torch.cat( + [values[1:], torch.unsqueeze(bootstrap_value, 0)], dim=0 + ) + deltas = clipped_rhos * (rewards + discounts * values_t_plus_1 - values) + + vs_minus_v_xs = [torch.zeros_like(bootstrap_value)] + for i in reversed(range(len(discounts))): + discount_t, c_t, delta_t = discounts[i], cs[i], deltas[i] + vs_minus_v_xs.append(delta_t + discount_t * c_t * vs_minus_v_xs[-1]) + vs_minus_v_xs = torch.stack(vs_minus_v_xs[1:]) + # Reverse the results back to original order. + vs_minus_v_xs = torch.flip(vs_minus_v_xs, dims=[0]) + # Add V(x_s) to get v_s. + vs = vs_minus_v_xs + values + + # Advantage for policy gradient. + vs_t_plus_1 = torch.cat([vs[1:], torch.unsqueeze(bootstrap_value, 0)], dim=0) + if clip_pg_rho_threshold is not None: + clipped_pg_rhos = torch.clamp_max(rhos, clip_pg_rho_threshold) + else: + clipped_pg_rhos = rhos + pg_advantages = clipped_pg_rhos * (rewards + discounts * vs_t_plus_1 - values) + + # Make sure no gradients backpropagated through the returned values. + return VTraceReturns(vs=vs.detach(), pg_advantages=pg_advantages.detach()) + + +def get_log_rhos(target_action_log_probs, behaviour_action_log_probs): + """With the selected log_probs for multi-discrete actions of behavior + and target policies we compute the log_rhos for calculating the vtrace.""" + t = torch.stack(target_action_log_probs) + b = torch.stack(behaviour_action_log_probs) + log_rhos = torch.sum(t - b, dim=0) + return log_rhos