koichi12 commited on
Commit
30f24c0
·
verified ·
1 Parent(s): adce983

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +2 -0
  2. .venv/lib/python3.11/site-packages/ray/_private/__pycache__/test_utils.cpython-311.pyc +3 -0
  3. .venv/lib/python3.11/site-packages/ray/_private/__pycache__/worker.cpython-311.pyc +3 -0
  4. .venv/lib/python3.11/site-packages/ray/rllib/algorithms/__pycache__/__init__.cpython-311.pyc +0 -0
  5. .venv/lib/python3.11/site-packages/ray/rllib/algorithms/__pycache__/callbacks.cpython-311.pyc +0 -0
  6. .venv/lib/python3.11/site-packages/ray/rllib/algorithms/__pycache__/mock.cpython-311.pyc +0 -0
  7. .venv/lib/python3.11/site-packages/ray/rllib/algorithms/__pycache__/registry.cpython-311.pyc +0 -0
  8. .venv/lib/python3.11/site-packages/ray/rllib/algorithms/__pycache__/utils.cpython-311.pyc +0 -0
  9. .venv/lib/python3.11/site-packages/ray/rllib/algorithms/algorithm_config.py +0 -0
  10. .venv/lib/python3.11/site-packages/ray/rllib/algorithms/bc/torch/__pycache__/default_bc_torch_rl_module.cpython-311.pyc +0 -0
  11. .venv/lib/python3.11/site-packages/ray/rllib/algorithms/callbacks.py +8 -0
  12. .venv/lib/python3.11/site-packages/ray/rllib/algorithms/cql/__init__.py +9 -0
  13. .venv/lib/python3.11/site-packages/ray/rllib/algorithms/cql/__pycache__/__init__.cpython-311.pyc +0 -0
  14. .venv/lib/python3.11/site-packages/ray/rllib/algorithms/cql/__pycache__/cql.cpython-311.pyc +0 -0
  15. .venv/lib/python3.11/site-packages/ray/rllib/algorithms/cql/__pycache__/cql_tf_policy.cpython-311.pyc +0 -0
  16. .venv/lib/python3.11/site-packages/ray/rllib/algorithms/cql/__pycache__/cql_torch_policy.cpython-311.pyc +0 -0
  17. .venv/lib/python3.11/site-packages/ray/rllib/algorithms/cql/cql.py +388 -0
  18. .venv/lib/python3.11/site-packages/ray/rllib/algorithms/cql/cql_tf_policy.py +426 -0
  19. .venv/lib/python3.11/site-packages/ray/rllib/algorithms/cql/cql_torch_policy.py +406 -0
  20. .venv/lib/python3.11/site-packages/ray/rllib/algorithms/cql/torch/__init__.py +0 -0
  21. .venv/lib/python3.11/site-packages/ray/rllib/algorithms/cql/torch/__pycache__/__init__.cpython-311.pyc +0 -0
  22. .venv/lib/python3.11/site-packages/ray/rllib/algorithms/cql/torch/__pycache__/cql_torch_learner.cpython-311.pyc +0 -0
  23. .venv/lib/python3.11/site-packages/ray/rllib/algorithms/cql/torch/__pycache__/default_cql_torch_rl_module.cpython-311.pyc +0 -0
  24. .venv/lib/python3.11/site-packages/ray/rllib/algorithms/cql/torch/cql_torch_learner.py +275 -0
  25. .venv/lib/python3.11/site-packages/ray/rllib/algorithms/cql/torch/default_cql_torch_rl_module.py +206 -0
  26. .venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/__init__.py +15 -0
  27. .venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/dreamerv3.py +750 -0
  28. .venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/dreamerv3_catalog.py +80 -0
  29. .venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/dreamerv3_learner.py +31 -0
  30. .venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/dreamerv3_rl_module.py +153 -0
  31. .venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/__init__.py +0 -0
  32. .venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/__pycache__/__init__.cpython-311.pyc +0 -0
  33. .venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/__pycache__/dreamerv3_tf_learner.cpython-311.pyc +0 -0
  34. .venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/__pycache__/dreamerv3_tf_rl_module.cpython-311.pyc +0 -0
  35. .venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/dreamerv3_tf_learner.py +915 -0
  36. .venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/dreamerv3_tf_rl_module.py +23 -0
  37. .venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/__init__.py +0 -0
  38. .venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/actor_network.py +203 -0
  39. .venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/cnn_atari.py +112 -0
  40. .venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/conv_transpose_atari.py +187 -0
  41. .venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/vector_decoder.py +98 -0
  42. .venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/critic_network.py +177 -0
  43. .venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/disagree_networks.py +94 -0
  44. .venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/dreamer_model.py +606 -0
  45. .venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/world_model.py +407 -0
  46. .venv/lib/python3.11/site-packages/ray/rllib/algorithms/ppo/__init__.py +12 -0
  47. .venv/lib/python3.11/site-packages/ray/rllib/algorithms/ppo/__pycache__/__init__.cpython-311.pyc +0 -0
  48. .venv/lib/python3.11/site-packages/ray/rllib/algorithms/ppo/__pycache__/default_ppo_rl_module.cpython-311.pyc +0 -0
  49. .venv/lib/python3.11/site-packages/ray/rllib/algorithms/ppo/__pycache__/ppo.cpython-311.pyc +0 -0
  50. .venv/lib/python3.11/site-packages/ray/rllib/algorithms/ppo/__pycache__/ppo_catalog.cpython-311.pyc +0 -0
.gitattributes CHANGED
@@ -173,3 +173,5 @@ tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/_
173
  .venv/lib/python3.11/site-packages/ray/_private/runtime_env/agent/thirdparty_files/idna/__pycache__/idnadata.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
174
  .venv/lib/python3.11/site-packages/ray/_private/runtime_env/agent/thirdparty_files/propcache/_helpers_c.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
175
  .venv/lib/python3.11/site-packages/ray/jars/ray_dist.jar filter=lfs diff=lfs merge=lfs -text
 
 
 
173
  .venv/lib/python3.11/site-packages/ray/_private/runtime_env/agent/thirdparty_files/idna/__pycache__/idnadata.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
174
  .venv/lib/python3.11/site-packages/ray/_private/runtime_env/agent/thirdparty_files/propcache/_helpers_c.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
175
  .venv/lib/python3.11/site-packages/ray/jars/ray_dist.jar filter=lfs diff=lfs merge=lfs -text
176
+ .venv/lib/python3.11/site-packages/ray/_private/__pycache__/worker.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
177
+ .venv/lib/python3.11/site-packages/ray/_private/__pycache__/test_utils.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
.venv/lib/python3.11/site-packages/ray/_private/__pycache__/test_utils.cpython-311.pyc ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cc96e86e5e36ee78f9cfcd3d87220524f3cb583ba7b0472482fe408fbc1c57fa
3
+ size 114677
.venv/lib/python3.11/site-packages/ray/_private/__pycache__/worker.cpython-311.pyc ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e715fb00f3b4360472455b9c5d37eb8337c42bc50fea95d2d75fa67bebdcb096
3
+ size 158454
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (1.39 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/__pycache__/callbacks.cpython-311.pyc ADDED
Binary file (424 Bytes). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/__pycache__/mock.cpython-311.pyc ADDED
Binary file (8.29 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/__pycache__/registry.cpython-311.pyc ADDED
Binary file (6.39 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/__pycache__/utils.cpython-311.pyc ADDED
Binary file (5.86 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/algorithm_config.py ADDED
The diff for this file is too large to render. See raw diff
 
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/bc/torch/__pycache__/default_bc_torch_rl_module.cpython-311.pyc ADDED
Binary file (3.16 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/callbacks.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # @OldAPIStack
2
+ from ray.rllib.callbacks.callbacks import RLlibCallback
3
+ from ray.rllib.callbacks.utils import _make_multi_callbacks
4
+
5
+
6
+ # Backward compatibility
7
+ DefaultCallbacks = RLlibCallback
8
+ make_multi_callbacks = _make_multi_callbacks
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/cql/__init__.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ from ray.rllib.algorithms.cql.cql import CQL, CQLConfig
2
+ from ray.rllib.algorithms.cql.cql_torch_policy import CQLTorchPolicy
3
+
4
+ __all__ = [
5
+ "CQL",
6
+ "CQLConfig",
7
+ # @OldAPIStack
8
+ "CQLTorchPolicy",
9
+ ]
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/cql/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (438 Bytes). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/cql/__pycache__/cql.cpython-311.pyc ADDED
Binary file (17.9 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/cql/__pycache__/cql_tf_policy.cpython-311.pyc ADDED
Binary file (20.7 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/cql/__pycache__/cql_torch_policy.cpython-311.pyc ADDED
Binary file (19.7 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/cql/cql.py ADDED
@@ -0,0 +1,388 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from typing import Optional, Type, Union
3
+
4
+ from ray.rllib.algorithms.algorithm_config import AlgorithmConfig, NotProvided
5
+ from ray.rllib.algorithms.cql.cql_tf_policy import CQLTFPolicy
6
+ from ray.rllib.algorithms.cql.cql_torch_policy import CQLTorchPolicy
7
+ from ray.rllib.algorithms.sac.sac import (
8
+ SAC,
9
+ SACConfig,
10
+ )
11
+ from ray.rllib.connectors.common.add_observations_from_episodes_to_batch import (
12
+ AddObservationsFromEpisodesToBatch,
13
+ )
14
+ from ray.rllib.connectors.learner.add_next_observations_from_episodes_to_train_batch import ( # noqa
15
+ AddNextObservationsFromEpisodesToTrainBatch,
16
+ )
17
+ from ray.rllib.core.learner.learner import Learner
18
+ from ray.rllib.core.rl_module.rl_module import RLModuleSpec
19
+ from ray.rllib.execution.rollout_ops import (
20
+ synchronous_parallel_sample,
21
+ )
22
+ from ray.rllib.execution.train_ops import (
23
+ multi_gpu_train_one_step,
24
+ train_one_step,
25
+ )
26
+ from ray.rllib.policy.policy import Policy
27
+ from ray.rllib.utils.annotations import OldAPIStack, override
28
+ from ray.rllib.utils.deprecation import (
29
+ DEPRECATED_VALUE,
30
+ deprecation_warning,
31
+ )
32
+ from ray.rllib.utils.framework import try_import_tf, try_import_tfp
33
+ from ray.rllib.utils.metrics import (
34
+ ALL_MODULES,
35
+ LEARNER_RESULTS,
36
+ LEARNER_UPDATE_TIMER,
37
+ LAST_TARGET_UPDATE_TS,
38
+ NUM_AGENT_STEPS_SAMPLED,
39
+ NUM_AGENT_STEPS_TRAINED,
40
+ NUM_ENV_STEPS_SAMPLED,
41
+ NUM_ENV_STEPS_TRAINED,
42
+ NUM_TARGET_UPDATES,
43
+ OFFLINE_SAMPLING_TIMER,
44
+ TARGET_NET_UPDATE_TIMER,
45
+ SYNCH_WORKER_WEIGHTS_TIMER,
46
+ SAMPLE_TIMER,
47
+ TIMERS,
48
+ )
49
+ from ray.rllib.utils.typing import ResultDict, RLModuleSpecType
50
+
51
+ tf1, tf, tfv = try_import_tf()
52
+ tfp = try_import_tfp()
53
+ logger = logging.getLogger(__name__)
54
+
55
+
56
+ class CQLConfig(SACConfig):
57
+ """Defines a configuration class from which a CQL can be built.
58
+
59
+ .. testcode::
60
+ :skipif: True
61
+
62
+ from ray.rllib.algorithms.cql import CQLConfig
63
+ config = CQLConfig().training(gamma=0.9, lr=0.01)
64
+ config = config.resources(num_gpus=0)
65
+ config = config.env_runners(num_env_runners=4)
66
+ print(config.to_dict())
67
+ # Build a Algorithm object from the config and run 1 training iteration.
68
+ algo = config.build(env="CartPole-v1")
69
+ algo.train()
70
+ """
71
+
72
+ def __init__(self, algo_class=None):
73
+ super().__init__(algo_class=algo_class or CQL)
74
+
75
+ # fmt: off
76
+ # __sphinx_doc_begin__
77
+ # CQL-specific config settings:
78
+ self.bc_iters = 20000
79
+ self.temperature = 1.0
80
+ self.num_actions = 10
81
+ self.lagrangian = False
82
+ self.lagrangian_thresh = 5.0
83
+ self.min_q_weight = 5.0
84
+ self.deterministic_backup = True
85
+ self.lr = 3e-4
86
+ # Note, the new stack defines learning rates for each component.
87
+ # The base learning rate `lr` has to be set to `None`, if using
88
+ # the new stack.
89
+ self.actor_lr = 1e-4
90
+ self.critic_lr = 1e-3
91
+ self.alpha_lr = 1e-3
92
+
93
+ self.replay_buffer_config = {
94
+ "_enable_replay_buffer_api": True,
95
+ "type": "MultiAgentPrioritizedReplayBuffer",
96
+ "capacity": int(1e6),
97
+ # If True prioritized replay buffer will be used.
98
+ "prioritized_replay": False,
99
+ "prioritized_replay_alpha": 0.6,
100
+ "prioritized_replay_beta": 0.4,
101
+ "prioritized_replay_eps": 1e-6,
102
+ # Whether to compute priorities already on the remote worker side.
103
+ "worker_side_prioritization": False,
104
+ }
105
+
106
+ # Changes to Algorithm's/SACConfig's default:
107
+
108
+ # .reporting()
109
+ self.min_sample_timesteps_per_iteration = 0
110
+ self.min_train_timesteps_per_iteration = 100
111
+ # fmt: on
112
+ # __sphinx_doc_end__
113
+
114
+ self.timesteps_per_iteration = DEPRECATED_VALUE
115
+
116
+ @override(SACConfig)
117
+ def training(
118
+ self,
119
+ *,
120
+ bc_iters: Optional[int] = NotProvided,
121
+ temperature: Optional[float] = NotProvided,
122
+ num_actions: Optional[int] = NotProvided,
123
+ lagrangian: Optional[bool] = NotProvided,
124
+ lagrangian_thresh: Optional[float] = NotProvided,
125
+ min_q_weight: Optional[float] = NotProvided,
126
+ deterministic_backup: Optional[bool] = NotProvided,
127
+ **kwargs,
128
+ ) -> "CQLConfig":
129
+ """Sets the training-related configuration.
130
+
131
+ Args:
132
+ bc_iters: Number of iterations with Behavior Cloning pretraining.
133
+ temperature: CQL loss temperature.
134
+ num_actions: Number of actions to sample for CQL loss
135
+ lagrangian: Whether to use the Lagrangian for Alpha Prime (in CQL loss).
136
+ lagrangian_thresh: Lagrangian threshold.
137
+ min_q_weight: in Q weight multiplier.
138
+ deterministic_backup: If the target in the Bellman update should have an
139
+ entropy backup. Defaults to `True`.
140
+
141
+ Returns:
142
+ This updated AlgorithmConfig object.
143
+ """
144
+ # Pass kwargs onto super's `training()` method.
145
+ super().training(**kwargs)
146
+
147
+ if bc_iters is not NotProvided:
148
+ self.bc_iters = bc_iters
149
+ if temperature is not NotProvided:
150
+ self.temperature = temperature
151
+ if num_actions is not NotProvided:
152
+ self.num_actions = num_actions
153
+ if lagrangian is not NotProvided:
154
+ self.lagrangian = lagrangian
155
+ if lagrangian_thresh is not NotProvided:
156
+ self.lagrangian_thresh = lagrangian_thresh
157
+ if min_q_weight is not NotProvided:
158
+ self.min_q_weight = min_q_weight
159
+ if deterministic_backup is not NotProvided:
160
+ self.deterministic_backup = deterministic_backup
161
+
162
+ return self
163
+
164
+ @override(AlgorithmConfig)
165
+ def offline_data(self, **kwargs) -> "CQLConfig":
166
+
167
+ super().offline_data(**kwargs)
168
+
169
+ # Check, if the passed in class incorporates the `OfflinePreLearner`
170
+ # interface.
171
+ if "prelearner_class" in kwargs:
172
+ from ray.rllib.offline.offline_data import OfflinePreLearner
173
+
174
+ if not issubclass(kwargs.get("prelearner_class"), OfflinePreLearner):
175
+ raise ValueError(
176
+ f"`prelearner_class` {kwargs.get('prelearner_class')} is not a "
177
+ "subclass of `OfflinePreLearner`. Any class passed to "
178
+ "`prelearner_class` needs to implement the interface given by "
179
+ "`OfflinePreLearner`."
180
+ )
181
+
182
+ return self
183
+
184
+ @override(SACConfig)
185
+ def get_default_learner_class(self) -> Union[Type["Learner"], str]:
186
+ if self.framework_str == "torch":
187
+ from ray.rllib.algorithms.cql.torch.cql_torch_learner import CQLTorchLearner
188
+
189
+ return CQLTorchLearner
190
+ else:
191
+ raise ValueError(
192
+ f"The framework {self.framework_str} is not supported. "
193
+ "Use `'torch'` instead."
194
+ )
195
+
196
+ @override(AlgorithmConfig)
197
+ def build_learner_connector(
198
+ self,
199
+ input_observation_space,
200
+ input_action_space,
201
+ device=None,
202
+ ):
203
+ pipeline = super().build_learner_connector(
204
+ input_observation_space=input_observation_space,
205
+ input_action_space=input_action_space,
206
+ device=device,
207
+ )
208
+
209
+ # Prepend the "add-NEXT_OBS-from-episodes-to-train-batch" connector piece (right
210
+ # after the corresponding "add-OBS-..." default piece).
211
+ pipeline.insert_after(
212
+ AddObservationsFromEpisodesToBatch,
213
+ AddNextObservationsFromEpisodesToTrainBatch(),
214
+ )
215
+
216
+ return pipeline
217
+
218
+ @override(SACConfig)
219
+ def validate(self) -> None:
220
+ # First check, whether old `timesteps_per_iteration` is used.
221
+ if self.timesteps_per_iteration != DEPRECATED_VALUE:
222
+ deprecation_warning(
223
+ old="timesteps_per_iteration",
224
+ new="min_train_timesteps_per_iteration",
225
+ error=True,
226
+ )
227
+
228
+ # Call super's validation method.
229
+ super().validate()
230
+
231
+ # CQL-torch performs the optimizer steps inside the loss function.
232
+ # Using the multi-GPU optimizer will therefore not work (see multi-GPU
233
+ # check above) and we must use the simple optimizer for now.
234
+ if self.simple_optimizer is not True and self.framework_str == "torch":
235
+ self.simple_optimizer = True
236
+
237
+ if self.framework_str in ["tf", "tf2"] and tfp is None:
238
+ logger.warning(
239
+ "You need `tensorflow_probability` in order to run CQL! "
240
+ "Install it via `pip install tensorflow_probability`. Your "
241
+ f"tf.__version__={tf.__version__ if tf else None}."
242
+ "Trying to import tfp results in the following error:"
243
+ )
244
+ try_import_tfp(error=True)
245
+
246
+ # Assert that for a local learner the number of iterations is 1. Note,
247
+ # this is needed because we have no iterators, but instead a single
248
+ # batch returned directly from the `OfflineData.sample` method.
249
+ if (
250
+ self.num_learners == 0
251
+ and not self.dataset_num_iters_per_learner
252
+ and self.enable_rl_module_and_learner
253
+ ):
254
+ self._value_error(
255
+ "When using a single local learner the number of iterations "
256
+ "per learner, `dataset_num_iters_per_learner` has to be defined. "
257
+ "Set this hyperparameter in the `AlgorithmConfig.offline_data`."
258
+ )
259
+
260
+ @override(SACConfig)
261
+ def get_default_rl_module_spec(self) -> RLModuleSpecType:
262
+ if self.framework_str == "torch":
263
+ from ray.rllib.algorithms.cql.torch.default_cql_torch_rl_module import (
264
+ DefaultCQLTorchRLModule,
265
+ )
266
+
267
+ return RLModuleSpec(module_class=DefaultCQLTorchRLModule)
268
+ else:
269
+ raise ValueError(
270
+ f"The framework {self.framework_str} is not supported. " "Use `torch`."
271
+ )
272
+
273
+ @property
274
+ def _model_config_auto_includes(self):
275
+ return super()._model_config_auto_includes | {
276
+ "num_actions": self.num_actions,
277
+ }
278
+
279
+
280
+ class CQL(SAC):
281
+ """CQL (derived from SAC)."""
282
+
283
+ @classmethod
284
+ @override(SAC)
285
+ def get_default_config(cls) -> AlgorithmConfig:
286
+ return CQLConfig()
287
+
288
+ @classmethod
289
+ @override(SAC)
290
+ def get_default_policy_class(
291
+ cls, config: AlgorithmConfig
292
+ ) -> Optional[Type[Policy]]:
293
+ if config["framework"] == "torch":
294
+ return CQLTorchPolicy
295
+ else:
296
+ return CQLTFPolicy
297
+
298
+ @override(SAC)
299
+ def training_step(self) -> None:
300
+ # Old API stack (Policy, RolloutWorker, Connector).
301
+ if not self.config.enable_env_runner_and_connector_v2:
302
+ return self._training_step_old_api_stack()
303
+
304
+ # Sampling from offline data.
305
+ with self.metrics.log_time((TIMERS, OFFLINE_SAMPLING_TIMER)):
306
+ # Return an iterator in case we are using remote learners.
307
+ batch_or_iterator = self.offline_data.sample(
308
+ num_samples=self.config.train_batch_size_per_learner,
309
+ num_shards=self.config.num_learners,
310
+ return_iterator=self.config.num_learners > 1,
311
+ )
312
+
313
+ # Updating the policy.
314
+ with self.metrics.log_time((TIMERS, LEARNER_UPDATE_TIMER)):
315
+ # TODO (simon, sven): Check, if we should execute directly s.th. like
316
+ # `LearnerGroup.update_from_iterator()`.
317
+ learner_results = self.learner_group._update(
318
+ batch=batch_or_iterator,
319
+ minibatch_size=self.config.train_batch_size_per_learner,
320
+ num_iters=self.config.dataset_num_iters_per_learner,
321
+ )
322
+
323
+ # Log training results.
324
+ self.metrics.merge_and_log_n_dicts(learner_results, key=LEARNER_RESULTS)
325
+
326
+ # Synchronize weights.
327
+ # As the results contain for each policy the loss and in addition the
328
+ # total loss over all policies is returned, this total loss has to be
329
+ # removed.
330
+ modules_to_update = set(learner_results[0].keys()) - {ALL_MODULES}
331
+
332
+ if self.eval_env_runner_group:
333
+ # Update weights - after learning on the local worker -
334
+ # on all remote workers. Note, we only have the local `EnvRunner`,
335
+ # but from this `EnvRunner` the evaulation `EnvRunner`s get updated.
336
+ with self.metrics.log_time((TIMERS, SYNCH_WORKER_WEIGHTS_TIMER)):
337
+ self.eval_env_runner_group.sync_weights(
338
+ # Sync weights from learner_group to all EnvRunners.
339
+ from_worker_or_learner_group=self.learner_group,
340
+ policies=modules_to_update,
341
+ inference_only=True,
342
+ )
343
+
344
+ @OldAPIStack
345
+ def _training_step_old_api_stack(self) -> ResultDict:
346
+ # Collect SampleBatches from sample workers.
347
+ with self._timers[SAMPLE_TIMER]:
348
+ train_batch = synchronous_parallel_sample(worker_set=self.env_runner_group)
349
+ train_batch = train_batch.as_multi_agent()
350
+ self._counters[NUM_AGENT_STEPS_SAMPLED] += train_batch.agent_steps()
351
+ self._counters[NUM_ENV_STEPS_SAMPLED] += train_batch.env_steps()
352
+
353
+ # Postprocess batch before we learn on it.
354
+ post_fn = self.config.get("before_learn_on_batch") or (lambda b, *a: b)
355
+ train_batch = post_fn(train_batch, self.env_runner_group, self.config)
356
+
357
+ # Learn on training batch.
358
+ # Use simple optimizer (only for multi-agent or tf-eager; all other
359
+ # cases should use the multi-GPU optimizer, even if only using 1 GPU)
360
+ if self.config.get("simple_optimizer") is True:
361
+ train_results = train_one_step(self, train_batch)
362
+ else:
363
+ train_results = multi_gpu_train_one_step(self, train_batch)
364
+
365
+ # Update target network every `target_network_update_freq` training steps.
366
+ cur_ts = self._counters[
367
+ NUM_AGENT_STEPS_TRAINED
368
+ if self.config.count_steps_by == "agent_steps"
369
+ else NUM_ENV_STEPS_TRAINED
370
+ ]
371
+ last_update = self._counters[LAST_TARGET_UPDATE_TS]
372
+ if cur_ts - last_update >= self.config.target_network_update_freq:
373
+ with self._timers[TARGET_NET_UPDATE_TIMER]:
374
+ to_update = self.env_runner.get_policies_to_train()
375
+ self.env_runner.foreach_policy_to_train(
376
+ lambda p, pid: pid in to_update and p.update_target()
377
+ )
378
+ self._counters[NUM_TARGET_UPDATES] += 1
379
+ self._counters[LAST_TARGET_UPDATE_TS] = cur_ts
380
+
381
+ # Update remote workers's weights after learning on local worker
382
+ # (only those policies that were actually trained).
383
+ if self.env_runner_group.num_remote_workers() > 0:
384
+ with self._timers[SYNCH_WORKER_WEIGHTS_TIMER]:
385
+ self.env_runner_group.sync_weights(policies=list(train_results.keys()))
386
+
387
+ # Return all collected metrics for the iteration.
388
+ return train_results
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/cql/cql_tf_policy.py ADDED
@@ -0,0 +1,426 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ TensorFlow policy class used for CQL.
3
+ """
4
+ from functools import partial
5
+ import numpy as np
6
+ import gymnasium as gym
7
+ import logging
8
+ import tree
9
+ from typing import Dict, List, Type, Union
10
+
11
+ import ray
12
+ import ray.experimental.tf_utils
13
+ from ray.rllib.algorithms.sac.sac_tf_policy import (
14
+ apply_gradients as sac_apply_gradients,
15
+ compute_and_clip_gradients as sac_compute_and_clip_gradients,
16
+ get_distribution_inputs_and_class,
17
+ _get_dist_class,
18
+ build_sac_model,
19
+ postprocess_trajectory,
20
+ setup_late_mixins,
21
+ stats,
22
+ validate_spaces,
23
+ ActorCriticOptimizerMixin as SACActorCriticOptimizerMixin,
24
+ ComputeTDErrorMixin,
25
+ )
26
+ from ray.rllib.models.modelv2 import ModelV2
27
+ from ray.rllib.models.tf.tf_action_dist import TFActionDistribution
28
+ from ray.rllib.policy.tf_mixins import TargetNetworkMixin
29
+ from ray.rllib.policy.tf_policy_template import build_tf_policy
30
+ from ray.rllib.policy.policy import Policy
31
+ from ray.rllib.policy.sample_batch import SampleBatch
32
+ from ray.rllib.utils.exploration.random import Random
33
+ from ray.rllib.utils.framework import get_variable, try_import_tf, try_import_tfp
34
+ from ray.rllib.utils.typing import (
35
+ LocalOptimizer,
36
+ ModelGradients,
37
+ TensorType,
38
+ AlgorithmConfigDict,
39
+ )
40
+
41
+ tf1, tf, tfv = try_import_tf()
42
+ tfp = try_import_tfp()
43
+
44
+ logger = logging.getLogger(__name__)
45
+
46
+ MEAN_MIN = -9.0
47
+ MEAN_MAX = 9.0
48
+
49
+
50
+ def _repeat_tensor(t: TensorType, n: int):
51
+ # Insert new axis at position 1 into tensor t
52
+ t_rep = tf.expand_dims(t, 1)
53
+ # Repeat tensor t_rep along new axis n times
54
+ multiples = tf.concat([[1, n], tf.tile([1], tf.expand_dims(tf.rank(t) - 1, 0))], 0)
55
+ t_rep = tf.tile(t_rep, multiples)
56
+ # Merge new axis into batch axis
57
+ t_rep = tf.reshape(t_rep, tf.concat([[-1], tf.shape(t)[1:]], 0))
58
+ return t_rep
59
+
60
+
61
+ # Returns policy tiled actions and log probabilities for CQL Loss
62
+ def policy_actions_repeat(model, action_dist, obs, num_repeat=1):
63
+ batch_size = tf.shape(tree.flatten(obs)[0])[0]
64
+ obs_temp = tree.map_structure(lambda t: _repeat_tensor(t, num_repeat), obs)
65
+ logits, _ = model.get_action_model_outputs(obs_temp)
66
+ policy_dist = action_dist(logits, model)
67
+ actions, logp_ = policy_dist.sample_logp()
68
+ logp = tf.expand_dims(logp_, -1)
69
+ return actions, tf.reshape(logp, [batch_size, num_repeat, 1])
70
+
71
+
72
+ def q_values_repeat(model, obs, actions, twin=False):
73
+ action_shape = tf.shape(actions)[0]
74
+ obs_shape = tf.shape(tree.flatten(obs)[0])[0]
75
+ num_repeat = action_shape // obs_shape
76
+ obs_temp = tree.map_structure(lambda t: _repeat_tensor(t, num_repeat), obs)
77
+ if not twin:
78
+ preds_, _ = model.get_q_values(obs_temp, actions)
79
+ else:
80
+ preds_, _ = model.get_twin_q_values(obs_temp, actions)
81
+ preds = tf.reshape(preds_, [obs_shape, num_repeat, 1])
82
+ return preds
83
+
84
+
85
+ def cql_loss(
86
+ policy: Policy,
87
+ model: ModelV2,
88
+ dist_class: Type[TFActionDistribution],
89
+ train_batch: SampleBatch,
90
+ ) -> Union[TensorType, List[TensorType]]:
91
+ logger.info(f"Current iteration = {policy.cur_iter}")
92
+ policy.cur_iter += 1
93
+
94
+ # For best performance, turn deterministic off
95
+ deterministic = policy.config["_deterministic_loss"]
96
+ assert not deterministic
97
+ twin_q = policy.config["twin_q"]
98
+ discount = policy.config["gamma"]
99
+
100
+ # CQL Parameters
101
+ bc_iters = policy.config["bc_iters"]
102
+ cql_temp = policy.config["temperature"]
103
+ num_actions = policy.config["num_actions"]
104
+ min_q_weight = policy.config["min_q_weight"]
105
+ use_lagrange = policy.config["lagrangian"]
106
+ target_action_gap = policy.config["lagrangian_thresh"]
107
+
108
+ obs = train_batch[SampleBatch.CUR_OBS]
109
+ actions = tf.cast(train_batch[SampleBatch.ACTIONS], tf.float32)
110
+ rewards = tf.cast(train_batch[SampleBatch.REWARDS], tf.float32)
111
+ next_obs = train_batch[SampleBatch.NEXT_OBS]
112
+ terminals = train_batch[SampleBatch.TERMINATEDS]
113
+
114
+ model_out_t, _ = model(SampleBatch(obs=obs, _is_training=True), [], None)
115
+
116
+ model_out_tp1, _ = model(SampleBatch(obs=next_obs, _is_training=True), [], None)
117
+
118
+ target_model_out_tp1, _ = policy.target_model(
119
+ SampleBatch(obs=next_obs, _is_training=True), [], None
120
+ )
121
+
122
+ action_dist_class = _get_dist_class(policy, policy.config, policy.action_space)
123
+ action_dist_inputs_t, _ = model.get_action_model_outputs(model_out_t)
124
+ action_dist_t = action_dist_class(action_dist_inputs_t, model)
125
+ policy_t, log_pis_t = action_dist_t.sample_logp()
126
+ log_pis_t = tf.expand_dims(log_pis_t, -1)
127
+
128
+ # Unlike original SAC, Alpha and Actor Loss are computed first.
129
+ # Alpha Loss
130
+ alpha_loss = -tf.reduce_mean(
131
+ model.log_alpha * tf.stop_gradient(log_pis_t + model.target_entropy)
132
+ )
133
+
134
+ # Policy Loss (Either Behavior Clone Loss or SAC Loss)
135
+ alpha = tf.math.exp(model.log_alpha)
136
+ if policy.cur_iter >= bc_iters:
137
+ min_q, _ = model.get_q_values(model_out_t, policy_t)
138
+ if twin_q:
139
+ twin_q_, _ = model.get_twin_q_values(model_out_t, policy_t)
140
+ min_q = tf.math.minimum(min_q, twin_q_)
141
+ actor_loss = tf.reduce_mean(tf.stop_gradient(alpha) * log_pis_t - min_q)
142
+ else:
143
+ bc_logp = action_dist_t.logp(actions)
144
+ actor_loss = tf.reduce_mean(tf.stop_gradient(alpha) * log_pis_t - bc_logp)
145
+ # actor_loss = -tf.reduce_mean(bc_logp)
146
+
147
+ # Critic Loss (Standard SAC Critic L2 Loss + CQL Entropy Loss)
148
+ # SAC Loss:
149
+ # Q-values for the batched actions.
150
+ action_dist_inputs_tp1, _ = model.get_action_model_outputs(model_out_tp1)
151
+ action_dist_tp1 = action_dist_class(action_dist_inputs_tp1, model)
152
+ policy_tp1, _ = action_dist_tp1.sample_logp()
153
+
154
+ q_t, _ = model.get_q_values(model_out_t, actions)
155
+ q_t_selected = tf.squeeze(q_t, axis=-1)
156
+ if twin_q:
157
+ twin_q_t, _ = model.get_twin_q_values(model_out_t, actions)
158
+ twin_q_t_selected = tf.squeeze(twin_q_t, axis=-1)
159
+
160
+ # Target q network evaluation.
161
+ q_tp1, _ = policy.target_model.get_q_values(target_model_out_tp1, policy_tp1)
162
+ if twin_q:
163
+ twin_q_tp1, _ = policy.target_model.get_twin_q_values(
164
+ target_model_out_tp1, policy_tp1
165
+ )
166
+ # Take min over both twin-NNs.
167
+ q_tp1 = tf.math.minimum(q_tp1, twin_q_tp1)
168
+
169
+ q_tp1_best = tf.squeeze(input=q_tp1, axis=-1)
170
+ q_tp1_best_masked = (1.0 - tf.cast(terminals, tf.float32)) * q_tp1_best
171
+
172
+ # compute RHS of bellman equation
173
+ q_t_target = tf.stop_gradient(
174
+ rewards + (discount ** policy.config["n_step"]) * q_tp1_best_masked
175
+ )
176
+
177
+ # Compute the TD-error (potentially clipped), for priority replay buffer
178
+ base_td_error = tf.math.abs(q_t_selected - q_t_target)
179
+ if twin_q:
180
+ twin_td_error = tf.math.abs(twin_q_t_selected - q_t_target)
181
+ td_error = 0.5 * (base_td_error + twin_td_error)
182
+ else:
183
+ td_error = base_td_error
184
+
185
+ critic_loss_1 = tf.keras.losses.MSE(q_t_selected, q_t_target)
186
+ if twin_q:
187
+ critic_loss_2 = tf.keras.losses.MSE(twin_q_t_selected, q_t_target)
188
+
189
+ # CQL Loss (We are using Entropy version of CQL (the best version))
190
+ rand_actions, _ = policy._random_action_generator.get_exploration_action(
191
+ action_distribution=action_dist_class(
192
+ tf.tile(action_dist_tp1.inputs, (num_actions, 1)), model
193
+ ),
194
+ timestep=0,
195
+ explore=True,
196
+ )
197
+ curr_actions, curr_logp = policy_actions_repeat(
198
+ model, action_dist_class, model_out_t, num_actions
199
+ )
200
+ next_actions, next_logp = policy_actions_repeat(
201
+ model, action_dist_class, model_out_tp1, num_actions
202
+ )
203
+
204
+ q1_rand = q_values_repeat(model, model_out_t, rand_actions)
205
+ q1_curr_actions = q_values_repeat(model, model_out_t, curr_actions)
206
+ q1_next_actions = q_values_repeat(model, model_out_t, next_actions)
207
+
208
+ if twin_q:
209
+ q2_rand = q_values_repeat(model, model_out_t, rand_actions, twin=True)
210
+ q2_curr_actions = q_values_repeat(model, model_out_t, curr_actions, twin=True)
211
+ q2_next_actions = q_values_repeat(model, model_out_t, next_actions, twin=True)
212
+
213
+ random_density = np.log(0.5 ** int(curr_actions.shape[-1]))
214
+ cat_q1 = tf.concat(
215
+ [
216
+ q1_rand - random_density,
217
+ q1_next_actions - tf.stop_gradient(next_logp),
218
+ q1_curr_actions - tf.stop_gradient(curr_logp),
219
+ ],
220
+ 1,
221
+ )
222
+ if twin_q:
223
+ cat_q2 = tf.concat(
224
+ [
225
+ q2_rand - random_density,
226
+ q2_next_actions - tf.stop_gradient(next_logp),
227
+ q2_curr_actions - tf.stop_gradient(curr_logp),
228
+ ],
229
+ 1,
230
+ )
231
+
232
+ min_qf1_loss_ = (
233
+ tf.reduce_mean(tf.reduce_logsumexp(cat_q1 / cql_temp, axis=1))
234
+ * min_q_weight
235
+ * cql_temp
236
+ )
237
+ min_qf1_loss = min_qf1_loss_ - (tf.reduce_mean(q_t) * min_q_weight)
238
+ if twin_q:
239
+ min_qf2_loss_ = (
240
+ tf.reduce_mean(tf.reduce_logsumexp(cat_q2 / cql_temp, axis=1))
241
+ * min_q_weight
242
+ * cql_temp
243
+ )
244
+ min_qf2_loss = min_qf2_loss_ - (tf.reduce_mean(twin_q_t) * min_q_weight)
245
+
246
+ if use_lagrange:
247
+ alpha_prime = tf.clip_by_value(model.log_alpha_prime.exp(), 0.0, 1000000.0)[0]
248
+ min_qf1_loss = alpha_prime * (min_qf1_loss - target_action_gap)
249
+ if twin_q:
250
+ min_qf2_loss = alpha_prime * (min_qf2_loss - target_action_gap)
251
+ alpha_prime_loss = 0.5 * (-min_qf1_loss - min_qf2_loss)
252
+ else:
253
+ alpha_prime_loss = -min_qf1_loss
254
+
255
+ cql_loss = [min_qf1_loss]
256
+ if twin_q:
257
+ cql_loss.append(min_qf2_loss)
258
+
259
+ critic_loss = [critic_loss_1 + min_qf1_loss]
260
+ if twin_q:
261
+ critic_loss.append(critic_loss_2 + min_qf2_loss)
262
+
263
+ # Save for stats function.
264
+ policy.q_t = q_t_selected
265
+ policy.policy_t = policy_t
266
+ policy.log_pis_t = log_pis_t
267
+ policy.td_error = td_error
268
+ policy.actor_loss = actor_loss
269
+ policy.critic_loss = critic_loss
270
+ policy.alpha_loss = alpha_loss
271
+ policy.log_alpha_value = model.log_alpha
272
+ policy.alpha_value = alpha
273
+ policy.target_entropy = model.target_entropy
274
+ # CQL Stats
275
+ policy.cql_loss = cql_loss
276
+ if use_lagrange:
277
+ policy.log_alpha_prime_value = model.log_alpha_prime[0]
278
+ policy.alpha_prime_value = alpha_prime
279
+ policy.alpha_prime_loss = alpha_prime_loss
280
+
281
+ # Return all loss terms corresponding to our optimizers.
282
+ if use_lagrange:
283
+ return actor_loss + tf.math.add_n(critic_loss) + alpha_loss + alpha_prime_loss
284
+ return actor_loss + tf.math.add_n(critic_loss) + alpha_loss
285
+
286
+
287
def cql_stats(policy: Policy, train_batch: SampleBatch) -> Dict[str, TensorType]:
    """Return the SAC training stats, extended by CQL-specific terms.

    Args:
        policy: The Policy to collect stats for.
        train_batch: The SampleBatch the stats refer to.

    Returns:
        A dict mapping stat names to TF tensors/values.
    """
    # Start from the standard SAC stats dict and extend it in place.
    metrics = stats(policy, train_batch)
    metrics["cql_loss"] = tf.reduce_mean(tf.stack(policy.cql_loss))
    # Dual-variable stats only exist when the Lagrangian CQL variant is on.
    if policy.config["lagrangian"]:
        metrics["log_alpha_prime_value"] = policy.log_alpha_prime_value
        metrics["alpha_prime_value"] = policy.alpha_prime_value
        metrics["alpha_prime_loss"] = policy.alpha_prime_loss
    return metrics
295
+
296
+
297
class ActorCriticOptimizerMixin(SACActorCriticOptimizerMixin):
    """SAC's optimizer mixin, extended with an alpha-prime optimizer.

    For the Lagrangian CQL variant, an additional Adam optimizer is created
    for the dual variable `log_alpha_prime`, using the critic learning rate.
    """

    def __init__(self, config):
        super().__init__(config)
        if not config["lagrangian"]:
            return
        lr = config["optimization"]["critic_learning_rate"]
        if config["framework"] == "tf2":
            # Eager mode.
            self._alpha_prime_optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
        else:
            # Static graph mode.
            self._alpha_prime_optimizer = tf1.train.AdamOptimizer(learning_rate=lr)
311
+
312
+
313
def setup_early_mixins(
    policy: Policy,
    obs_space: gym.spaces.Space,
    action_space: gym.spaces.Space,
    config: AlgorithmConfigDict,
) -> None:
    """Call mixin classes' constructors before Policy's initialization.

    Adds the necessary optimizers to the given Policy.

    Args:
        policy: The Policy object.
        obs_space (gym.spaces.Space): The Policy's observation space.
        action_space (gym.spaces.Space): The Policy's action space.
        config: The Policy's config.
    """
    # Iteration counter; the loss uses it to switch from behavior cloning to
    # the SAC actor loss after `bc_iters` iterations.
    policy.cur_iter = 0
    ActorCriticOptimizerMixin.__init__(policy, config)
    if config["lagrangian"]:
        # Trainable dual variable log(alpha_prime) for the Lagrangian variant.
        policy.model.log_alpha_prime = get_variable(
            0.0, framework="tf", trainable=True, tf_name="log_alpha_prime"
        )
        policy.alpha_prime_optim = tf.keras.optimizers.Adam(
            learning_rate=config["optimization"]["critic_learning_rate"],
        )
    # Generic random action generator for calculating CQL-loss.
    policy._random_action_generator = Random(
        action_space,
        model=None,
        framework="tf2",
        policy_config=config,
        num_workers=0,
        worker_index=0,
    )
347
+
348
+
349
def compute_gradients_fn(
    policy: Policy, optimizer: LocalOptimizer, loss: TensorType
) -> ModelGradients:
    """Compute and clip all gradients, incl. the alpha-prime ones.

    Delegates the actor/critic/alpha terms to SAC's gradient helper and, for
    the Lagrangian CQL variant, appends the gradients of `alpha_prime_loss`
    w.r.t. `log_alpha_prime`.

    Args:
        policy: The Policy being optimized.
        optimizer: The local optimizer (or eager OptimizerWrapper).
        loss: The total loss tensor returned by the loss fn.

    Returns:
        The combined list of (gradient, variable) pairs.
    """
    grads_and_vars = sac_compute_and_clip_gradients(policy, optimizer, loss)

    if policy.config["lagrangian"]:
        # Eager: Use GradientTape (which is a property of the `optimizer`
        # object (an OptimizerWrapper): see rllib/policy/eager_tf_policy.py).
        if policy.config["framework"] == "tf2":
            tape = optimizer.tape
            log_alpha_prime = [policy.model.log_alpha_prime]
            alpha_prime_grads_and_vars = list(
                zip(
                    tape.gradient(policy.alpha_prime_loss, log_alpha_prime),
                    log_alpha_prime,
                )
            )
        # Tf1.x: Use optimizer.compute_gradients()
        else:
            alpha_prime_grads_and_vars = (
                policy._alpha_prime_optimizer.compute_gradients(
                    policy.alpha_prime_loss, var_list=[policy.model.log_alpha_prime]
                )
            )

        # Clip if necessary.
        if policy.config["grad_clip"]:
            clip_func = partial(tf.clip_by_norm, clip_norm=policy.config["grad_clip"])
        else:
            clip_func = tf.identity

        # Save grads and vars for later use in `build_apply_op`.
        # Drop None-gradients (variables not part of this loss).
        policy._alpha_prime_grads_and_vars = [
            (clip_func(g), v) for (g, v) in alpha_prime_grads_and_vars if g is not None
        ]

        grads_and_vars += policy._alpha_prime_grads_and_vars
    return grads_and_vars
387
+
388
+
389
def apply_gradients_fn(policy, optimizer, grads_and_vars):
    """Apply SAC's gradients plus - if Lagrangian - the alpha-prime gradients.

    Returns None in eager mode (application is immediate) or a grouped TF op
    in static-graph mode.
    """
    sac_results = sac_apply_gradients(policy, optimizer, grads_and_vars)

    if policy.config["lagrangian"]:
        # Eager mode -> Just apply and return None.
        if policy.config["framework"] == "tf2":
            policy._alpha_prime_optimizer.apply_gradients(
                policy._alpha_prime_grads_and_vars
            )
            return
        # Tf static graph -> Return grouped op.
        else:
            alpha_prime_apply_op = policy._alpha_prime_optimizer.apply_gradients(
                policy._alpha_prime_grads_and_vars,
                global_step=tf1.train.get_or_create_global_step(),
            )
            return tf.group([sac_results, alpha_prime_apply_op])
    return sac_results
407
+
408
+
409
# Build a child class of `TFPolicy`, given the custom functions defined
# above. Model construction, action distribution and most setup are shared
# with SAC; CQL only swaps in its own loss, stats, and gradient handling.
CQLTFPolicy = build_tf_policy(
    name="CQLTFPolicy",
    loss_fn=cql_loss,
    get_default_config=lambda: ray.rllib.algorithms.cql.cql.CQLConfig(),
    validate_spaces=validate_spaces,
    stats_fn=cql_stats,
    postprocess_fn=postprocess_trajectory,
    before_init=setup_early_mixins,
    after_init=setup_late_mixins,
    make_model=build_sac_model,
    extra_learn_fetches_fn=lambda policy: {"td_error": policy.td_error},
    mixins=[ActorCriticOptimizerMixin, TargetNetworkMixin, ComputeTDErrorMixin],
    action_distribution_fn=get_distribution_inputs_and_class,
    compute_gradients_fn=compute_gradients_fn,
    apply_gradients_fn=apply_gradients_fn,
)
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/cql/cql_torch_policy.py ADDED
@@ -0,0 +1,406 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PyTorch policy class used for CQL.
3
+ """
4
+ import numpy as np
5
+ import gymnasium as gym
6
+ import logging
7
+ import tree
8
+ from typing import Dict, List, Tuple, Type, Union
9
+
10
+ import ray
11
+ import ray.experimental.tf_utils
12
+ from ray.rllib.algorithms.sac.sac_tf_policy import (
13
+ postprocess_trajectory,
14
+ validate_spaces,
15
+ )
16
+ from ray.rllib.algorithms.sac.sac_torch_policy import (
17
+ _get_dist_class,
18
+ stats,
19
+ build_sac_model_and_action_dist,
20
+ optimizer_fn,
21
+ ComputeTDErrorMixin,
22
+ setup_late_mixins,
23
+ action_distribution_fn,
24
+ )
25
+ from ray.rllib.models.torch.torch_action_dist import TorchDistributionWrapper
26
+ from ray.rllib.models.modelv2 import ModelV2
27
+ from ray.rllib.policy.policy_template import build_policy_class
28
+ from ray.rllib.policy.policy import Policy
29
+ from ray.rllib.policy.torch_mixins import TargetNetworkMixin
30
+ from ray.rllib.policy.sample_batch import SampleBatch
31
+ from ray.rllib.utils.framework import try_import_torch
32
+ from ray.rllib.utils.metrics.learner_info import LEARNER_STATS_KEY
33
+ from ray.rllib.utils.typing import LocalOptimizer, TensorType, AlgorithmConfigDict
34
+ from ray.rllib.utils.torch_utils import (
35
+ apply_grad_clipping,
36
+ convert_to_torch_tensor,
37
+ concat_multi_gpu_td_errors,
38
+ )
39
+
40
+ torch, nn = try_import_torch()
41
+ F = nn.functional
42
+
43
+ logger = logging.getLogger(__name__)
44
+
45
+ MEAN_MIN = -9.0
46
+ MEAN_MAX = 9.0
47
+
48
+
49
+ def _repeat_tensor(t: TensorType, n: int):
50
+ # Insert new dimension at posotion 1 into tensor t
51
+ t_rep = t.unsqueeze(1)
52
+ # Repeat tensor t_rep along new dimension n times
53
+ t_rep = torch.repeat_interleave(t_rep, n, dim=1)
54
+ # Merge new dimension into batch dimension
55
+ t_rep = t_rep.view(-1, *t.shape[1:])
56
+ return t_rep
57
+
58
+
59
def policy_actions_repeat(model, action_dist, obs, num_repeat=1):
    """Sample actions and log-probs from the policy on a tiled obs batch.

    The observation (struct) is tiled `num_repeat` times along the batch
    axis; the sampled log-probs are returned with shape
    (batch, num_repeat, 1) for use in the CQL regularizer.
    """
    batch_size = tree.flatten(obs)[0].shape[0]
    tiled_obs = tree.map_structure(lambda x: _repeat_tensor(x, num_repeat), obs)
    dist_inputs, _ = model.get_action_model_outputs(tiled_obs)
    sampled_actions, sampled_logp = action_dist(dist_inputs, model).sample_logp()
    logp = sampled_logp.unsqueeze(-1).view(batch_size, num_repeat, 1)
    return sampled_actions, logp
68
+
69
+
70
def q_values_repeat(model, obs, actions, twin=False):
    """Evaluate (twin-)Q-values on repeated observations.

    `actions` must contain a whole multiple of the obs batch size; the
    observations are tiled to match and the predictions reshaped to
    (batch, num_repeat, 1).
    """
    obs_batch = tree.flatten(obs)[0].shape[0]
    num_repeat = int(actions.shape[0] / obs_batch)
    tiled_obs = tree.map_structure(lambda x: _repeat_tensor(x, num_repeat), obs)
    # Pick the plain or twin Q-head once, then call it.
    q_fn = model.get_twin_q_values if twin else model.get_q_values
    raw_preds, _ = q_fn(tiled_obs, actions)
    return raw_preds.view(obs_batch, num_repeat, 1)
81
+
82
+
83
def cql_loss(
    policy: Policy,
    model: ModelV2,
    dist_class: Type[TorchDistributionWrapper],
    train_batch: SampleBatch,
) -> Union[TensorType, List[TensorType]]:
    """Compute the CQL losses: SAC losses plus the conservative Q regularizer.

    Unlike a typical RLlib loss fn, this one also steps the individual
    optimizers itself (zero_grad/backward/step) whenever a full-size train
    batch comes in; the policy's `compute_gradients_fn`/`apply_gradients_fn`
    are therefore effectively no-ops w.r.t. gradient application.

    Args:
        policy: The Policy being optimized.
        model: The model (tower) to compute the loss for.
        dist_class: Unused here; the action dist class is re-derived from
            the policy config via `_get_dist_class`.
        train_batch: The SampleBatch of (offline) training data.

    Returns:
        A tuple of loss tensors: (actor, critic(s)..., alpha[, alpha_prime]).
    """
    logger.info(f"Current iteration = {policy.cur_iter}")
    policy.cur_iter += 1

    # Look up the target model (tower) using the model tower.
    target_model = policy.target_models[model]

    # For best performance, turn deterministic off
    deterministic = policy.config["_deterministic_loss"]
    assert not deterministic
    twin_q = policy.config["twin_q"]
    discount = policy.config["gamma"]
    # NOTE(review): only the first dim's bounds are used for the uniform
    # random actions below - assumes a Box space with identical bounds per
    # dim; confirm for non-uniform action spaces.
    action_low = model.action_space.low[0]
    action_high = model.action_space.high[0]

    # CQL Parameters
    bc_iters = policy.config["bc_iters"]
    cql_temp = policy.config["temperature"]
    num_actions = policy.config["num_actions"]
    min_q_weight = policy.config["min_q_weight"]
    use_lagrange = policy.config["lagrangian"]
    target_action_gap = policy.config["lagrangian_thresh"]

    obs = train_batch[SampleBatch.CUR_OBS]
    actions = train_batch[SampleBatch.ACTIONS]
    rewards = train_batch[SampleBatch.REWARDS].float()
    next_obs = train_batch[SampleBatch.NEXT_OBS]
    terminals = train_batch[SampleBatch.TERMINATEDS]

    model_out_t, _ = model(SampleBatch(obs=obs, _is_training=True), [], None)

    model_out_tp1, _ = model(SampleBatch(obs=next_obs, _is_training=True), [], None)

    target_model_out_tp1, _ = target_model(
        SampleBatch(obs=next_obs, _is_training=True), [], None
    )

    action_dist_class = _get_dist_class(policy, policy.config, policy.action_space)
    action_dist_inputs_t, _ = model.get_action_model_outputs(model_out_t)
    action_dist_t = action_dist_class(action_dist_inputs_t, model)
    policy_t, log_pis_t = action_dist_t.sample_logp()
    log_pis_t = torch.unsqueeze(log_pis_t, -1)

    # Unlike original SAC, Alpha and Actor Loss are computed first.
    # Alpha Loss
    alpha_loss = -(model.log_alpha * (log_pis_t + model.target_entropy).detach()).mean()

    batch_size = tree.flatten(obs)[0].shape[0]
    # Only step the optimizers on full-size batches - presumably to skip
    # RLlib's initial dummy-batch loss call; TODO confirm.
    if batch_size == policy.config["train_batch_size"]:
        policy.alpha_optim.zero_grad()
        alpha_loss.backward()
        policy.alpha_optim.step()

    # Policy Loss (Either Behavior Clone Loss or SAC Loss)
    alpha = torch.exp(model.log_alpha)
    if policy.cur_iter >= bc_iters:
        min_q, _ = model.get_q_values(model_out_t, policy_t)
        if twin_q:
            twin_q_, _ = model.get_twin_q_values(model_out_t, policy_t)
            min_q = torch.min(min_q, twin_q_)
        actor_loss = (alpha.detach() * log_pis_t - min_q).mean()
    else:
        # First `bc_iters` iterations: clone the behavior policy's actions.
        bc_logp = action_dist_t.logp(actions)
        actor_loss = (alpha.detach() * log_pis_t - bc_logp).mean()
        # actor_loss = -bc_logp.mean()

    if batch_size == policy.config["train_batch_size"]:
        policy.actor_optim.zero_grad()
        # Graph is reused below for the critic/CQL terms.
        actor_loss.backward(retain_graph=True)
        policy.actor_optim.step()

    # Critic Loss (Standard SAC Critic L2 Loss + CQL Entropy Loss)
    # SAC Loss:
    # Q-values for the batched actions.
    action_dist_inputs_tp1, _ = model.get_action_model_outputs(model_out_tp1)
    action_dist_tp1 = action_dist_class(action_dist_inputs_tp1, model)
    policy_tp1, _ = action_dist_tp1.sample_logp()

    q_t, _ = model.get_q_values(model_out_t, train_batch[SampleBatch.ACTIONS])
    q_t_selected = torch.squeeze(q_t, dim=-1)
    if twin_q:
        twin_q_t, _ = model.get_twin_q_values(
            model_out_t, train_batch[SampleBatch.ACTIONS]
        )
        twin_q_t_selected = torch.squeeze(twin_q_t, dim=-1)

    # Target q network evaluation.
    q_tp1, _ = target_model.get_q_values(target_model_out_tp1, policy_tp1)
    if twin_q:
        twin_q_tp1, _ = target_model.get_twin_q_values(target_model_out_tp1, policy_tp1)
        # Take min over both twin-NNs.
        q_tp1 = torch.min(q_tp1, twin_q_tp1)

    q_tp1_best = torch.squeeze(input=q_tp1, dim=-1)
    # Zero out bootstrapped values for terminal next-states.
    q_tp1_best_masked = (1.0 - terminals.float()) * q_tp1_best

    # compute RHS of bellman equation
    q_t_target = (
        rewards + (discount ** policy.config["n_step"]) * q_tp1_best_masked
    ).detach()

    # Compute the TD-error (potentially clipped), for priority replay buffer
    base_td_error = torch.abs(q_t_selected - q_t_target)
    if twin_q:
        twin_td_error = torch.abs(twin_q_t_selected - q_t_target)
        td_error = 0.5 * (base_td_error + twin_td_error)
    else:
        td_error = base_td_error

    critic_loss_1 = nn.functional.mse_loss(q_t_selected, q_t_target)
    if twin_q:
        critic_loss_2 = nn.functional.mse_loss(twin_q_t_selected, q_t_target)

    # CQL Loss (We are using Entropy version of CQL (the best version))
    # Uniform random actions over the action space ("mu" distribution).
    rand_actions = convert_to_torch_tensor(
        torch.FloatTensor(actions.shape[0] * num_actions, actions.shape[-1]).uniform_(
            action_low, action_high
        ),
        policy.device,
    )
    curr_actions, curr_logp = policy_actions_repeat(
        model, action_dist_class, model_out_t, num_actions
    )
    next_actions, next_logp = policy_actions_repeat(
        model, action_dist_class, model_out_tp1, num_actions
    )

    q1_rand = q_values_repeat(model, model_out_t, rand_actions)
    q1_curr_actions = q_values_repeat(model, model_out_t, curr_actions)
    q1_next_actions = q_values_repeat(model, model_out_t, next_actions)

    if twin_q:
        q2_rand = q_values_repeat(model, model_out_t, rand_actions, twin=True)
        q2_curr_actions = q_values_repeat(model, model_out_t, curr_actions, twin=True)
        q2_next_actions = q_values_repeat(model, model_out_t, next_actions, twin=True)

    # Log-density of the uniform random proposal (importance correction).
    random_density = np.log(0.5 ** curr_actions.shape[-1])
    cat_q1 = torch.cat(
        [
            q1_rand - random_density,
            q1_next_actions - next_logp.detach(),
            q1_curr_actions - curr_logp.detach(),
        ],
        1,
    )
    if twin_q:
        cat_q2 = torch.cat(
            [
                q2_rand - random_density,
                q2_next_actions - next_logp.detach(),
                q2_curr_actions - curr_logp.detach(),
            ],
            1,
        )

    # Conservative term: logsumexp over sampled Q-values minus data Q-values.
    min_qf1_loss_ = (
        torch.logsumexp(cat_q1 / cql_temp, dim=1).mean() * min_q_weight * cql_temp
    )
    min_qf1_loss = min_qf1_loss_ - (q_t.mean() * min_q_weight)
    if twin_q:
        min_qf2_loss_ = (
            torch.logsumexp(cat_q2 / cql_temp, dim=1).mean() * min_q_weight * cql_temp
        )
        min_qf2_loss = min_qf2_loss_ - (twin_q_t.mean() * min_q_weight)

    if use_lagrange:
        # Scale the conservative term by the (clamped) dual variable.
        alpha_prime = torch.clamp(model.log_alpha_prime.exp(), min=0.0, max=1000000.0)[
            0
        ]
        min_qf1_loss = alpha_prime * (min_qf1_loss - target_action_gap)
        if twin_q:
            min_qf2_loss = alpha_prime * (min_qf2_loss - target_action_gap)
            alpha_prime_loss = 0.5 * (-min_qf1_loss - min_qf2_loss)
        else:
            alpha_prime_loss = -min_qf1_loss

    cql_loss = [min_qf1_loss]
    if twin_q:
        cql_loss.append(min_qf2_loss)

    critic_loss = [critic_loss_1 + min_qf1_loss]
    if twin_q:
        critic_loss.append(critic_loss_2 + min_qf2_loss)

    if batch_size == policy.config["train_batch_size"]:
        policy.critic_optims[0].zero_grad()
        critic_loss[0].backward(retain_graph=True)
        policy.critic_optims[0].step()

        if twin_q:
            policy.critic_optims[1].zero_grad()
            critic_loss[1].backward(retain_graph=False)
            policy.critic_optims[1].step()

    # Store values for stats function in model (tower), such that for
    # multi-GPU, we do not override them during the parallel loss phase.
    # SAC stats.
    model.tower_stats["q_t"] = q_t_selected
    model.tower_stats["policy_t"] = policy_t
    model.tower_stats["log_pis_t"] = log_pis_t
    model.tower_stats["actor_loss"] = actor_loss
    model.tower_stats["critic_loss"] = critic_loss
    model.tower_stats["alpha_loss"] = alpha_loss
    model.tower_stats["log_alpha_value"] = model.log_alpha
    model.tower_stats["alpha_value"] = alpha
    model.tower_stats["target_entropy"] = model.target_entropy
    # CQL stats.
    model.tower_stats["cql_loss"] = cql_loss

    # TD-error tensor in final stats
    # will be concatenated and retrieved for each individual batch item.
    model.tower_stats["td_error"] = td_error

    if use_lagrange:
        model.tower_stats["log_alpha_prime_value"] = model.log_alpha_prime[0]
        model.tower_stats["alpha_prime_value"] = alpha_prime
        model.tower_stats["alpha_prime_loss"] = alpha_prime_loss

        if batch_size == policy.config["train_batch_size"]:
            policy.alpha_prime_optim.zero_grad()
            alpha_prime_loss.backward()
            policy.alpha_prime_optim.step()

    # Return all loss terms corresponding to our optimizers.
    return tuple(
        [actor_loss]
        + critic_loss
        + [alpha_loss]
        + ([alpha_prime_loss] if use_lagrange else [])
    )
318
+
319
+
320
def cql_stats(policy: Policy, train_batch: SampleBatch) -> Dict[str, TensorType]:
    """Return SAC's training stats extended by the CQL-specific terms.

    Args:
        policy: The Policy to collect tower stats from.
        train_batch: The SampleBatch the stats refer to.

    Returns:
        A dict mapping stat names to (mean-reduced) tensors.
    """
    # Get SAC loss stats.
    stats_dict = stats(policy, train_batch)

    # Add CQL loss stats to the dict.
    # NOTE(review): `torch.stack(*...)` unpacks the per-tower list; since each
    # tower's "cql_loss" is itself a list of tensors, this only works with a
    # single tower - confirm multi-GPU behavior.
    stats_dict["cql_loss"] = torch.mean(
        torch.stack(*policy.get_tower_stats("cql_loss"))
    )

    if policy.config["lagrangian"]:
        stats_dict["log_alpha_prime_value"] = torch.mean(
            torch.stack(policy.get_tower_stats("log_alpha_prime_value"))
        )
        stats_dict["alpha_prime_value"] = torch.mean(
            torch.stack(policy.get_tower_stats("alpha_prime_value"))
        )
        stats_dict["alpha_prime_loss"] = torch.mean(
            torch.stack(policy.get_tower_stats("alpha_prime_loss"))
        )
    return stats_dict
340
+
341
+
342
def cql_optimizer_fn(
    policy: Policy, config: AlgorithmConfigDict
) -> Tuple[LocalOptimizer]:
    """Create SAC's optimizers plus - if Lagrangian - an alpha-prime optimizer.

    Also initializes `policy.cur_iter`, which the loss uses to decide when to
    switch from behavior cloning to the SAC actor loss.

    Args:
        policy: The Policy to create optimizers for.
        config: The Policy's config.

    Returns:
        A tuple of torch optimizers, one per loss term returned by `cql_loss`.
    """
    policy.cur_iter = 0
    opt_list = optimizer_fn(policy, config)
    if config["lagrangian"]:
        # Trainable dual variable log(alpha_prime); registered on the model so
        # `cql_setup_late_mixins` can move it to the policy's device.
        log_alpha_prime = nn.Parameter(torch.zeros(1, requires_grad=True).float())
        policy.model.register_parameter("log_alpha_prime", log_alpha_prime)
        policy.alpha_prime_optim = torch.optim.Adam(
            params=[policy.model.log_alpha_prime],
            lr=config["optimization"]["critic_learning_rate"],
            eps=1e-7,  # to match tf.keras.optimizers.Adam's epsilon default
        )
        return tuple(
            [policy.actor_optim]
            + policy.critic_optims
            + [policy.alpha_optim]
            + [policy.alpha_prime_optim]
        )
    return opt_list
362
+
363
+
364
def cql_setup_late_mixins(
    policy: Policy,
    obs_space: gym.spaces.Space,
    action_space: gym.spaces.Space,
    config: AlgorithmConfigDict,
) -> None:
    """Run SAC's late mixin setup, then place `log_alpha_prime` on the device.

    Args:
        policy: The Policy object.
        obs_space: The Policy's observation space.
        action_space: The Policy's action space.
        config: The Policy's config.
    """
    setup_late_mixins(policy, obs_space, action_space, config)
    if config["lagrangian"]:
        # The dual variable was created on CPU in `cql_optimizer_fn`; move it
        # to the policy's compute device.
        policy.model.log_alpha_prime = policy.model.log_alpha_prime.to(policy.device)
373
+
374
+
375
def compute_gradients_fn(policy, postprocessed_batch):
    """Run the loss (which steps the optimizers itself) and return stats only.

    CQL's torch loss applies gradients internally via per-optimizer
    backward()/step() calls, so no gradients are returned here - only the
    learner stats dict.
    """
    batches = [policy._lazy_tensor_dict(postprocessed_batch)]
    model = policy.model
    policy._loss(policy, model, policy.dist_class, batches[0])
    stats = {LEARNER_STATS_KEY: policy._convert_to_numpy(cql_stats(policy, batches[0]))}
    # No gradients to hand back; optimization already happened in the loss.
    return [None, stats]
381
+
382
+
383
def apply_gradients_fn(policy, gradients):
    """No-op gradient application.

    Gradients are already applied inside the loss function via the
    per-optimizer backward()/step() calls, so there is nothing left to do.
    """
    return None
385
+
386
+
387
# Build a child class of `TorchPolicy`, given the custom functions defined
# above. Model and action distribution are shared with SAC; CQL swaps in its
# own loss, stats, optimizers, and (no-op) gradient handling.
CQLTorchPolicy = build_policy_class(
    name="CQLTorchPolicy",
    framework="torch",
    loss_fn=cql_loss,
    get_default_config=lambda: ray.rllib.algorithms.cql.cql.CQLConfig(),
    stats_fn=cql_stats,
    postprocess_fn=postprocess_trajectory,
    extra_grad_process_fn=apply_grad_clipping,
    optimizer_fn=cql_optimizer_fn,
    validate_spaces=validate_spaces,
    before_loss_init=cql_setup_late_mixins,
    make_model_and_action_dist=build_sac_model_and_action_dist,
    extra_learn_fetches_fn=concat_multi_gpu_td_errors,
    mixins=[TargetNetworkMixin, ComputeTDErrorMixin],
    action_distribution_fn=action_distribution_fn,
    compute_gradients_fn=compute_gradients_fn,
    apply_gradients_fn=apply_gradients_fn,
)
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/cql/torch/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/cql/torch/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (203 Bytes). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/cql/torch/__pycache__/cql_torch_learner.cpython-311.pyc ADDED
Binary file (9.88 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/cql/torch/__pycache__/default_cql_torch_rl_module.cpython-311.pyc ADDED
Binary file (8.66 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/cql/torch/cql_torch_learner.py ADDED
@@ -0,0 +1,275 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict
2
+
3
+ from ray.air.constants import TRAINING_ITERATION
4
+ from ray.rllib.algorithms.sac.sac_learner import (
5
+ LOGPS_KEY,
6
+ QF_LOSS_KEY,
7
+ QF_MEAN_KEY,
8
+ QF_MAX_KEY,
9
+ QF_MIN_KEY,
10
+ QF_PREDS,
11
+ QF_TWIN_LOSS_KEY,
12
+ QF_TWIN_PREDS,
13
+ TD_ERROR_MEAN_KEY,
14
+ )
15
+ from ray.rllib.algorithms.cql.cql import CQLConfig
16
+ from ray.rllib.algorithms.sac.torch.sac_torch_learner import SACTorchLearner
17
+ from ray.rllib.core.columns import Columns
18
+ from ray.rllib.core.learner.learner import (
19
+ POLICY_LOSS_KEY,
20
+ )
21
+ from ray.rllib.utils.annotations import override
22
+ from ray.rllib.utils.metrics import ALL_MODULES
23
+ from ray.rllib.utils.framework import try_import_torch
24
+ from ray.rllib.utils.typing import ModuleID, ParamDict, TensorType
25
+
26
+ torch, nn = try_import_torch()
27
+
28
+
29
+ class CQLTorchLearner(SACTorchLearner):
30
+ @override(SACTorchLearner)
31
+ def compute_loss_for_module(
32
+ self,
33
+ *,
34
+ module_id: ModuleID,
35
+ config: CQLConfig,
36
+ batch: Dict,
37
+ fwd_out: Dict[str, TensorType],
38
+ ) -> TensorType:
39
+
40
+ # TODO (simon, sven): Add upstream information pieces into this timesteps
41
+ # call arg to Learner.update_...().
42
+ self.metrics.log_value(
43
+ (ALL_MODULES, TRAINING_ITERATION),
44
+ 1,
45
+ reduce="sum",
46
+ )
47
+ # Get the train action distribution for the current policy and current state.
48
+ # This is needed for the policy (actor) loss and the `alpha`` loss.
49
+ action_dist_class = self.module[module_id].get_train_action_dist_cls()
50
+ action_dist_curr = action_dist_class.from_logits(
51
+ fwd_out[Columns.ACTION_DIST_INPUTS]
52
+ )
53
+
54
+ # Optimize also the hyperparameter `alpha` by using the current policy
55
+ # evaluated at the current state (from offline data). Note, in contrast
56
+ # to the original SAC loss, here the `alpha` and actor losses are
57
+ # calculated first.
58
+ # TODO (simon): Check, why log(alpha) is used, prob. just better
59
+ # to optimize and monotonic function. Original equation uses alpha.
60
+ alpha_loss = -torch.mean(
61
+ self.curr_log_alpha[module_id]
62
+ * (fwd_out["logp_resampled"].detach() + self.target_entropy[module_id])
63
+ )
64
+
65
+ # Get the current alpha.
66
+ alpha = torch.exp(self.curr_log_alpha[module_id])
67
+ # Start training with behavior cloning and turn to the classic Soft-Actor Critic
68
+ # after `bc_iters` of training iterations.
69
+ if (
70
+ self.metrics.peek((ALL_MODULES, TRAINING_ITERATION), default=0)
71
+ >= config.bc_iters
72
+ ):
73
+ actor_loss = torch.mean(
74
+ alpha.detach() * fwd_out["logp_resampled"] - fwd_out["q_curr"]
75
+ )
76
+ else:
77
+ # Use log-probabilities of the current action distribution to clone
78
+ # the behavior policy (selected actions in data) in the first `bc_iters`
79
+ # training iterations.
80
+ bc_logps_curr = action_dist_curr.logp(batch[Columns.ACTIONS])
81
+ actor_loss = torch.mean(
82
+ alpha.detach() * fwd_out["logp_resampled"] - bc_logps_curr
83
+ )
84
+
85
+ # The critic loss is composed of the standard SAC Critic L2 loss and the
86
+ # CQL entropy loss.
87
+
88
+ # Get the Q-values for the actually selected actions in the offline data.
89
+ # In the critic loss we use these as predictions.
90
+ q_selected = fwd_out[QF_PREDS]
91
+ if config.twin_q:
92
+ q_twin_selected = fwd_out[QF_TWIN_PREDS]
93
+
94
+ if not config.deterministic_backup:
95
+ q_next = (
96
+ fwd_out["q_target_next"]
97
+ - alpha.detach() * fwd_out["logp_next_resampled"]
98
+ )
99
+ else:
100
+ q_next = fwd_out["q_target_next"]
101
+
102
+ # Now mask all Q-values with terminating next states in the targets.
103
+ q_next_masked = (1.0 - batch[Columns.TERMINATEDS].float()) * q_next
104
+
105
+ # Compute the right hand side of the Bellman equation. Detach this node
106
+ # from the computation graph as we do not want to backpropagate through
107
+ # the target network when optimizing the Q loss.
108
+ # TODO (simon, sven): Kumar et al. (2020) use here also a reward scaler.
109
+ q_selected_target = (
110
+ # TODO (simon): Add an `n_step` option to the `AddNextObsToBatch` connector.
111
+ batch[Columns.REWARDS]
112
+ # TODO (simon): Implement n_step.
113
+ + (config.gamma) * q_next_masked
114
+ ).detach()
115
+
116
+ # Calculate the TD error.
117
+ td_error = torch.abs(q_selected - q_selected_target)
118
+ # Calculate a TD-error for twin-Q values, if needed.
119
+ if config.twin_q:
120
+ td_error += torch.abs(q_twin_selected - q_selected_target)
121
+ # Rescale the TD error
122
+ td_error *= 0.5
123
+
124
+ # MSBE loss for the critic(s) (i.e. Q, see eqs. (7-8) Haarnoja et al. (2018)).
125
+ # Note, this needs a sample from the current policy given the next state.
126
+ # Note further, we could also use here the Huber loss instead of the MSE.
127
+ # TODO (simon): Add the huber loss as an alternative (SAC uses it).
128
+ sac_critic_loss = torch.nn.MSELoss(reduction="mean")(
129
+ q_selected,
130
+ q_selected_target,
131
+ )
132
+ if config.twin_q:
133
+ sac_critic_twin_loss = torch.nn.MSELoss(reduction="mean")(
134
+ q_twin_selected,
135
+ q_selected_target,
136
+ )
137
+
138
+ # Now calculate the CQL loss (we use the entropy version of the CQL algorithm).
139
+ # Note, the entropy version performs best in shown experiments.
140
+
141
+ # Compute the log-probabilities for the random actions (note, we generate random
142
+ # actions (from the mu distribution as named in Kumar et al. (2020))).
143
+ # Note, all actions, action log-probabilities and Q-values are already computed
144
+ # by the module's `_forward_train` method.
145
+ # TODO (simon): This is the density for a discrete uniform, however, actions
146
+ # come from a continuous one. So actually this density should use (1/(high-low))
147
+ # instead of (1/2).
148
+ random_density = torch.log(
149
+ torch.pow(
150
+ 0.5,
151
+ torch.tensor(
152
+ fwd_out["actions_curr_repeat"].shape[-1],
153
+ device=fwd_out["actions_curr_repeat"].device,
154
+ ),
155
+ )
156
+ )
157
+ # Merge all Q-values and subtract the log-probabilities (note, we use the
158
+ # entropy version of CQL).
159
+ q_repeat = torch.cat(
160
+ [
161
+ fwd_out["q_rand_repeat"] - random_density,
162
+ fwd_out["q_next_repeat"] - fwd_out["logps_next_repeat"].detach(),
163
+ fwd_out["q_curr_repeat"] - fwd_out["logps_curr_repeat"].detach(),
164
+ ],
165
+ dim=1,
166
+ )
167
+ cql_loss = (
168
+ torch.logsumexp(q_repeat / config.temperature, dim=1).mean()
169
+ * config.min_q_weight
170
+ * config.temperature
171
+ )
172
+ cql_loss -= q_selected.mean() * config.min_q_weight
173
+ # Add the CQL loss term to the SAC loss term.
174
+ critic_loss = sac_critic_loss + cql_loss
175
+
176
+ # If a twin Q-value function is implemented calculated its CQL loss.
177
+ if config.twin_q:
178
+ q_twin_repeat = torch.cat(
179
+ [
180
+ fwd_out["q_twin_rand_repeat"] - random_density,
181
+ fwd_out["q_twin_next_repeat"]
182
+ - fwd_out["logps_next_repeat"].detach(),
183
+ fwd_out["q_twin_curr_repeat"]
184
+ - fwd_out["logps_curr_repeat"].detach(),
185
+ ],
186
+ dim=1,
187
+ )
188
+ cql_twin_loss = (
189
+ torch.logsumexp(q_twin_repeat / config.temperature, dim=1).mean()
190
+ * config.min_q_weight
191
+ * config.temperature
192
+ )
193
+ cql_twin_loss -= q_twin_selected.mean() * config.min_q_weight
194
+ # Add the CQL loss term to the SAC loss term.
195
+ critic_twin_loss = sac_critic_twin_loss + cql_twin_loss
196
+
197
+ # TODO (simon): Check, if we need to implement here also a Lagrangian
198
+ # loss.
199
+
200
+ total_loss = actor_loss + critic_loss + alpha_loss
201
+
202
+ # Add the twin critic loss to the total loss, if needed.
203
+ if config.twin_q:
204
+ # Reweigh the critic loss terms in the total loss.
205
+ total_loss += 0.5 * critic_twin_loss - 0.5 * critic_loss
206
+
207
+ # Log important loss stats (reduce=mean (default), but with window=1
208
+ # in order to keep them history free).
209
+ self.metrics.log_dict(
210
+ {
211
+ POLICY_LOSS_KEY: actor_loss,
212
+ QF_LOSS_KEY: critic_loss,
213
+ # TODO (simon): Add these keys to SAC Learner.
214
+ "cql_loss": cql_loss,
215
+ "alpha_loss": alpha_loss,
216
+ "alpha_value": alpha,
217
+ "log_alpha_value": torch.log(alpha),
218
+ "target_entropy": self.target_entropy[module_id],
219
+ LOGPS_KEY: torch.mean(
220
+ fwd_out["logp_resampled"]
221
+ ), # torch.mean(logps_curr),
222
+ QF_MEAN_KEY: torch.mean(fwd_out["q_curr_repeat"]),
223
+ QF_MAX_KEY: torch.max(fwd_out["q_curr_repeat"]),
224
+ QF_MIN_KEY: torch.min(fwd_out["q_curr_repeat"]),
225
+ TD_ERROR_MEAN_KEY: torch.mean(td_error),
226
+ },
227
+ key=module_id,
228
+ window=1, # <- single items (should not be mean/ema-reduced over time).
229
+ )
230
+ # TODO (simon): Add loss keys for langrangian, if needed.
231
+ # TODO (simon): Add only here then the Langrange parameter optimization.
232
+ if config.twin_q:
233
+ self.metrics.log_dict(
234
+ {
235
+ QF_TWIN_LOSS_KEY: critic_twin_loss,
236
+ },
237
+ key=module_id,
238
+ window=1, # <- single items (should not be mean/ema-reduced over time).
239
+ )
240
+
241
+ # Return the total loss.
242
+ return total_loss
243
+
244
+ @override(SACTorchLearner)
245
+ def compute_gradients(
246
+ self, loss_per_module: Dict[ModuleID, TensorType], **kwargs
247
+ ) -> ParamDict:
248
+
249
+ grads = {}
250
+ for module_id in set(loss_per_module.keys()) - {ALL_MODULES}:
251
+ # Loop through optimizers registered for this module.
252
+ for optim_name, optim in self.get_optimizers_for_module(module_id):
253
+ # Zero the gradients. Note, we need to reset the gradients b/c
254
+ # each component for a module operates on the same graph.
255
+ optim.zero_grad(set_to_none=True)
256
+
257
+ # Compute the gradients for the component and module.
258
+ self.metrics.peek((module_id, optim_name + "_loss")).backward(
259
+ retain_graph=False if optim_name in ["policy", "alpha"] else True
260
+ )
261
+ # Store the gradients for the component and module.
262
+ # TODO (simon): Check another time the graph for overlapping
263
+ # gradients.
264
+ grads.update(
265
+ {
266
+ pid: grads[pid] + p.grad.clone()
267
+ if pid in grads
268
+ else p.grad.clone()
269
+ for pid, p in self.filter_param_dict_for_optimizer(
270
+ self._params, optim
271
+ ).items()
272
+ }
273
+ )
274
+
275
+ return grads
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/cql/torch/default_cql_torch_rl_module.py ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import tree
2
+ from typing import Any, Dict, Optional
3
+
4
+ from ray.rllib.algorithms.sac.sac_learner import (
5
+ QF_PREDS,
6
+ QF_TWIN_PREDS,
7
+ )
8
+ from ray.rllib.algorithms.sac.sac_catalog import SACCatalog
9
+ from ray.rllib.algorithms.sac.torch.default_sac_torch_rl_module import (
10
+ DefaultSACTorchRLModule,
11
+ )
12
+ from ray.rllib.core.columns import Columns
13
+ from ray.rllib.core.models.base import ENCODER_OUT
14
+ from ray.rllib.utils.annotations import override
15
+ from ray.rllib.utils.framework import try_import_torch
16
+ from ray.rllib.utils.typing import TensorType
17
+
18
+ torch, nn = try_import_torch()
19
+
20
+
21
+ class DefaultCQLTorchRLModule(DefaultSACTorchRLModule):
22
+ def __init__(self, *args, **kwargs):
23
+ catalog_class = kwargs.pop("catalog_class", None)
24
+ if catalog_class is None:
25
+ catalog_class = SACCatalog
26
+ super().__init__(*args, **kwargs, catalog_class=catalog_class)
27
+
28
+ @override(DefaultSACTorchRLModule)
29
+ def _forward_train(self, batch: Dict) -> Dict[str, Any]:
30
+ # Call the super method.
31
+ fwd_out = super()._forward_train(batch)
32
+
33
+ # Make sure we perform a "straight-through gradient" pass here,
34
+ # ignoring the gradients of the q-net, however, still recording
35
+ # the gradients of the policy net (which was used to rsample the actions used
36
+ # here). This is different from doing `.detach()` or `with torch.no_grads()`,
37
+ # as these two methds would fully block all gradient recordings, including
38
+ # the needed policy ones.
39
+ all_params = list(self.pi_encoder.parameters()) + list(self.pi.parameters())
40
+ # if self.twin_q:
41
+ # all_params += list(self.qf_twin.parameters()) + list(
42
+ # self.qf_twin_encoder.parameters()
43
+ # )
44
+
45
+ for param in all_params:
46
+ param.requires_grad = False
47
+
48
+ # Compute the repeated actions, action log-probabilites and Q-values for all
49
+ # observations.
50
+ # First for the random actions (from the mu-distribution as named by Kumar et
51
+ # al. (2020)).
52
+ low = torch.tensor(
53
+ self.action_space.low,
54
+ device=fwd_out[QF_PREDS].device,
55
+ )
56
+ high = torch.tensor(
57
+ self.action_space.high,
58
+ device=fwd_out[QF_PREDS].device,
59
+ )
60
+ num_samples = batch[Columns.ACTIONS].shape[0] * self.model_config["num_actions"]
61
+ actions_rand_repeat = low + (high - low) * torch.rand(
62
+ (num_samples, low.shape[0]), device=fwd_out[QF_PREDS].device
63
+ )
64
+
65
+ # First for the random actions (from the mu-distribution as named in Kumar
66
+ # et al. (2020)) using repeated observations.
67
+ rand_repeat_out = self._repeat_actions(batch[Columns.OBS], actions_rand_repeat)
68
+ (fwd_out["actions_rand_repeat"], fwd_out["q_rand_repeat"]) = (
69
+ rand_repeat_out[Columns.ACTIONS],
70
+ rand_repeat_out[QF_PREDS],
71
+ )
72
+ # Sample current and next actions (from the pi distribution as named in Kumar
73
+ # et al. (2020)) using repeated observations
74
+ # Second for the current observations and the current action distribution.
75
+ curr_repeat_out = self._repeat_actions(batch[Columns.OBS])
76
+ (
77
+ fwd_out["actions_curr_repeat"],
78
+ fwd_out["logps_curr_repeat"],
79
+ fwd_out["q_curr_repeat"],
80
+ ) = (
81
+ curr_repeat_out[Columns.ACTIONS],
82
+ curr_repeat_out[Columns.ACTION_LOGP],
83
+ curr_repeat_out[QF_PREDS],
84
+ )
85
+ # Then, for the next observations and the current action distribution.
86
+ next_repeat_out = self._repeat_actions(batch[Columns.NEXT_OBS])
87
+ (
88
+ fwd_out["actions_next_repeat"],
89
+ fwd_out["logps_next_repeat"],
90
+ fwd_out["q_next_repeat"],
91
+ ) = (
92
+ next_repeat_out[Columns.ACTIONS],
93
+ next_repeat_out[Columns.ACTION_LOGP],
94
+ next_repeat_out[QF_PREDS],
95
+ )
96
+ if self.twin_q:
97
+ # First for the random actions from the mu-distribution.
98
+ fwd_out["q_twin_rand_repeat"] = rand_repeat_out[QF_TWIN_PREDS]
99
+ # Second for the current observations and the current action distribution.
100
+ fwd_out["q_twin_curr_repeat"] = curr_repeat_out[QF_TWIN_PREDS]
101
+ # Then, for the next observations and the current action distribution.
102
+ fwd_out["q_twin_next_repeat"] = next_repeat_out[QF_TWIN_PREDS]
103
+ # Reset the gradient requirements for all Q-function parameters.
104
+ for param in all_params:
105
+ param.requires_grad = True
106
+
107
+ return fwd_out
108
+
109
+ def _repeat_tensor(self, tensor: TensorType, repeat: int) -> TensorType:
110
+ """Generates a repeated version of a tensor.
111
+
112
+ The repetition is done similar `np.repeat` and repeats each value
113
+ instead of the complete vector.
114
+
115
+ Args:
116
+ tensor: The tensor to be repeated.
117
+ repeat: How often each value in the tensor should be repeated.
118
+
119
+ Returns:
120
+ A tensor holding `repeat` repeated values of the input `tensor`
121
+ """
122
+ # Insert the new dimension at axis 1 into the tensor.
123
+ t_repeat = tensor.unsqueeze(1)
124
+ # Repeat the tensor along the new dimension.
125
+ t_repeat = torch.repeat_interleave(t_repeat, repeat, dim=1)
126
+ # Stack the repeated values into the batch dimension.
127
+ t_repeat = t_repeat.view(-1, *tensor.shape[1:])
128
+ # Return the repeated tensor.
129
+ return t_repeat
130
+
131
+ def _repeat_actions(
132
+ self, obs: TensorType, actions: Optional[TensorType] = None
133
+ ) -> Dict[str, TensorType]:
134
+ """Generated actions and Q-values for repeated observations.
135
+
136
+ The `self.model_config["num_actions"]` define a multiplier
137
+ used for generating `num_actions` as many actions as the batch size.
138
+ Observations are repeated and then a model forward pass is made.
139
+
140
+ Args:
141
+ obs: A batched observation tensor.
142
+ actions: An optional batched actions tensor.
143
+
144
+ Returns:
145
+ A dictionary holding the (sampled or passed-in actions), the log
146
+ probabilities (of sampled actions), the Q-values and if available
147
+ the twin-Q values.
148
+ """
149
+ output = {}
150
+ # Receive the batch size.
151
+ batch_size = obs.shape[0]
152
+ # Receive the number of action to sample.
153
+ num_actions = self.model_config["num_actions"]
154
+ # Repeat the observations `num_actions` times.
155
+ obs_repeat = tree.map_structure(
156
+ lambda t: self._repeat_tensor(t, num_actions), obs
157
+ )
158
+ # Generate a batch for the forward pass.
159
+ temp_batch = {Columns.OBS: obs_repeat}
160
+ if actions is None:
161
+ # TODO (simon): Run the forward pass in inference mode.
162
+ # Compute the action logits.
163
+ pi_encoder_outs = self.pi_encoder(temp_batch)
164
+ action_logits = self.pi(pi_encoder_outs[ENCODER_OUT])
165
+ # Generate the squashed Gaussian from the model's logits.
166
+ action_dist = self.get_train_action_dist_cls().from_logits(action_logits)
167
+ # Sample the actions. Note, we want to make a backward pass through
168
+ # these actions.
169
+ output[Columns.ACTIONS] = action_dist.rsample()
170
+ # Compute the action log-probabilities.
171
+ output[Columns.ACTION_LOGP] = action_dist.logp(
172
+ output[Columns.ACTIONS]
173
+ ).view(batch_size, num_actions, 1)
174
+ else:
175
+ output[Columns.ACTIONS] = actions
176
+
177
+ # Compute all Q-values.
178
+ temp_batch.update(
179
+ {
180
+ Columns.ACTIONS: output[Columns.ACTIONS],
181
+ }
182
+ )
183
+ output.update(
184
+ {
185
+ QF_PREDS: self._qf_forward_train_helper(
186
+ temp_batch,
187
+ self.qf_encoder,
188
+ self.qf,
189
+ ).view(batch_size, num_actions, 1)
190
+ }
191
+ )
192
+ # If we have a twin-Q network, compute its Q-values, too.
193
+ if self.twin_q:
194
+ output.update(
195
+ {
196
+ QF_TWIN_PREDS: self._qf_forward_train_helper(
197
+ temp_batch,
198
+ self.qf_twin_encoder,
199
+ self.qf_twin,
200
+ ).view(batch_size, num_actions, 1)
201
+ }
202
+ )
203
+ del temp_batch
204
+
205
+ # Return
206
+ return output
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/__init__.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ [1] Mastering Diverse Domains through World Models - 2023
3
+ D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap
4
+ https://arxiv.org/pdf/2301.04104v1.pdf
5
+
6
+ [2] Mastering Atari with Discrete World Models - 2021
7
+ D. Hafner, T. Lillicrap, M. Norouzi, J. Ba
8
+ https://arxiv.org/pdf/2010.02193.pdf
9
+ """
10
+ from ray.rllib.algorithms.dreamerv3.dreamerv3 import DreamerV3, DreamerV3Config
11
+
12
+ __all__ = [
13
+ "DreamerV3",
14
+ "DreamerV3Config",
15
+ ]
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/dreamerv3.py ADDED
@@ -0,0 +1,750 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ [1] Mastering Diverse Domains through World Models - 2023
3
+ D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap
4
+ https://arxiv.org/pdf/2301.04104v1.pdf
5
+
6
+ [2] Mastering Atari with Discrete World Models - 2021
7
+ D. Hafner, T. Lillicrap, M. Norouzi, J. Ba
8
+ https://arxiv.org/pdf/2010.02193.pdf
9
+ """
10
+
11
+ import gc
12
+ import logging
13
+ from typing import Any, Dict, Optional, Union
14
+
15
+ import gymnasium as gym
16
+
17
+ from ray.rllib.algorithms.algorithm import Algorithm
18
+ from ray.rllib.algorithms.algorithm_config import AlgorithmConfig, NotProvided
19
+ from ray.rllib.algorithms.dreamerv3.dreamerv3_catalog import DreamerV3Catalog
20
+ from ray.rllib.algorithms.dreamerv3.utils import do_symlog_obs
21
+ from ray.rllib.algorithms.dreamerv3.utils.env_runner import DreamerV3EnvRunner
22
+ from ray.rllib.algorithms.dreamerv3.utils.summaries import (
23
+ report_dreamed_eval_trajectory_vs_samples,
24
+ report_predicted_vs_sampled_obs,
25
+ report_sampling_and_replay_buffer,
26
+ )
27
+ from ray.rllib.core import DEFAULT_MODULE_ID
28
+ from ray.rllib.core.columns import Columns
29
+ from ray.rllib.core.rl_module.rl_module import RLModuleSpec
30
+ from ray.rllib.execution.rollout_ops import synchronous_parallel_sample
31
+ from ray.rllib.policy.sample_batch import SampleBatch
32
+ from ray.rllib.utils import deep_update
33
+ from ray.rllib.utils.annotations import override, PublicAPI
34
+ from ray.rllib.utils.framework import try_import_tf
35
+ from ray.rllib.utils.numpy import one_hot
36
+ from ray.rllib.utils.metrics import (
37
+ ENV_RUNNER_RESULTS,
38
+ GARBAGE_COLLECTION_TIMER,
39
+ LEARN_ON_BATCH_TIMER,
40
+ LEARNER_RESULTS,
41
+ NUM_ENV_STEPS_SAMPLED_LIFETIME,
42
+ NUM_ENV_STEPS_TRAINED_LIFETIME,
43
+ NUM_GRAD_UPDATES_LIFETIME,
44
+ NUM_SYNCH_WORKER_WEIGHTS,
45
+ SAMPLE_TIMER,
46
+ SYNCH_WORKER_WEIGHTS_TIMER,
47
+ TIMERS,
48
+ )
49
+ from ray.rllib.utils.replay_buffers.episode_replay_buffer import EpisodeReplayBuffer
50
+ from ray.rllib.utils.typing import LearningRateOrSchedule
51
+
52
+
53
+ logger = logging.getLogger(__name__)
54
+
55
+ _, tf, _ = try_import_tf()
56
+
57
+
58
+ class DreamerV3Config(AlgorithmConfig):
59
+ """Defines a configuration class from which a DreamerV3 can be built.
60
+
61
+ .. testcode::
62
+
63
+ from ray.rllib.algorithms.dreamerv3 import DreamerV3Config
64
+ config = (
65
+ DreamerV3Config()
66
+ .environment("CartPole-v1")
67
+ .training(
68
+ model_size="XS",
69
+ training_ratio=1,
70
+ # TODO
71
+ model={
72
+ "batch_size_B": 1,
73
+ "batch_length_T": 1,
74
+ "horizon_H": 1,
75
+ "gamma": 0.997,
76
+ "model_size": "XS",
77
+ },
78
+ )
79
+ )
80
+
81
+ config = config.learners(num_learners=0)
82
+ # Build a Algorithm object from the config and run 1 training iteration.
83
+ algo = config.build()
84
+ # algo.train()
85
+ del algo
86
+
87
+ .. testoutput::
88
+ :hide:
89
+
90
+ ...
91
+ """
92
+
93
    def __init__(self, algo_class=None):
        """Initializes a DreamerV3Config instance."""
        super().__init__(algo_class=algo_class or DreamerV3)

        # fmt: off
        # __sphinx_doc_begin__

        # DreamerV3 specific settings:
        self.model_size = "XS"
        self.training_ratio = 1024

        # Replay buffer; the paper ([1]) uses a capacity of 1M for all benchmarks.
        self.replay_buffer_config = {
            "type": "EpisodeReplayBuffer",
            "capacity": int(1e6),
        }
        # Learning rates for the three separate optimizers (world model, actor,
        # critic).
        self.world_model_lr = 1e-4
        self.actor_lr = 3e-5
        self.critic_lr = 3e-5
        # Train-batch dimensions: B rows of length T each.
        self.batch_size_B = 16
        self.batch_length_T = 64
        # Dream-rollout horizon for actor/critic training.
        self.horizon_H = 15
        self.gae_lambda = 0.95  # [1] eq. 7.
        self.entropy_scale = 3e-4  # [1] eq. 11.
        self.return_normalization_decay = 0.99  # [1] eq. 11 and 12.
        self.train_critic = True
        self.train_actor = True
        self.intrinsic_rewards_scale = 0.1
        # Per-optimizer gradient clipping (by global norm).
        self.world_model_grad_clip_by_global_norm = 1000.0
        self.critic_grad_clip_by_global_norm = 100.0
        self.actor_grad_clip_by_global_norm = 100.0
        # "auto" -> symlog observations unless they come from an image space.
        self.symlog_obs = "auto"
        self.use_float16 = False
        self.use_curiosity = False

        # Reporting.
        # DreamerV3 is super sample efficient and only needs very few episodes
        # (normally) to learn. Leaving this at its default value would gravely
        # underestimate the learning performance over the course of an experiment.
        self.metrics_num_episodes_for_smoothing = 1
        self.report_individual_batch_item_stats = False
        self.report_dream_data = False
        self.report_images_and_videos = False
        self.gc_frequency_train_steps = 100

        # Override some of AlgorithmConfig's default values with DreamerV3-specific
        # values.
        self.lr = None
        self.framework_str = "tf2"
        self.gamma = 0.997  # [1] eq. 7.
        # Do not use! Set `batch_size_B` and `batch_length_T` instead.
        self.train_batch_size = None
        self.env_runner_cls = DreamerV3EnvRunner
        self.num_env_runners = 0
        self.rollout_fragment_length = 1
        # Dreamer only runs on the new API stack.
        self.enable_rl_module_and_learner = True
        self.enable_env_runner_and_connector_v2 = True
        # TODO (sven): DreamerV3 still uses its own EnvRunner class. This env-runner
        #  does not use connectors. We therefore should not attempt to merge/broadcast
        #  the connector states between EnvRunners (if >0). Note that this is only
        #  relevant if num_env_runners > 0, which is normally not the case when using
        #  this algo.
        self.use_worker_filter_stats = False
        # __sphinx_doc_end__
        # fmt: on
158
+
159
+ @property
160
+ def batch_size_B_per_learner(self):
161
+ """Returns the batch_size_B per Learner worker.
162
+
163
+ Needed by some of the DreamerV3 loss math."""
164
+ return self.batch_size_B // (self.num_learners or 1)
165
+
166
    @override(AlgorithmConfig)
    def training(
        self,
        *,
        model_size: Optional[str] = NotProvided,
        training_ratio: Optional[float] = NotProvided,
        gc_frequency_train_steps: Optional[int] = NotProvided,
        batch_size_B: Optional[int] = NotProvided,
        batch_length_T: Optional[int] = NotProvided,
        horizon_H: Optional[int] = NotProvided,
        gae_lambda: Optional[float] = NotProvided,
        entropy_scale: Optional[float] = NotProvided,
        return_normalization_decay: Optional[float] = NotProvided,
        train_critic: Optional[bool] = NotProvided,
        train_actor: Optional[bool] = NotProvided,
        intrinsic_rewards_scale: Optional[float] = NotProvided,
        world_model_lr: Optional[LearningRateOrSchedule] = NotProvided,
        actor_lr: Optional[LearningRateOrSchedule] = NotProvided,
        critic_lr: Optional[LearningRateOrSchedule] = NotProvided,
        world_model_grad_clip_by_global_norm: Optional[float] = NotProvided,
        critic_grad_clip_by_global_norm: Optional[float] = NotProvided,
        actor_grad_clip_by_global_norm: Optional[float] = NotProvided,
        symlog_obs: Optional[Union[bool, str]] = NotProvided,
        use_float16: Optional[bool] = NotProvided,
        replay_buffer_config: Optional[dict] = NotProvided,
        use_curiosity: Optional[bool] = NotProvided,
        **kwargs,
    ) -> "DreamerV3Config":
        """Sets the training related configuration.

        Args:
            model_size: The main switch for adjusting the overall model size. See [1]
                (table B) for more information on the effects of this setting on the
                model architecture.
                Supported values are "XS", "S", "M", "L", "XL" (as per the paper), as
                well as, "nano", "micro", "mini", and "XXS" (for RLlib's
                implementation). See ray.rllib.algorithms.dreamerv3.utils.
                __init__.py for the details on what exactly each size does to the layer
                sizes, number of layers, etc..
            training_ratio: The ratio of total steps trained (sum of the sizes of all
                batches ever sampled from the replay buffer) over the total env steps
                taken (in the actual environment, not the dreamed one). For example,
                if the training_ratio is 1024 and the batch size is 1024, we would take
                1 env step for every training update: 1024 / 1. If the training ratio
                is 512 and the batch size is 1024, we would take 2 env steps and then
                perform a single training update (on a 1024 batch): 1024 / 2.
            gc_frequency_train_steps: The frequency (in training iterations) with which
                we perform a `gc.collect()` calls at the end of a `training_step`
                iteration. Doing this more often adds a (albeit very small) performance
                overhead, but prevents memory leaks from becoming harmful.
                TODO (sven): This might not be necessary anymore, but needs to be
                confirmed experimentally.
            batch_size_B: The batch size (B) interpreted as number of rows (each of
                length `batch_length_T`) to sample from the replay buffer in each
                iteration.
            batch_length_T: The batch length (T) interpreted as the length of each row
                sampled from the replay buffer in each iteration. Note that
                `batch_size_B` rows will be sampled in each iteration. Rows normally
                contain consecutive data (consecutive timesteps from the same episode),
                but there might be episode boundaries in a row as well.
            horizon_H: The horizon (in timesteps) used to create dreamed data from the
                world model, which in turn is used to train/update both actor- and
                critic networks.
            gae_lambda: The lambda parameter used for computing the GAE-style
                value targets for the actor- and critic losses.
            entropy_scale: The factor with which to multiply the entropy loss term
                inside the actor loss.
            return_normalization_decay: The decay value to use when computing the
                running EMA values for return normalization (used in the actor loss).
            train_critic: Whether to train the critic network. If False, `train_actor`
                must also be False (cannot train actor w/o training the critic).
            train_actor: Whether to train the actor network. If True, `train_critic`
                must also be True (cannot train actor w/o training the critic).
            intrinsic_rewards_scale: The factor to multiply intrinsic rewards with
                before adding them to the extrinsic (environment) rewards.
            world_model_lr: The learning rate or schedule for the world model optimizer.
            actor_lr: The learning rate or schedule for the actor optimizer.
            critic_lr: The learning rate or schedule for the critic optimizer.
            world_model_grad_clip_by_global_norm: World model grad clipping value
                (by global norm).
            critic_grad_clip_by_global_norm: Critic grad clipping value
                (by global norm).
            actor_grad_clip_by_global_norm: Actor grad clipping value (by global norm).
            symlog_obs: Whether to symlog observations or not. If set to "auto"
                (default), will check for the environment's observation space and then
                only symlog if not an image space.
            use_float16: Whether to train with mixed float16 precision. In this mode,
                model parameters are stored as float32, but all computations are
                performed in float16 space (except for losses and distribution params
                and outputs).
            replay_buffer_config: Replay buffer config.
                Only serves in DreamerV3 to set the capacity of the replay buffer.
                Note though that in the paper ([1]) a size of 1M is used for all
                benchmarks and there doesn't seem to be a good reason to change this
                parameter.
                Examples:
                {
                "type": "EpisodeReplayBuffer",
                "capacity": 100000,
                }
            use_curiosity: Not supported yet; passing anything other than the
                default raises a ValueError.

        Returns:
            This updated AlgorithmConfig object.

        Raises:
            ValueError: If `use_curiosity` is explicitly provided (feature is
                not fully supported/tested yet).
        """
        # Not fully supported/tested yet.
        if use_curiosity is not NotProvided:
            raise ValueError(
                "`DreamerV3Config.curiosity` is not fully supported and tested yet! "
                "It thus remains disabled for now."
            )

        # Pass kwargs onto super's `training()` method.
        super().training(**kwargs)

        # Only overwrite settings that were explicitly provided (NotProvided
        # sentinel keeps the existing value untouched).
        if model_size is not NotProvided:
            self.model_size = model_size
        if training_ratio is not NotProvided:
            self.training_ratio = training_ratio
        if gc_frequency_train_steps is not NotProvided:
            self.gc_frequency_train_steps = gc_frequency_train_steps
        if batch_size_B is not NotProvided:
            self.batch_size_B = batch_size_B
        if batch_length_T is not NotProvided:
            self.batch_length_T = batch_length_T
        if horizon_H is not NotProvided:
            self.horizon_H = horizon_H
        if gae_lambda is not NotProvided:
            self.gae_lambda = gae_lambda
        if entropy_scale is not NotProvided:
            self.entropy_scale = entropy_scale
        if return_normalization_decay is not NotProvided:
            self.return_normalization_decay = return_normalization_decay
        if train_critic is not NotProvided:
            self.train_critic = train_critic
        if train_actor is not NotProvided:
            self.train_actor = train_actor
        if intrinsic_rewards_scale is not NotProvided:
            self.intrinsic_rewards_scale = intrinsic_rewards_scale
        if world_model_lr is not NotProvided:
            self.world_model_lr = world_model_lr
        if actor_lr is not NotProvided:
            self.actor_lr = actor_lr
        if critic_lr is not NotProvided:
            self.critic_lr = critic_lr
        if world_model_grad_clip_by_global_norm is not NotProvided:
            self.world_model_grad_clip_by_global_norm = (
                world_model_grad_clip_by_global_norm
            )
        if critic_grad_clip_by_global_norm is not NotProvided:
            self.critic_grad_clip_by_global_norm = critic_grad_clip_by_global_norm
        if actor_grad_clip_by_global_norm is not NotProvided:
            self.actor_grad_clip_by_global_norm = actor_grad_clip_by_global_norm
        if symlog_obs is not NotProvided:
            self.symlog_obs = symlog_obs
        if use_float16 is not NotProvided:
            self.use_float16 = use_float16
        if replay_buffer_config is not NotProvided:
            # Override entire `replay_buffer_config` if `type` key changes.
            # Update, if `type` key remains the same or is not specified.
            new_replay_buffer_config = deep_update(
                {"replay_buffer_config": self.replay_buffer_config},
                {"replay_buffer_config": replay_buffer_config},
                False,
                ["replay_buffer_config"],
                ["replay_buffer_config"],
            )
            self.replay_buffer_config = new_replay_buffer_config["replay_buffer_config"]

        return self
335
+
336
    @override(AlgorithmConfig)
    def reporting(
        self,
        *,
        report_individual_batch_item_stats: Optional[bool] = NotProvided,
        report_dream_data: Optional[bool] = NotProvided,
        report_images_and_videos: Optional[bool] = NotProvided,
        **kwargs,
    ):
        """Sets the reporting related configuration.

        Args:
            report_individual_batch_item_stats: Whether to include loss and other stats
                per individual timestep inside the training batch in the result dict
                returned by `training_step()`. If True, besides the `CRITIC_L_total`,
                the individual critic loss values per batch row and time axis step
                in the train batch (CRITIC_L_total_B_T) will also be part of the
                results.
            report_dream_data: Whether to include the dreamed trajectory data in the
                result dict returned by `training_step()`. If True, however, will
                slice each reported item in the dream data down to the shape.
                (H, B, t=0, ...), where H is the horizon and B is the batch size. The
                original time axis will only be represented by the first timestep
                to not make this data too large to handle.
            report_images_and_videos: Whether to include any image/video data in the
                result dict returned by `training_step()`.
            **kwargs: Forwarded to `AlgorithmConfig.reporting()`.

        Returns:
            This updated AlgorithmConfig object.
        """
        super().reporting(**kwargs)

        # Only overwrite settings that were explicitly provided.
        if report_individual_batch_item_stats is not NotProvided:
            self.report_individual_batch_item_stats = report_individual_batch_item_stats
        if report_dream_data is not NotProvided:
            self.report_dream_data = report_dream_data
        if report_images_and_videos is not NotProvided:
            self.report_images_and_videos = report_images_and_videos

        return self
377
+
378
    @override(AlgorithmConfig)
    def validate(self) -> None:
        """Validates this config; raises via `self._value_error` on any violation."""
        # Call the super class' validation method first.
        super().validate()

        # Make sure users are not using DreamerV3 for multi-agent yet:
        if self.is_multi_agent:
            self._value_error("DreamerV3 does NOT support multi-agent setups yet!")

        # Make sure we are configured for the new API stack.
        if not self.enable_rl_module_and_learner:
            self._value_error(
                "DreamerV3 must be run with `config.api_stack("
                "enable_rl_module_and_learner=True)`!"
            )

        # If run on several Learners, the provided batch_size_B must be a multiple
        # of `num_learners`.
        if self.num_learners > 1 and (self.batch_size_B % self.num_learners != 0):
            self._value_error(
                f"Your `batch_size_B` ({self.batch_size_B}) must be a multiple of "
                f"`num_learners` ({self.num_learners}) in order for "
                "DreamerV3 to be able to split batches evenly across your Learner "
                "processes."
            )

        # Cannot train actor w/o critic.
        if self.train_actor and not self.train_critic:
            self._value_error(
                "Cannot train actor network (`train_actor=True`) w/o training critic! "
                "Make sure you either set `train_critic=True` or `train_actor=False`."
            )
        # Use DreamerV3 specific batch size settings.
        if self.train_batch_size is not None:
            self._value_error(
                "`train_batch_size` should NOT be set! Use `batch_size_B` and "
                "`batch_length_T` instead."
            )
        # Must be run with `EpisodeReplayBuffer` type.
        if self.replay_buffer_config.get("type") != "EpisodeReplayBuffer":
            self._value_error(
                "DreamerV3 must be run with the `EpisodeReplayBuffer` type! None "
                "other supported."
            )
422
+
423
+ @override(AlgorithmConfig)
424
+ def get_default_learner_class(self):
425
+ if self.framework_str == "tf2":
426
+ from ray.rllib.algorithms.dreamerv3.tf.dreamerv3_tf_learner import (
427
+ DreamerV3TfLearner,
428
+ )
429
+
430
+ return DreamerV3TfLearner
431
+ else:
432
+ raise ValueError(f"The framework {self.framework_str} is not supported.")
433
+
434
+ @override(AlgorithmConfig)
435
+ def get_default_rl_module_spec(self) -> RLModuleSpec:
436
+ if self.framework_str == "tf2":
437
+ from ray.rllib.algorithms.dreamerv3.tf.dreamerv3_tf_rl_module import (
438
+ DreamerV3TfRLModule,
439
+ )
440
+
441
+ return RLModuleSpec(
442
+ module_class=DreamerV3TfRLModule, catalog_class=DreamerV3Catalog
443
+ )
444
+ else:
445
+ raise ValueError(f"The framework {self.framework_str} is not supported.")
446
+
447
+ @property
448
+ def share_module_between_env_runner_and_learner(self) -> bool:
449
+ # If we only have one local Learner (num_learners=0) and only
450
+ # one local EnvRunner (num_env_runners=0), share the RLModule
451
+ # between these two to avoid having to sync weights, ever.
452
+ return self.num_learners == 0 and self.num_env_runners == 0
453
+
454
+ @property
455
+ @override(AlgorithmConfig)
456
+ def _model_config_auto_includes(self) -> Dict[str, Any]:
457
+ return super()._model_config_auto_includes | {
458
+ "gamma": self.gamma,
459
+ "horizon_H": self.horizon_H,
460
+ "model_size": self.model_size,
461
+ "symlog_obs": self.symlog_obs,
462
+ "use_float16": self.use_float16,
463
+ "batch_length_T": self.batch_length_T,
464
+ }
465
+
466
+
467
class DreamerV3(Algorithm):
    """Implementation of the model-based DreamerV3 RL algorithm described in [1]."""

    # TODO (sven): Deprecate/do-over the Algorithm.compute_single_action() API.
    @override(Algorithm)
    def compute_single_action(self, *args, **kwargs):
        """Not supported by DreamerV3; always raises NotImplementedError."""
        raise NotImplementedError(
            "DreamerV3 does not support the `compute_single_action()` API. Refer to the"
            " README here (https://github.com/ray-project/ray/tree/master/rllib/"
            "algorithms/dreamerv3) to find more information on how to run action "
            "inference with this algorithm."
        )

    @classmethod
    @override(Algorithm)
    def get_default_config(cls) -> AlgorithmConfig:
        """Returns a fresh DreamerV3Config as this algorithm's default config."""
        return DreamerV3Config()

    @override(Algorithm)
    def setup(self, config: AlgorithmConfig):
        """Sets up module sharing (if enabled) and the episode replay buffer."""
        super().setup(config)

        # Share RLModule between EnvRunner and single (local) Learner instance.
        # To avoid possibly expensive weight synching step.
        if self.config.share_module_between_env_runner_and_learner:
            assert self.env_runner.module is None
            self.env_runner.module = self.learner_group._learner.module[
                DEFAULT_MODULE_ID
            ]

        # Summarize (single-agent) RLModule (only once) here.
        if self.config.framework_str == "tf2":
            self.env_runner.module.dreamer_model.summary(expand_nested=True)

        # Create a replay buffer for storing actual env samples.
        self.replay_buffer = EpisodeReplayBuffer(
            capacity=self.config.replay_buffer_config["capacity"],
            batch_size_B=self.config.batch_size_B,
            batch_length_T=self.config.batch_length_T,
        )

    @override(Algorithm)
    def training_step(self) -> None:
        """Performs one DreamerV3 training iteration: sample, replay-train, sync."""
        # Push enough samples into buffer initially before we start training.
        if self.training_iteration == 0:
            logger.info(
                "Filling replay buffer so it contains at least "
                f"{self.config.batch_size_B * self.config.batch_length_T} timesteps "
                "(required for a single train batch)."
            )

        # Have we sampled yet in this `training_step()` call?
        have_sampled = False
        with self.metrics.log_time((TIMERS, SAMPLE_TIMER)):
            # Continue sampling from the actual environment (and add collected samples
            # to our replay buffer) as long as we:
            while (
                # a) Don't have at least batch_size_B x batch_length_T timesteps stored
                # in the buffer. This is the minimum needed to train.
                self.replay_buffer.get_num_timesteps()
                < (self.config.batch_size_B * self.config.batch_length_T)
                # b) The computed `training_ratio` is >= the configured (desired)
                # training ratio (meaning we should continue sampling).
                or self.training_ratio >= self.config.training_ratio
                # c) we have not sampled at all yet in this `training_step()` call.
                or not have_sampled
            ):
                # Sample using the env runner's module.
                episodes, env_runner_results = synchronous_parallel_sample(
                    worker_set=self.env_runner_group,
                    max_agent_steps=(
                        self.config.rollout_fragment_length
                        * self.config.num_envs_per_env_runner
                    ),
                    sample_timeout_s=self.config.sample_timeout_s,
                    _uses_new_env_runners=True,
                    _return_metrics=True,
                )
                self.metrics.merge_and_log_n_dicts(
                    env_runner_results, key=ENV_RUNNER_RESULTS
                )
                # Add ongoing and finished episodes into buffer. The buffer will
                # automatically take care of properly concatenating (by episode IDs)
                # the different chunks of the same episodes, even if they come in via
                # separate `add()` calls.
                self.replay_buffer.add(episodes=episodes)
                have_sampled = True

                # We took B x T env steps.
                env_steps_last_regular_sample = sum(len(eps) for eps in episodes)
                total_sampled = env_steps_last_regular_sample

                # If we have never sampled before (just started the algo and not
                # recovered from a checkpoint), sample B random actions first.
                if (
                    self.metrics.peek(
                        (ENV_RUNNER_RESULTS, NUM_ENV_STEPS_SAMPLED_LIFETIME),
                        default=0,
                    )
                    == 0
                ):
                    _episodes, _env_runner_results = synchronous_parallel_sample(
                        worker_set=self.env_runner_group,
                        max_agent_steps=(
                            self.config.batch_size_B * self.config.batch_length_T
                            - env_steps_last_regular_sample
                        ),
                        sample_timeout_s=self.config.sample_timeout_s,
                        random_actions=True,
                        _uses_new_env_runners=True,
                        _return_metrics=True,
                    )
                    self.metrics.merge_and_log_n_dicts(
                        _env_runner_results, key=ENV_RUNNER_RESULTS
                    )
                    self.replay_buffer.add(episodes=_episodes)
                    total_sampled += sum(len(eps) for eps in _episodes)

        # Summarize environment interaction and buffer data.
        report_sampling_and_replay_buffer(
            metrics=self.metrics, replay_buffer=self.replay_buffer
        )

        # Continue sampling batch_size_B x batch_length_T sized batches from the buffer
        # and using these to update our models (`LearnerGroup.update_from_batch()`)
        # until the computed `training_ratio` is larger than the configured one, meaning
        # we should go back and collect more samples again from the actual environment.
        # However, when calculating the `training_ratio` here, we use only the
        # trained steps in this very `training_step()` call over the most recent sample
        # amount (`env_steps_last_regular_sample`), not the global values. This is to
        # avoid a heavy overtraining at the very beginning when we have just pre-filled
        # the buffer with the minimum amount of samples.
        replayed_steps_this_iter = sub_iter = 0
        while (
            replayed_steps_this_iter / env_steps_last_regular_sample
        ) < self.config.training_ratio:
            # Time individual batch updates.
            with self.metrics.log_time((TIMERS, LEARN_ON_BATCH_TIMER)):
                logger.info(f"\tSub-iteration {self.training_iteration}/{sub_iter})")

                # Draw a new sample from the replay buffer.
                sample = self.replay_buffer.sample(
                    batch_size_B=self.config.batch_size_B,
                    batch_length_T=self.config.batch_length_T,
                )
                replayed_steps = self.config.batch_size_B * self.config.batch_length_T
                replayed_steps_this_iter += replayed_steps

                # Discrete actions: keep the int version around under
                # "actions_ints" and one-hot encode the actions column.
                if isinstance(
                    self.env_runner.env.single_action_space, gym.spaces.Discrete
                ):
                    sample["actions_ints"] = sample[Columns.ACTIONS]
                    sample[Columns.ACTIONS] = one_hot(
                        sample["actions_ints"],
                        depth=self.env_runner.env.single_action_space.n,
                    )

                # Perform the actual update via our learner group.
                learner_results = self.learner_group.update_from_batch(
                    batch=SampleBatch(sample).as_multi_agent(),
                    # TODO(sven): Maybe we should do this broadcast of global timesteps
                    # at the end, like for EnvRunner global env step counts. Maybe when
                    # we request the state from the Learners, we can - at the same
                    # time - send the current globally summed/reduced-timesteps.
                    timesteps={
                        NUM_ENV_STEPS_SAMPLED_LIFETIME: self.metrics.peek(
                            (ENV_RUNNER_RESULTS, NUM_ENV_STEPS_SAMPLED_LIFETIME),
                            default=0,
                        )
                    },
                )
                self.metrics.merge_and_log_n_dicts(learner_results, key=LEARNER_RESULTS)

                sub_iter += 1
                self.metrics.log_value(NUM_GRAD_UPDATES_LIFETIME, 1, reduce="sum")

        # Log videos showing how the decoder produces observation predictions
        # from the posterior states.
        # Only every n iterations and only for the first sampled batch row
        # (videos are `config.batch_length_T` frames long).
        report_predicted_vs_sampled_obs(
            # TODO (sven): DreamerV3 is single-agent only.
            metrics=self.metrics,
            sample=sample,
            batch_size_B=self.config.batch_size_B,
            batch_length_T=self.config.batch_length_T,
            symlog_obs=do_symlog_obs(
                self.env_runner.env.single_observation_space,
                self.config.symlog_obs,
            ),
            do_report=(
                self.config.report_images_and_videos
                and self.training_iteration % 100 == 0
            ),
        )

        # Log videos showing some of the dreamed trajectories and compare them with the
        # actual trajectories from the train batch.
        # Only every n iterations and only for the first sampled batch row AND first ts.
        # (videos are `config.horizon_H` frames long originating from the observation
        # at B=0 and T=0 in the train batch).
        report_dreamed_eval_trajectory_vs_samples(
            metrics=self.metrics,
            sample=sample,
            burn_in_T=0,
            dreamed_T=self.config.horizon_H + 1,
            dreamer_model=self.env_runner.module.dreamer_model,
            symlog_obs=do_symlog_obs(
                self.env_runner.env.single_observation_space,
                self.config.symlog_obs,
            ),
            do_report=(
                self.config.report_dream_data and self.training_iteration % 100 == 0
            ),
            framework=self.config.framework_str,
        )

        # Update weights - after learning on the LearnerGroup - on all EnvRunner
        # workers.
        with self.metrics.log_time((TIMERS, SYNCH_WORKER_WEIGHTS_TIMER)):
            # Only necessary if RLModule is not shared between (local) EnvRunner and
            # (local) Learner.
            if not self.config.share_module_between_env_runner_and_learner:
                self.metrics.log_value(NUM_SYNCH_WORKER_WEIGHTS, 1, reduce="sum")
                self.env_runner_group.sync_weights(
                    from_worker_or_learner_group=self.learner_group,
                    inference_only=True,
                )

        # Try trick from https://medium.com/dive-into-ml-ai/dealing-with-memory-leak-
        # issue-in-keras-model-training-e703907a6501
        if self.config.gc_frequency_train_steps and (
            self.training_iteration % self.config.gc_frequency_train_steps == 0
        ):
            with self.metrics.log_time((TIMERS, GARBAGE_COLLECTION_TIMER)):
                gc.collect()

        # Add train results and the actual training ratio to stats. The latter should
        # be close to the configured `training_ratio`.
        self.metrics.log_value("actual_training_ratio", self.training_ratio, window=1)

    @property
    def training_ratio(self) -> float:
        """Returns the actual training ratio of this Algorithm (not the configured one).

        The training ratio is computed by dividing the total number of steps
        trained thus far (replayed from the buffer) over the total number of actual
        env steps taken thus far.
        """
        # `eps` avoids a division by zero before any env steps were sampled.
        eps = 0.0001
        return self.metrics.peek(NUM_ENV_STEPS_TRAINED_LIFETIME, default=0) / (
            (
                self.metrics.peek(
                    (ENV_RUNNER_RESULTS, NUM_ENV_STEPS_SAMPLED_LIFETIME),
                    default=eps,
                )
                or eps
            )
        )

    # TODO (sven): Remove this once DreamerV3 is on the new SingleAgentEnvRunner.
    @PublicAPI
    def __setstate__(self, state) -> None:
        """Sets the algorithm to the provided state.

        Args:
            state: The state dictionary to restore this `DreamerV3` instance to.
                `state` may have been returned by a call to an `Algorithm`'s
                `__getstate__()` method.
        """
        # Call the `Algorithm`'s `__setstate__()` method.
        super().__setstate__(state=state)

        # Assign the module to the local `EnvRunner` if sharing is enabled.
        # Note, in `Learner.restore_from_path()` the module is first deleted
        # and then a new one is built - therefore the worker has no
        # longer a copy of the learner.
        if self.config.share_module_between_env_runner_and_learner:
            assert id(self.env_runner.module) != id(
                self.learner_group._learner.module[DEFAULT_MODULE_ID]
            )
            self.env_runner.module = self.learner_group._learner.module[
                DEFAULT_MODULE_ID
            ]
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/dreamerv3_catalog.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gymnasium as gym
2
+
3
+ from ray.rllib.core.models.catalog import Catalog
4
+ from ray.rllib.core.models.base import Encoder, Model
5
+ from ray.rllib.utils import override
6
+
7
+
8
class DreamerV3Catalog(Catalog):
    """The Catalog class used to build all the models needed for DreamerV3 training."""

    def __init__(
        self,
        observation_space: gym.Space,
        action_space: gym.Space,
        model_config_dict: dict,
    ):
        """Initializes a DreamerV3Catalog instance.

        Args:
            observation_space: The observation space of the environment.
            action_space: The action space of the environment.
            model_config_dict: The model config to use.
        """
        super().__init__(
            observation_space=observation_space,
            action_space=action_space,
            model_config_dict=model_config_dict,
        )

        self.model_size = self._model_config_dict["model_size"]
        # Rank-2 (gray-scale) and rank-3 (e.g. RGB) observations are image spaces;
        # a rank-2 image space is gray-scale by definition.
        obs_rank = len(self.observation_space.shape)
        self.is_img_space = obs_rank in (2, 3)
        self.is_gray_scale = obs_rank == 2

        # TODO (sven): We should work with sub-component configurations here,
        #  and even try replacing all current Dreamer model components with
        #  our default primitives. But for now, we'll construct the DreamerV3Model
        #  directly in our `build_...()` methods.

    @override(Catalog)
    def build_encoder(self, framework: str) -> Encoder:
        """Builds the World-Model's encoder network depending on the obs space."""
        if framework != "tf2":
            raise NotImplementedError

        if not self.is_img_space:
            from ray.rllib.algorithms.dreamerv3.tf.models.components.mlp import MLP

            return MLP(model_size=self.model_size, name="vector_encoder")

        from ray.rllib.algorithms.dreamerv3.tf.models.components.cnn_atari import (
            CNNAtari,
        )

        return CNNAtari(model_size=self.model_size)

    def build_decoder(self, framework: str) -> Model:
        """Builds the World-Model's decoder network depending on the obs space."""
        if framework != "tf2":
            raise NotImplementedError

        if not self.is_img_space:
            from ray.rllib.algorithms.dreamerv3.tf.models.components import (
                vector_decoder,
            )

            return vector_decoder.VectorDecoder(
                model_size=self.model_size,
                observation_space=self.observation_space,
            )

        from ray.rllib.algorithms.dreamerv3.tf.models.components import (
            conv_transpose_atari,
        )

        return conv_transpose_atari.ConvTransposeAtari(
            model_size=self.model_size,
            gray_scaled=self.is_gray_scale,
        )
+ )
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/dreamerv3_learner.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ [1] Mastering Diverse Domains through World Models - 2023
3
+ D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap
4
+ https://arxiv.org/pdf/2301.04104v1.pdf
5
+
6
+ [2] Mastering Atari with Discrete World Models - 2021
7
+ D. Hafner, T. Lillicrap, M. Norouzi, J. Ba
8
+ https://arxiv.org/pdf/2010.02193.pdf
9
+ """
10
+ from ray.rllib.core.learner.learner import Learner
11
+ from ray.rllib.utils.annotations import (
12
+ override,
13
+ OverrideToImplementCustomLogic_CallToSuperRecommended,
14
+ )
15
+
16
+
17
+ class DreamerV3Learner(Learner):
18
+ """DreamerV3 specific Learner class.
19
+
20
+ Only implements the `after_gradient_based_update()` method to define the logic
21
+ for updating the critic EMA-copy after each training step.
22
+ """
23
+
24
+ @OverrideToImplementCustomLogic_CallToSuperRecommended
25
+ @override(Learner)
26
+ def after_gradient_based_update(self, *, timesteps):
27
+ super().after_gradient_based_update(timesteps=timesteps)
28
+
29
+ # Update EMA weights of the critic.
30
+ for module_id, module in self.module._rl_modules.items():
31
+ module.critic.update_ema()
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/dreamerv3_rl_module.py ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ This file holds framework-agnostic components for DreamerV3's RLModule.
3
+ """
4
+
5
+ import abc
6
+ from typing import Any, Dict
7
+
8
+ import gymnasium as gym
9
+ import numpy as np
10
+
11
+ from ray.rllib.algorithms.dreamerv3.utils import do_symlog_obs
12
+ from ray.rllib.algorithms.dreamerv3.tf.models.actor_network import ActorNetwork
13
+ from ray.rllib.algorithms.dreamerv3.tf.models.critic_network import CriticNetwork
14
+ from ray.rllib.algorithms.dreamerv3.tf.models.dreamer_model import DreamerModel
15
+ from ray.rllib.algorithms.dreamerv3.tf.models.world_model import WorldModel
16
+ from ray.rllib.core.columns import Columns
17
+ from ray.rllib.core.rl_module.rl_module import RLModule
18
+ from ray.rllib.policy.eager_tf_policy import _convert_to_tf
19
+ from ray.rllib.utils.annotations import override
20
+ from ray.rllib.utils.framework import try_import_tf
21
+ from ray.rllib.utils.numpy import one_hot
22
+ from ray.util.annotations import DeveloperAPI
23
+
24
+
25
+ _, tf, _ = try_import_tf()
26
+
27
+
28
@DeveloperAPI(stability="alpha")
class DreamerV3RLModule(RLModule, abc.ABC):
    """Framework-agnostic RLModule for DreamerV3 (world model, actor, critic)."""

    @override(RLModule)
    def setup(self):
        """Builds world model, actor, critic, and the combined `dreamer_model`."""
        super().setup()

        # Gather model-relevant settings.
        B = 1
        T = self.model_config["batch_length_T"]
        horizon_H = self.model_config["horizon_H"]
        gamma = self.model_config["gamma"]
        symlog_obs = do_symlog_obs(
            self.observation_space,
            self.model_config.get("symlog_obs", "auto"),
        )
        model_size = self.model_config["model_size"]

        # Optionally switch Keras to mixed float16 precision globally.
        if self.model_config["use_float16"]:
            tf.compat.v1.keras.layers.enable_v2_dtype_behavior()
            tf.keras.mixed_precision.set_global_policy("mixed_float16")

        # Build encoder and decoder from catalog.
        self.encoder = self.catalog.build_encoder(framework=self.framework)
        self.decoder = self.catalog.build_decoder(framework=self.framework)

        # Build the world model (containing encoder and decoder).
        self.world_model = WorldModel(
            model_size=model_size,
            observation_space=self.observation_space,
            action_space=self.action_space,
            batch_length_T=T,
            encoder=self.encoder,
            decoder=self.decoder,
            symlog_obs=symlog_obs,
        )
        self.actor = ActorNetwork(
            action_space=self.action_space,
            model_size=model_size,
        )
        self.critic = CriticNetwork(
            model_size=model_size,
        )
        # Build the final dreamer model (containing the world model).
        self.dreamer_model = DreamerModel(
            model_size=self.model_config["model_size"],
            action_space=self.action_space,
            world_model=self.world_model,
            actor=self.actor,
            critic=self.critic,
            horizon=horizon_H,
            gamma=gamma,
        )
        self.action_dist_cls = self.catalog.get_action_dist_cls(
            framework=self.framework
        )

        # Perform a test `call()` to force building the dreamer model's variables.
        if self.framework == "tf2":
            # Fake (B, T)-shaped observation batch from the observation space.
            test_obs = np.tile(
                np.expand_dims(self.observation_space.sample(), (0, 1)),
                reps=(B, T) + (1,) * len(self.observation_space.shape),
            )
            # Discrete actions are one-hot encoded before being fed to the model.
            if isinstance(self.action_space, gym.spaces.Discrete):
                test_actions = np.tile(
                    np.expand_dims(
                        one_hot(
                            self.action_space.sample(),
                            depth=self.action_space.n,
                        ),
                        (0, 1),
                    ),
                    reps=(B, T, 1),
                )
            else:
                test_actions = np.tile(
                    np.expand_dims(self.action_space.sample(), (0, 1)),
                    reps=(B, T, 1),
                )

            self.dreamer_model(
                inputs=None,
                observations=_convert_to_tf(test_obs, dtype=tf.float32),
                actions=_convert_to_tf(test_actions, dtype=tf.float32),
                is_first=_convert_to_tf(np.ones((B, T)), dtype=tf.bool),
                start_is_terminated_BxT=_convert_to_tf(
                    np.zeros((B * T,)), dtype=tf.bool
                ),
                gamma=gamma,
            )

        # Initialize the critic EMA net:
        self.critic.init_ema()

    @override(RLModule)
    def get_initial_state(self) -> Dict:
        # Use `DreamerModel`'s `get_initial_state` method.
        return self.dreamer_model.get_initial_state()

    @override(RLModule)
    def _forward_inference(self, batch: Dict[str, Any]) -> Dict[str, Any]:
        # Call the Dreamer-Model's forward_inference method and return a dict.
        actions, next_state = self.dreamer_model.forward_inference(
            observations=batch[Columns.OBS],
            previous_states=batch[Columns.STATE_IN],
            is_first=batch["is_first"],
        )
        return {Columns.ACTIONS: actions, Columns.STATE_OUT: next_state}

    @override(RLModule)
    def _forward_exploration(self, batch: Dict[str, Any]) -> Dict[str, Any]:
        # Call the Dreamer-Model's forward_exploration method and return a dict.
        actions, next_state = self.dreamer_model.forward_exploration(
            observations=batch[Columns.OBS],
            previous_states=batch[Columns.STATE_IN],
            is_first=batch["is_first"],
        )
        return {Columns.ACTIONS: actions, Columns.STATE_OUT: next_state}

    @override(RLModule)
    def _forward_train(self, batch: Dict[str, Any]):
        # Call the Dreamer-Model's forward_train method and return its outputs as-is.
        return self.dreamer_model.forward_train(
            observations=batch[Columns.OBS],
            actions=batch[Columns.ACTIONS],
            is_first=batch["is_first"],
        )
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (206 Bytes). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/__pycache__/dreamerv3_tf_learner.cpython-311.pyc ADDED
Binary file (32.1 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/__pycache__/dreamerv3_tf_rl_module.cpython-311.pyc ADDED
Binary file (1.29 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/dreamerv3_tf_learner.py ADDED
@@ -0,0 +1,915 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ [1] Mastering Diverse Domains through World Models - 2023
3
+ D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap
4
+ https://arxiv.org/pdf/2301.04104v1.pdf
5
+
6
+ [2] Mastering Atari with Discrete World Models - 2021
7
+ D. Hafner, T. Lillicrap, M. Norouzi, J. Ba
8
+ https://arxiv.org/pdf/2010.02193.pdf
9
+ """
10
+ from typing import Any, Dict, Tuple
11
+
12
+ import gymnasium as gym
13
+
14
+ from ray.rllib.algorithms.dreamerv3.dreamerv3 import DreamerV3Config
15
+ from ray.rllib.algorithms.dreamerv3.dreamerv3_learner import DreamerV3Learner
16
+ from ray.rllib.core import DEFAULT_MODULE_ID
17
+ from ray.rllib.core.columns import Columns
18
+ from ray.rllib.core.learner.learner import ParamDict
19
+ from ray.rllib.core.learner.tf.tf_learner import TfLearner
20
+ from ray.rllib.utils.annotations import override
21
+ from ray.rllib.utils.framework import try_import_tf, try_import_tfp
22
+ from ray.rllib.utils.tf_utils import symlog, two_hot, clip_gradients
23
+ from ray.rllib.utils.typing import ModuleID, TensorType
24
+
25
+ _, tf, _ = try_import_tf()
26
+ tfp = try_import_tfp()
27
+
28
+
29
+ class DreamerV3TfLearner(DreamerV3Learner, TfLearner):
30
+ """Implements DreamerV3 losses and gradient-based update logic in TensorFlow.
31
+
32
+ The critic EMA-copy update step can be found in the `DreamerV3Learner` base class,
33
+ as it is framework independent.
34
+
35
+ We define 3 local TensorFlow optimizers for the sub components "world_model",
36
+ "actor", and "critic". Each of these optimizers might use a different learning rate,
37
+ epsilon parameter, and gradient clipping thresholds and procedures.
38
+ """
39
+
40
+ @override(TfLearner)
41
+ def configure_optimizers_for_module(
42
+ self, module_id: ModuleID, config: DreamerV3Config = None
43
+ ):
44
+ """Create the 3 optimizers for Dreamer learning: world_model, actor, critic.
45
+
46
+ The learning rates used are described in [1] and the epsilon values used here
47
+ - albeit probably not that important - are used by the author's own
48
+ implementation.
49
+ """
50
+
51
+ dreamerv3_module = self._module[module_id]
52
+
53
+ # World Model optimizer.
54
+ optim_world_model = tf.keras.optimizers.Adam(epsilon=1e-8)
55
+ optim_world_model.build(dreamerv3_module.world_model.trainable_variables)
56
+ params_world_model = self.get_parameters(dreamerv3_module.world_model)
57
+ self.register_optimizer(
58
+ module_id=module_id,
59
+ optimizer_name="world_model",
60
+ optimizer=optim_world_model,
61
+ params=params_world_model,
62
+ lr_or_lr_schedule=config.world_model_lr,
63
+ )
64
+
65
+ # Actor optimizer.
66
+ optim_actor = tf.keras.optimizers.Adam(epsilon=1e-5)
67
+ optim_actor.build(dreamerv3_module.actor.trainable_variables)
68
+ params_actor = self.get_parameters(dreamerv3_module.actor)
69
+ self.register_optimizer(
70
+ module_id=module_id,
71
+ optimizer_name="actor",
72
+ optimizer=optim_actor,
73
+ params=params_actor,
74
+ lr_or_lr_schedule=config.actor_lr,
75
+ )
76
+
77
+ # Critic optimizer.
78
+ optim_critic = tf.keras.optimizers.Adam(epsilon=1e-5)
79
+ optim_critic.build(dreamerv3_module.critic.trainable_variables)
80
+ params_critic = self.get_parameters(dreamerv3_module.critic)
81
+ self.register_optimizer(
82
+ module_id=module_id,
83
+ optimizer_name="critic",
84
+ optimizer=optim_critic,
85
+ params=params_critic,
86
+ lr_or_lr_schedule=config.critic_lr,
87
+ )
88
+
89
+ @override(TfLearner)
90
+ def postprocess_gradients_for_module(
91
+ self,
92
+ *,
93
+ module_id: ModuleID,
94
+ config: DreamerV3Config,
95
+ module_gradients_dict: Dict[str, Any],
96
+ ) -> ParamDict:
97
+ """Performs gradient clipping on the 3 module components' computed grads.
98
+
99
+ Note that different grad global-norm clip values are used for the 3
100
+ module components: world model, actor, and critic.
101
+ """
102
+ for optimizer_name, optimizer in self.get_optimizers_for_module(
103
+ module_id=module_id
104
+ ):
105
+ grads_sub_dict = self.filter_param_dict_for_optimizer(
106
+ module_gradients_dict, optimizer
107
+ )
108
+ # Figure out, which grad clip setting to use.
109
+ grad_clip = (
110
+ config.world_model_grad_clip_by_global_norm
111
+ if optimizer_name == "world_model"
112
+ else config.actor_grad_clip_by_global_norm
113
+ if optimizer_name == "actor"
114
+ else config.critic_grad_clip_by_global_norm
115
+ )
116
+ global_norm = clip_gradients(
117
+ grads_sub_dict,
118
+ grad_clip=grad_clip,
119
+ grad_clip_by="global_norm",
120
+ )
121
+ module_gradients_dict.update(grads_sub_dict)
122
+
123
+ # DreamerV3 stats have the format: [WORLD_MODEL|ACTOR|CRITIC]_[stats name].
124
+ self.metrics.log_dict(
125
+ {
126
+ optimizer_name.upper() + "_gradients_global_norm": global_norm,
127
+ optimizer_name.upper()
128
+ + "_gradients_maxabs_after_clipping": (
129
+ tf.reduce_max(
130
+ [
131
+ tf.reduce_max(tf.math.abs(g))
132
+ for g in grads_sub_dict.values()
133
+ ]
134
+ )
135
+ ),
136
+ },
137
+ key=module_id,
138
+ window=1, # <- single items (should not be mean/ema-reduced over time).
139
+ )
140
+
141
+ return module_gradients_dict
142
+
143
+ @override(TfLearner)
144
+ def compute_gradients(
145
+ self,
146
+ loss_per_module,
147
+ gradient_tape,
148
+ **kwargs,
149
+ ):
150
+ # Override of the default gradient computation method.
151
+ # For DreamerV3, we need to compute gradients over the individual loss terms
152
+ # as otherwise, the world model's parameters would have their gradients also
153
+ # be influenced by the actor- and critic loss terms/gradient computations.
154
+ grads = {}
155
+ for component in ["world_model", "actor", "critic"]:
156
+ grads.update(
157
+ gradient_tape.gradient(
158
+ # Take individual loss term from the registered metrics for
159
+ # the main module.
160
+ self.metrics.peek(
161
+ (DEFAULT_MODULE_ID, component.upper() + "_L_total")
162
+ ),
163
+ self.filter_param_dict_for_optimizer(
164
+ self._params, self.get_optimizer(optimizer_name=component)
165
+ ),
166
+ )
167
+ )
168
+ del gradient_tape
169
+ return grads
170
+
171
+ @override(TfLearner)
172
+ def compute_loss_for_module(
173
+ self,
174
+ module_id: ModuleID,
175
+ config: DreamerV3Config,
176
+ batch: Dict[str, TensorType],
177
+ fwd_out: Dict[str, TensorType],
178
+ ) -> TensorType:
179
+ # World model losses.
180
+ prediction_losses = self._compute_world_model_prediction_losses(
181
+ config=config,
182
+ rewards_B_T=batch[Columns.REWARDS],
183
+ continues_B_T=(1.0 - tf.cast(batch["is_terminated"], tf.float32)),
184
+ fwd_out=fwd_out,
185
+ )
186
+
187
+ (
188
+ L_dyn_B_T,
189
+ L_rep_B_T,
190
+ ) = self._compute_world_model_dynamics_and_representation_loss(
191
+ config=config, fwd_out=fwd_out
192
+ )
193
+ L_dyn = tf.reduce_mean(L_dyn_B_T)
194
+ L_rep = tf.reduce_mean(L_rep_B_T)
195
+ # Make sure values for L_rep and L_dyn are the same (they only differ in their
196
+ # gradients).
197
+ tf.assert_equal(L_dyn, L_rep)
198
+
199
+ # Compute the actual total loss using fixed weights described in [1] eq. 4.
200
+ L_world_model_total_B_T = (
201
+ 1.0 * prediction_losses["L_prediction_B_T"]
202
+ + 0.5 * L_dyn_B_T
203
+ + 0.1 * L_rep_B_T
204
+ )
205
+
206
+ # In the paper, it says to sum up timesteps, and average over
207
+ # batch (see eq. 4 in [1]). But Danijar's implementation only does
208
+ # averaging (over B and T), so we'll do this here as well. This is generally
209
+ # true for all other loss terms as well (we'll always just average, no summing
210
+ # over T axis!).
211
+ L_world_model_total = tf.reduce_mean(L_world_model_total_B_T)
212
+
213
+ # Log world model loss stats.
214
+ self.metrics.log_dict(
215
+ {
216
+ "WORLD_MODEL_learned_initial_h": (
217
+ self.module[module_id].world_model.initial_h
218
+ ),
219
+ # Prediction losses.
220
+ # Decoder (obs) loss.
221
+ "WORLD_MODEL_L_decoder": prediction_losses["L_decoder"],
222
+ # Reward loss.
223
+ "WORLD_MODEL_L_reward": prediction_losses["L_reward"],
224
+ # Continue loss.
225
+ "WORLD_MODEL_L_continue": prediction_losses["L_continue"],
226
+ # Total.
227
+ "WORLD_MODEL_L_prediction": prediction_losses["L_prediction"],
228
+ # Dynamics loss.
229
+ "WORLD_MODEL_L_dynamics": L_dyn,
230
+ # Representation loss.
231
+ "WORLD_MODEL_L_representation": L_rep,
232
+ # Total loss.
233
+ "WORLD_MODEL_L_total": L_world_model_total,
234
+ },
235
+ key=module_id,
236
+ window=1, # <- single items (should not be mean/ema-reduced over time).
237
+ )
238
+
239
+ # Add the predicted obs distributions for possible (video) summarization.
240
+ if config.report_images_and_videos:
241
+ self.metrics.log_value(
242
+ (module_id, "WORLD_MODEL_fwd_out_obs_distribution_means_b0xT"),
243
+ fwd_out["obs_distribution_means_BxT"][: self.config.batch_length_T],
244
+ reduce=None, # No reduction, we want the tensor to stay in-tact.
245
+ window=1, # <- single items (should not be mean/ema-reduced over time).
246
+ )
247
+
248
+ if config.report_individual_batch_item_stats:
249
+ # Log important world-model loss stats.
250
+ self.metrics.log_dict(
251
+ {
252
+ "WORLD_MODEL_L_decoder_B_T": prediction_losses["L_decoder_B_T"],
253
+ "WORLD_MODEL_L_reward_B_T": prediction_losses["L_reward_B_T"],
254
+ "WORLD_MODEL_L_continue_B_T": prediction_losses["L_continue_B_T"],
255
+ "WORLD_MODEL_L_prediction_B_T": (
256
+ prediction_losses["L_prediction_B_T"]
257
+ ),
258
+ "WORLD_MODEL_L_dynamics_B_T": L_dyn_B_T,
259
+ "WORLD_MODEL_L_representation_B_T": L_rep_B_T,
260
+ "WORLD_MODEL_L_total_B_T": L_world_model_total_B_T,
261
+ },
262
+ key=module_id,
263
+ window=1, # <- single items (should not be mean/ema-reduced over time).
264
+ )
265
+
266
+ # Dream trajectories starting in all internal states (h + z_posterior) that were
267
+ # computed during world model training.
268
+ # Everything goes in as BxT: We are starting a new dream trajectory at every
269
+ # actually encountered timestep in the batch, so we are creating B*T
270
+ # trajectories of len `horizon_H`.
271
+ dream_data = self.module[module_id].dreamer_model.dream_trajectory(
272
+ start_states={
273
+ "h": fwd_out["h_states_BxT"],
274
+ "z": fwd_out["z_posterior_states_BxT"],
275
+ },
276
+ start_is_terminated=tf.reshape(batch["is_terminated"], [-1]), # -> BxT
277
+ )
278
+ if config.report_dream_data:
279
+ # To reduce this massive amount of data a little, slice out a T=1 piece
280
+ # from each stats that has the shape (H, BxT), meaning convert e.g.
281
+ # `rewards_dreamed_t0_to_H_BxT` into `rewards_dreamed_t0_to_H_Bx1`.
282
+ # This will reduce the amount of data to be transferred and reported
283
+ # by the factor of `batch_length_T`.
284
+ self.metrics.log_dict(
285
+ {
286
+ # Replace 'T' with '1'.
287
+ key[:-1] + "1": value[:, :: config.batch_length_T]
288
+ for key, value in dream_data.items()
289
+ if key.endswith("H_BxT")
290
+ },
291
+ key=(module_id, "dream_data"),
292
+ reduce=None,
293
+ window=1, # <- single items (should not be mean/ema-reduced over time).
294
+ )
295
+
296
+ value_targets_t0_to_Hm1_BxT = self._compute_value_targets(
297
+ config=config,
298
+ # Learn critic in symlog'd space.
299
+ rewards_t0_to_H_BxT=dream_data["rewards_dreamed_t0_to_H_BxT"],
300
+ intrinsic_rewards_t1_to_H_BxT=(
301
+ dream_data["rewards_intrinsic_t1_to_H_B"]
302
+ if config.use_curiosity
303
+ else None
304
+ ),
305
+ continues_t0_to_H_BxT=dream_data["continues_dreamed_t0_to_H_BxT"],
306
+ value_predictions_t0_to_H_BxT=dream_data["values_dreamed_t0_to_H_BxT"],
307
+ )
308
+ self.metrics.log_value(
309
+ key=(module_id, "VALUE_TARGETS_H_BxT"),
310
+ value=value_targets_t0_to_Hm1_BxT,
311
+ window=1, # <- single items (should not be mean/ema-reduced over time).
312
+ )
313
+
314
+ CRITIC_L_total = self._compute_critic_loss(
315
+ module_id=module_id,
316
+ config=config,
317
+ dream_data=dream_data,
318
+ value_targets_t0_to_Hm1_BxT=value_targets_t0_to_Hm1_BxT,
319
+ )
320
+ if config.train_actor:
321
+ ACTOR_L_total = self._compute_actor_loss(
322
+ module_id=module_id,
323
+ config=config,
324
+ dream_data=dream_data,
325
+ value_targets_t0_to_Hm1_BxT=value_targets_t0_to_Hm1_BxT,
326
+ )
327
+ else:
328
+ ACTOR_L_total = 0.0
329
+
330
+ # Return the total loss as a sum of all individual losses.
331
+ return L_world_model_total + CRITIC_L_total + ACTOR_L_total
332
+
333
+ def _compute_world_model_prediction_losses(
334
+ self,
335
+ *,
336
+ config: DreamerV3Config,
337
+ rewards_B_T: TensorType,
338
+ continues_B_T: TensorType,
339
+ fwd_out: Dict[str, TensorType],
340
+ ) -> Dict[str, TensorType]:
341
+ """Helper method computing all world-model related prediction losses.
342
+
343
+ Prediction losses are used to train the predictors of the world model, which
344
+ are: Reward predictor, continue predictor, and the decoder (which predicts
345
+ observations).
346
+
347
+ Args:
348
+ config: The DreamerV3Config to use.
349
+ rewards_B_T: The rewards batch in the shape (B, T) and of type float32.
350
+ continues_B_T: The continues batch in the shape (B, T) and of type float32
351
+ (1.0 -> continue; 0.0 -> end of episode).
352
+ fwd_out: The `forward_train` outputs of the DreamerV3RLModule.
353
+ """
354
+
355
+ # Learn to produce symlog'd observation predictions.
356
+ # If symlog is disabled (e.g. for uint8 image inputs), `obs_symlog_BxT` is the
357
+ # same as `obs_BxT`.
358
+ obs_BxT = fwd_out["sampled_obs_symlog_BxT"]
359
+ obs_distr_means = fwd_out["obs_distribution_means_BxT"]
360
+ # In case we wanted to construct a distribution object from the fwd out data,
361
+ # we would have to do it like this:
362
+ # obs_distr = tfp.distributions.MultivariateNormalDiag(
363
+ # loc=obs_distr_means,
364
+ # # Scale == 1.0.
365
+ # # [2]: "Distributions The image predictor outputs the mean of a diagonal
366
+ # # Gaussian likelihood with **unit variance** ..."
367
+ # scale_diag=tf.ones_like(obs_distr_means),
368
+ # )
369
+
370
+ # Leave time dim folded (BxT) and flatten all other (e.g. image) dims.
371
+ obs_BxT = tf.reshape(obs_BxT, shape=[-1, tf.reduce_prod(obs_BxT.shape[1:])])
372
+
373
+ # Squared diff loss w/ sum(!) over all (already folded) obs dims.
374
+ # decoder_loss_BxT = SUM[ (obs_distr.loc - observations)^2 ]
375
+ # Note: This is described strangely in the paper (stating a neglogp loss here),
376
+ # but the author's own implementation actually uses simple MSE with the loc
377
+ # of the Gaussian.
378
+ decoder_loss_BxT = tf.reduce_sum(
379
+ tf.math.square(obs_distr_means - obs_BxT), axis=-1
380
+ )
381
+
382
+ # Unfold time rank back in.
383
+ decoder_loss_B_T = tf.reshape(
384
+ decoder_loss_BxT, (config.batch_size_B_per_learner, config.batch_length_T)
385
+ )
386
+ L_decoder = tf.reduce_mean(decoder_loss_B_T)
387
+
388
+ # The FiniteDiscrete reward bucket distribution computed by our reward
389
+ # predictor.
390
+ # [B x num_buckets].
391
+ reward_logits_BxT = fwd_out["reward_logits_BxT"]
392
+ # Learn to produce symlog'd reward predictions.
393
+ rewards_symlog_B_T = symlog(tf.cast(rewards_B_T, tf.float32))
394
+ # Fold time dim.
395
+ rewards_symlog_BxT = tf.reshape(rewards_symlog_B_T, shape=[-1])
396
+
397
+ # Two-hot encode.
398
+ two_hot_rewards_symlog_BxT = two_hot(rewards_symlog_BxT)
399
+ # two_hot_rewards_symlog_BxT=[B*T, num_buckets]
400
+ reward_log_pred_BxT = reward_logits_BxT - tf.math.reduce_logsumexp(
401
+ reward_logits_BxT, axis=-1, keepdims=True
402
+ )
403
+ # Multiply with two-hot targets and neg.
404
+ reward_loss_two_hot_BxT = -tf.reduce_sum(
405
+ reward_log_pred_BxT * two_hot_rewards_symlog_BxT, axis=-1
406
+ )
407
+ # Unfold time rank back in.
408
+ reward_loss_two_hot_B_T = tf.reshape(
409
+ reward_loss_two_hot_BxT,
410
+ (config.batch_size_B_per_learner, config.batch_length_T),
411
+ )
412
+ L_reward_two_hot = tf.reduce_mean(reward_loss_two_hot_B_T)
413
+
414
+ # Probabilities that episode continues, computed by our continue predictor.
415
+ # [B]
416
+ continue_distr = fwd_out["continue_distribution_BxT"]
417
+ # -log(p) loss
418
+ # Fold time dim.
419
+ continues_BxT = tf.reshape(continues_B_T, shape=[-1])
420
+ continue_loss_BxT = -continue_distr.log_prob(continues_BxT)
421
+ # Unfold time rank back in.
422
+ continue_loss_B_T = tf.reshape(
423
+ continue_loss_BxT, (config.batch_size_B_per_learner, config.batch_length_T)
424
+ )
425
+ L_continue = tf.reduce_mean(continue_loss_B_T)
426
+
427
+ # Sum all losses together as the "prediction" loss.
428
+ L_pred_B_T = decoder_loss_B_T + reward_loss_two_hot_B_T + continue_loss_B_T
429
+ L_pred = tf.reduce_mean(L_pred_B_T)
430
+
431
+ return {
432
+ "L_decoder_B_T": decoder_loss_B_T,
433
+ "L_decoder": L_decoder,
434
+ "L_reward": L_reward_two_hot,
435
+ "L_reward_B_T": reward_loss_two_hot_B_T,
436
+ "L_continue": L_continue,
437
+ "L_continue_B_T": continue_loss_B_T,
438
+ "L_prediction": L_pred,
439
+ "L_prediction_B_T": L_pred_B_T,
440
+ }
441
+
442
+ def _compute_world_model_dynamics_and_representation_loss(
443
+ self, *, config: DreamerV3Config, fwd_out: Dict[str, Any]
444
+ ) -> Tuple[TensorType, TensorType]:
445
+ """Helper method computing the world-model's dynamics and representation losses.
446
+
447
+ Args:
448
+ config: The DreamerV3Config to use.
449
+ fwd_out: The `forward_train` outputs of the DreamerV3RLModule.
450
+
451
+ Returns:
452
+ Tuple consisting of a) dynamics loss: Trains the prior network, predicting
453
+ z^ prior states from h-states and b) representation loss: Trains posterior
454
+ network, predicting z posterior states from h-states and (encoded)
455
+ observations.
456
+ """
457
+
458
+ # Actual distribution over stochastic internal states (z) produced by the
459
+ # encoder.
460
+ z_posterior_probs_BxT = fwd_out["z_posterior_probs_BxT"]
461
+ z_posterior_distr_BxT = tfp.distributions.Independent(
462
+ tfp.distributions.OneHotCategorical(probs=z_posterior_probs_BxT),
463
+ reinterpreted_batch_ndims=1,
464
+ )
465
+
466
+ # Actual distribution over stochastic internal states (z) produced by the
467
+ # dynamics network.
468
+ z_prior_probs_BxT = fwd_out["z_prior_probs_BxT"]
469
+ z_prior_distr_BxT = tfp.distributions.Independent(
470
+ tfp.distributions.OneHotCategorical(probs=z_prior_probs_BxT),
471
+ reinterpreted_batch_ndims=1,
472
+ )
473
+
474
+ # Stop gradient for encoder's z-outputs:
475
+ sg_z_posterior_distr_BxT = tfp.distributions.Independent(
476
+ tfp.distributions.OneHotCategorical(
477
+ probs=tf.stop_gradient(z_posterior_probs_BxT)
478
+ ),
479
+ reinterpreted_batch_ndims=1,
480
+ )
481
+ # Stop gradient for dynamics model's z-outputs:
482
+ sg_z_prior_distr_BxT = tfp.distributions.Independent(
483
+ tfp.distributions.OneHotCategorical(
484
+ probs=tf.stop_gradient(z_prior_probs_BxT)
485
+ ),
486
+ reinterpreted_batch_ndims=1,
487
+ )
488
+
489
+ # Implement free bits. According to [1]:
490
+ # "To avoid a degenerate solution where the dynamics are trivial to predict but
491
+ # contain not enough information about the inputs, we employ free bits by
492
+ # clipping the dynamics and representation losses below the value of
493
+ # 1 nat ≈ 1.44 bits. This disables them while they are already minimized well to
494
+ # focus the world model on its prediction loss"
495
+ L_dyn_BxT = tf.math.maximum(
496
+ 1.0,
497
+ tfp.distributions.kl_divergence(
498
+ sg_z_posterior_distr_BxT, z_prior_distr_BxT
499
+ ),
500
+ )
501
+ # Unfold time rank back in.
502
+ L_dyn_B_T = tf.reshape(
503
+ L_dyn_BxT, (config.batch_size_B_per_learner, config.batch_length_T)
504
+ )
505
+
506
+ L_rep_BxT = tf.math.maximum(
507
+ 1.0,
508
+ tfp.distributions.kl_divergence(
509
+ z_posterior_distr_BxT, sg_z_prior_distr_BxT
510
+ ),
511
+ )
512
+ # Unfold time rank back in.
513
+ L_rep_B_T = tf.reshape(
514
+ L_rep_BxT, (config.batch_size_B_per_learner, config.batch_length_T)
515
+ )
516
+
517
+ return L_dyn_B_T, L_rep_B_T
518
+
519
+ def _compute_actor_loss(
520
+ self,
521
+ *,
522
+ module_id: ModuleID,
523
+ config: DreamerV3Config,
524
+ dream_data: Dict[str, TensorType],
525
+ value_targets_t0_to_Hm1_BxT: TensorType,
526
+ ) -> TensorType:
527
+ """Helper method computing the actor's loss terms.
528
+
529
+ Args:
530
+ module_id: The module_id for which to compute the actor loss.
531
+ config: The DreamerV3Config to use.
532
+ dream_data: The data generated by dreaming for H steps (horizon) starting
533
+ from any BxT state (sampled from the buffer for the train batch).
534
+ value_targets_t0_to_Hm1_BxT: The computed value function targets of the
535
+ shape (t0 to H-1, BxT).
536
+
537
+ Returns:
538
+ The total actor loss tensor.
539
+ """
540
+ actor = self.module[module_id].actor
541
+
542
+ # Note: `scaled_value_targets_t0_to_Hm1_B` are NOT stop_gradient'd yet.
543
+ scaled_value_targets_t0_to_Hm1_B = self._compute_scaled_value_targets(
544
+ module_id=module_id,
545
+ config=config,
546
+ value_targets_t0_to_Hm1_BxT=value_targets_t0_to_Hm1_BxT,
547
+ value_predictions_t0_to_Hm1_BxT=dream_data["values_dreamed_t0_to_H_BxT"][
548
+ :-1
549
+ ],
550
+ )
551
+
552
+ # Actions actually taken in the dream.
553
+ actions_dreamed = tf.stop_gradient(dream_data["actions_dreamed_t0_to_H_BxT"])[
554
+ :-1
555
+ ]
556
+ actions_dreamed_dist_params_t0_to_Hm1_B = dream_data[
557
+ "actions_dreamed_dist_params_t0_to_H_BxT"
558
+ ][:-1]
559
+
560
+ dist_t0_to_Hm1_B = actor.get_action_dist_object(
561
+ actions_dreamed_dist_params_t0_to_Hm1_B
562
+ )
563
+
564
+ # Compute log(p)s of all possible actions in the dream.
565
+ if isinstance(self.module[module_id].actor.action_space, gym.spaces.Discrete):
566
+ # Note that when we create the Categorical action distributions, we compute
567
+ # unimix probs, then math.log these and provide these log(p) as "logits" to
568
+ # the Categorical. So here, we'll continue to work with log(p)s (not
569
+ # really "logits")!
570
+ logp_actions_t0_to_Hm1_B = actions_dreamed_dist_params_t0_to_Hm1_B
571
+
572
+ # Log probs of actions actually taken in the dream.
573
+ logp_actions_dreamed_t0_to_Hm1_B = tf.reduce_sum(
574
+ actions_dreamed * logp_actions_t0_to_Hm1_B,
575
+ axis=-1,
576
+ )
577
+ # First term of loss function. [1] eq. 11.
578
+ logp_loss_H_B = logp_actions_dreamed_t0_to_Hm1_B * tf.stop_gradient(
579
+ scaled_value_targets_t0_to_Hm1_B
580
+ )
581
+ # Box space.
582
+ else:
583
+ logp_actions_dreamed_t0_to_Hm1_B = dist_t0_to_Hm1_B.log_prob(
584
+ actions_dreamed
585
+ )
586
+ # First term of loss function. [1] eq. 11.
587
+ logp_loss_H_B = scaled_value_targets_t0_to_Hm1_B
588
+
589
+ assert len(logp_loss_H_B.shape) == 2
590
+
591
+ # Add entropy loss term (second term [1] eq. 11).
592
+ entropy_H_B = dist_t0_to_Hm1_B.entropy()
593
+ assert len(entropy_H_B.shape) == 2
594
+ entropy = tf.reduce_mean(entropy_H_B)
595
+
596
+ L_actor_reinforce_term_H_B = -logp_loss_H_B
597
+ L_actor_action_entropy_term_H_B = -config.entropy_scale * entropy_H_B
598
+
599
+ L_actor_H_B = L_actor_reinforce_term_H_B + L_actor_action_entropy_term_H_B
600
+ # Mask out everything that goes beyond a predicted continue=False boundary.
601
+ L_actor_H_B *= tf.stop_gradient(dream_data["dream_loss_weights_t0_to_H_BxT"])[
602
+ :-1
603
+ ]
604
+ L_actor = tf.reduce_mean(L_actor_H_B)
605
+
606
+ # Log important actor loss stats.
607
+ self.metrics.log_dict(
608
+ {
609
+ "ACTOR_L_total": L_actor,
610
+ "ACTOR_value_targets_pct95_ema": actor.ema_value_target_pct95,
611
+ "ACTOR_value_targets_pct5_ema": actor.ema_value_target_pct5,
612
+ "ACTOR_action_entropy": entropy,
613
+ # Individual loss terms.
614
+ "ACTOR_L_neglogp_reinforce_term": tf.reduce_mean(
615
+ L_actor_reinforce_term_H_B
616
+ ),
617
+ "ACTOR_L_neg_entropy_term": tf.reduce_mean(
618
+ L_actor_action_entropy_term_H_B
619
+ ),
620
+ },
621
+ key=module_id,
622
+ window=1, # <- single items (should not be mean/ema-reduced over time).
623
+ )
624
+ if config.report_individual_batch_item_stats:
625
+ self.metrics.log_dict(
626
+ {
627
+ "ACTOR_L_total_H_BxT": L_actor_H_B,
628
+ "ACTOR_logp_actions_dreamed_H_BxT": (
629
+ logp_actions_dreamed_t0_to_Hm1_B
630
+ ),
631
+ "ACTOR_scaled_value_targets_H_BxT": (
632
+ scaled_value_targets_t0_to_Hm1_B
633
+ ),
634
+ "ACTOR_action_entropy_H_BxT": entropy_H_B,
635
+ # Individual loss terms.
636
+ "ACTOR_L_neglogp_reinforce_term_H_BxT": L_actor_reinforce_term_H_B,
637
+ "ACTOR_L_neg_entropy_term_H_BxT": L_actor_action_entropy_term_H_B,
638
+ },
639
+ key=module_id,
640
+ window=1, # <- single items (should not be mean/ema-reduced over time).
641
+ )
642
+
643
+ return L_actor
644
+
645
+ def _compute_critic_loss(
646
+ self,
647
+ *,
648
+ module_id: ModuleID,
649
+ config: DreamerV3Config,
650
+ dream_data: Dict[str, TensorType],
651
+ value_targets_t0_to_Hm1_BxT: TensorType,
652
+ ) -> TensorType:
653
+ """Helper method computing the critic's loss terms.
654
+
655
+ Args:
656
+ module_id: The ModuleID for which to compute the critic loss.
657
+ config: The DreamerV3Config to use.
658
+ dream_data: The data generated by dreaming for H steps (horizon) starting
659
+ from any BxT state (sampled from the buffer for the train batch).
660
+ value_targets_t0_to_Hm1_BxT: The computed value function targets of the
661
+ shape (t0 to H-1, BxT).
662
+
663
+ Returns:
664
+ The total critic loss tensor.
665
+ """
666
+ # B=BxT
667
+ H, B = dream_data["rewards_dreamed_t0_to_H_BxT"].shape[:2]
668
+ Hm1 = H - 1
669
+
670
+ # Note that value targets are NOT symlog'd and go from t0 to H-1, not H, like
671
+ # all the other dream data.
672
+
673
+ # From here on: B=BxT
674
+ value_targets_t0_to_Hm1_B = tf.stop_gradient(value_targets_t0_to_Hm1_BxT)
675
+ value_symlog_targets_t0_to_Hm1_B = symlog(value_targets_t0_to_Hm1_B)
676
+ # Fold time rank (for two_hot'ing).
677
+ value_symlog_targets_HxB = tf.reshape(value_symlog_targets_t0_to_Hm1_B, (-1,))
678
+ value_symlog_targets_two_hot_HxB = two_hot(value_symlog_targets_HxB)
679
+ # Unfold time rank.
680
+ value_symlog_targets_two_hot_t0_to_Hm1_B = tf.reshape(
681
+ value_symlog_targets_two_hot_HxB,
682
+ shape=[Hm1, B, value_symlog_targets_two_hot_HxB.shape[-1]],
683
+ )
684
+
685
+ # Get (B x T x probs) tensor from return distributions.
686
+ value_symlog_logits_HxB = dream_data["values_symlog_dreamed_logits_t0_to_HxBxT"]
687
+ # Unfold time rank and cut last time index to match value targets.
688
+ value_symlog_logits_t0_to_Hm1_B = tf.reshape(
689
+ value_symlog_logits_HxB,
690
+ shape=[H, B, value_symlog_logits_HxB.shape[-1]],
691
+ )[:-1]
692
+
693
+ values_log_pred_Hm1_B = (
694
+ value_symlog_logits_t0_to_Hm1_B
695
+ - tf.math.reduce_logsumexp(
696
+ value_symlog_logits_t0_to_Hm1_B, axis=-1, keepdims=True
697
+ )
698
+ )
699
+ # Multiply with two-hot targets and neg.
700
+ value_loss_two_hot_H_B = -tf.reduce_sum(
701
+ values_log_pred_Hm1_B * value_symlog_targets_two_hot_t0_to_Hm1_B, axis=-1
702
+ )
703
+
704
+ # Compute EMA regularization loss.
705
+ # Expected values (dreamed) from the EMA (slow critic) net.
706
+ # Note: Slow critic (EMA) outputs are already stop_gradient'd.
707
+ value_symlog_ema_t0_to_Hm1_B = tf.stop_gradient(
708
+ dream_data["v_symlog_dreamed_ema_t0_to_H_BxT"]
709
+ )[:-1]
710
+ # Fold time rank (for two_hot'ing).
711
+ value_symlog_ema_HxB = tf.reshape(value_symlog_ema_t0_to_Hm1_B, (-1,))
712
+ value_symlog_ema_two_hot_HxB = two_hot(value_symlog_ema_HxB)
713
+ # Unfold time rank.
714
+ value_symlog_ema_two_hot_t0_to_Hm1_B = tf.reshape(
715
+ value_symlog_ema_two_hot_HxB,
716
+ shape=[Hm1, B, value_symlog_ema_two_hot_HxB.shape[-1]],
717
+ )
718
+
719
+ # Compute ema regularizer loss.
720
+ # In the paper, it is not described how exactly to form this regularizer term
721
+ # and how to weigh it.
722
+ # So we follow Danijar's repo here:
723
+ # `reg = -dist.log_prob(sg(self.slow(traj).mean()))`
724
+ # with a weight of 1.0, where dist is the bucket'ized distribution output by the
725
+ # fast critic. sg=stop gradient; mean() -> use the expected EMA values.
726
+ # Multiply with two-hot targets and neg.
727
+ ema_regularization_loss_H_B = -tf.reduce_sum(
728
+ values_log_pred_Hm1_B * value_symlog_ema_two_hot_t0_to_Hm1_B, axis=-1
729
+ )
730
+
731
+ L_critic_H_B = value_loss_two_hot_H_B + ema_regularization_loss_H_B
732
+
733
+ # Mask out everything that goes beyond a predicted continue=False boundary.
734
+ L_critic_H_B *= tf.stop_gradient(dream_data["dream_loss_weights_t0_to_H_BxT"])[
735
+ :-1
736
+ ]
737
+
738
+ # Reduce over both H- (time) axis and B-axis (mean).
739
+ L_critic = tf.reduce_mean(L_critic_H_B)
740
+
741
+ # Log important critic loss stats.
742
+ self.metrics.log_dict(
743
+ {
744
+ "CRITIC_L_total": L_critic,
745
+ "CRITIC_L_neg_logp_of_value_targets": tf.reduce_mean(
746
+ value_loss_two_hot_H_B
747
+ ),
748
+ "CRITIC_L_slow_critic_regularization": tf.reduce_mean(
749
+ ema_regularization_loss_H_B
750
+ ),
751
+ },
752
+ key=module_id,
753
+ window=1, # <- single items (should not be mean/ema-reduced over time).
754
+ )
755
+ if config.report_individual_batch_item_stats:
756
+ # Log important critic loss stats.
757
+ self.metrics.log_dict(
758
+ {
759
+ # Symlog'd value targets. Critic learns to predict symlog'd values.
760
+ "VALUE_TARGETS_symlog_H_BxT": value_symlog_targets_t0_to_Hm1_B,
761
+ # Critic loss terms.
762
+ "CRITIC_L_total_H_BxT": L_critic_H_B,
763
+ "CRITIC_L_neg_logp_of_value_targets_H_BxT": value_loss_two_hot_H_B,
764
+ "CRITIC_L_slow_critic_regularization_H_BxT": (
765
+ ema_regularization_loss_H_B
766
+ ),
767
+ },
768
+ key=module_id,
769
+ window=1, # <- single items (should not be mean/ema-reduced over time).
770
+ )
771
+
772
+ return L_critic
773
+
774
+ def _compute_value_targets(
775
+ self,
776
+ *,
777
+ config: DreamerV3Config,
778
+ rewards_t0_to_H_BxT: TensorType,
779
+ intrinsic_rewards_t1_to_H_BxT: TensorType,
780
+ continues_t0_to_H_BxT: TensorType,
781
+ value_predictions_t0_to_H_BxT: TensorType,
782
+ ) -> TensorType:
783
+ """Helper method computing the value targets.
784
+
785
+ All args are (H, BxT, ...) and in non-symlog'd (real) reward space.
786
+ Non-symlog is important b/c log(a+b) != log(a) + log(b).
787
+ See [1] eq. 8 and 10.
788
+ Thus, targets are always returned in real (non-symlog'd space).
789
+ They need to be re-symlog'd before computing the critic loss from them (b/c the
790
+ critic produces predictions in symlog space).
791
+ Note that the original B and T ranks together form the new batch dimension
792
+ (folded into BxT) and the new time rank is the dream horizon (hence: [H, BxT]).
793
+
794
+ Variable names nomenclature:
795
+ `H`=1+horizon_H (start state + H steps dreamed),
796
+ `BxT`=batch_size * batch_length (meaning the original trajectory time rank has
797
+ been folded).
798
+
799
+ Rewards, continues, and value predictions are all of shape [t0-H, BxT]
800
+ (time-major), whereas returned targets are [t0 to H-1, B] (last timestep missing
801
+ b/c the target value equals vf prediction in that location anyways.
802
+
803
+ Args:
804
+ config: The DreamerV3Config to use.
805
+ rewards_t0_to_H_BxT: The reward predictor's predictions over the
806
+ dreamed trajectory t0 to H (and for the batch BxT).
807
+ intrinsic_rewards_t1_to_H_BxT: The predicted intrinsic rewards over the
808
+ dreamed trajectory t0 to H (and for the batch BxT).
809
+ continues_t0_to_H_BxT: The continue predictor's predictions over the
810
+ dreamed trajectory t0 to H (and for the batch BxT).
811
+ value_predictions_t0_to_H_BxT: The critic's value predictions over the
812
+ dreamed trajectory t0 to H (and for the batch BxT).
813
+
814
+ Returns:
815
+ The value targets in the shape: [t0toH-1, BxT]. Note that the last step (H)
816
+ does not require a value target as it matches the critic's value prediction
817
+ anyways.
818
+ """
819
+ # The first reward is irrelevant (not used for any VF target).
820
+ rewards_t1_to_H_BxT = rewards_t0_to_H_BxT[1:]
821
+ if intrinsic_rewards_t1_to_H_BxT is not None:
822
+ rewards_t1_to_H_BxT += intrinsic_rewards_t1_to_H_BxT
823
+
824
+ # In all the following, when building value targets for t=1 to T=H,
825
+ # exclude rewards & continues for t=1 b/c we don't need r1 or c1.
826
+ # The target (R1) for V1 is built from r2, c2, and V2/R2.
827
+ discount = continues_t0_to_H_BxT[1:] * config.gamma # shape=[2-16, BxT]
828
+ Rs = [value_predictions_t0_to_H_BxT[-1]] # Rs indices=[16]
829
+ intermediates = (
830
+ rewards_t1_to_H_BxT
831
+ + discount * (1 - config.gae_lambda) * value_predictions_t0_to_H_BxT[1:]
832
+ )
833
+ # intermediates.shape=[2-16, BxT]
834
+
835
+ # Loop through reversed timesteps (axis=1) from T+1 to t=2.
836
+ for t in reversed(range(discount.shape[0])):
837
+ Rs.append(intermediates[t] + discount[t] * config.gae_lambda * Rs[-1])
838
+
839
+ # Reverse along time axis and cut the last entry (value estimate at very end
840
+ # cannot be learnt from as it's the same as the ... well ... value estimate).
841
+ targets_t0toHm1_BxT = tf.stack(list(reversed(Rs))[:-1], axis=0)
842
+ # targets.shape=[t0 to H-1,BxT]
843
+
844
+ return targets_t0toHm1_BxT
845
+
846
+ def _compute_scaled_value_targets(
847
+ self,
848
+ *,
849
+ module_id: ModuleID,
850
+ config: DreamerV3Config,
851
+ value_targets_t0_to_Hm1_BxT: TensorType,
852
+ value_predictions_t0_to_Hm1_BxT: TensorType,
853
+ ) -> TensorType:
854
+ """Helper method computing the scaled value targets.
855
+
856
+ Args:
857
+ module_id: The module_id to compute value targets for.
858
+ config: The DreamerV3Config to use.
859
+ value_targets_t0_to_Hm1_BxT: The value targets computed by
860
+ `self._compute_value_targets` in the shape of (t0 to H-1, BxT)
861
+ and of type float32.
862
+ value_predictions_t0_to_Hm1_BxT: The critic's value predictions over the
863
+ dreamed trajectories (w/o the last timestep). The shape of this
864
+ tensor is (t0 to H-1, BxT) and the type is float32.
865
+
866
+ Returns:
867
+ The scaled value targets used by the actor for REINFORCE policy updates
868
+ (using scaled advantages). See [1] eq. 12 for more details.
869
+ """
870
+ actor = self.module[module_id].actor
871
+
872
+ value_targets_H_B = value_targets_t0_to_Hm1_BxT
873
+ value_predictions_H_B = value_predictions_t0_to_Hm1_BxT
874
+
875
+ # Compute S: [1] eq. 12.
876
+ Per_R_5 = tfp.stats.percentile(value_targets_H_B, 5)
877
+ Per_R_95 = tfp.stats.percentile(value_targets_H_B, 95)
878
+
879
+ # Update EMA values for 5 and 95 percentile, stored as tf variables under actor
880
+ # network.
881
+ # 5 percentile
882
+ new_val_pct5 = tf.where(
883
+ tf.math.is_nan(actor.ema_value_target_pct5),
884
+ # is NaN: Initial values: Just set.
885
+ Per_R_5,
886
+ # Later update (something already stored in EMA variable): Update EMA.
887
+ (
888
+ config.return_normalization_decay * actor.ema_value_target_pct5
889
+ + (1.0 - config.return_normalization_decay) * Per_R_5
890
+ ),
891
+ )
892
+ actor.ema_value_target_pct5.assign(new_val_pct5)
893
+ # 95 percentile
894
+ new_val_pct95 = tf.where(
895
+ tf.math.is_nan(actor.ema_value_target_pct95),
896
+ # is NaN: Initial values: Just set.
897
+ Per_R_95,
898
+ # Later update (something already stored in EMA variable): Update EMA.
899
+ (
900
+ config.return_normalization_decay * actor.ema_value_target_pct95
901
+ + (1.0 - config.return_normalization_decay) * Per_R_95
902
+ ),
903
+ )
904
+ actor.ema_value_target_pct95.assign(new_val_pct95)
905
+
906
+ # [1] eq. 11 (first term).
907
+ offset = actor.ema_value_target_pct5
908
+ invscale = tf.math.maximum(
909
+ 1e-8, actor.ema_value_target_pct95 - actor.ema_value_target_pct5
910
+ )
911
+ scaled_value_targets_H_B = (value_targets_H_B - offset) / invscale
912
+ scaled_value_predictions_H_B = (value_predictions_H_B - offset) / invscale
913
+
914
+ # Return advantages.
915
+ return scaled_value_targets_H_B - scaled_value_predictions_H_B
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/dreamerv3_tf_rl_module.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ [1] Mastering Diverse Domains through World Models - 2023
3
+ D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap
4
+ https://arxiv.org/pdf/2301.04104v1.pdf
5
+
6
+ [2] Mastering Atari with Discrete World Models - 2021
7
+ D. Hafner, T. Lillicrap, M. Norouzi, J. Ba
8
+ https://arxiv.org/pdf/2010.02193.pdf
9
+ """
10
+ from ray.rllib.algorithms.dreamerv3.dreamerv3_rl_module import DreamerV3RLModule
11
+ from ray.rllib.core.rl_module.tf.tf_rl_module import TfRLModule
12
+ from ray.rllib.utils.framework import try_import_tf
13
+
14
+ tf1, tf, _ = try_import_tf()
15
+
16
+
17
class DreamerV3TfRLModule(TfRLModule, DreamerV3RLModule):
    """The tf-specific RLModule class for DreamerV3.

    Serves mainly as a thin-wrapper around the `DreamerModel` (a
    tf.keras.Model) class.
    """

    # Framework tag used by RLlib to identify this module's DL framework.
    framework = "tf2"
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/actor_network.py ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ [1] Mastering Diverse Domains through World Models - 2023
3
+ D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap
4
+ https://arxiv.org/pdf/2301.04104v1.pdf
5
+ """
6
+ import gymnasium as gym
7
+ from gymnasium.spaces import Box, Discrete
8
+ import numpy as np
9
+
10
+ from ray.rllib.algorithms.dreamerv3.tf.models.components.mlp import MLP
11
+ from ray.rllib.algorithms.dreamerv3.utils import (
12
+ get_gru_units,
13
+ get_num_z_categoricals,
14
+ get_num_z_classes,
15
+ )
16
+ from ray.rllib.utils.framework import try_import_tf, try_import_tfp
17
+
18
+ _, tf, _ = try_import_tf()
19
+ tfp = try_import_tfp()
20
+
21
+
22
class ActorNetwork(tf.keras.Model):
    """The `actor` (policy net) of DreamerV3.

    Consists of a simple MLP for Discrete actions and two MLPs for cont. actions (mean
    and stddev).
    Also contains two scalar variables to keep track of the percentile-5 and
    percentile-95 values of the computed value targets within a batch. This is used to
    compute the "scaled value targets" for actor learning. These two variables decay
    over time exponentially (see [1] for more details).
    """

    def __init__(
        self,
        *,
        model_size: str = "XS",
        action_space: gym.Space,
    ):
        """Initializes an ActorNetwork instance.

        Args:
            model_size: The "Model Size" used according to [1] Appendix B.
                Use None for manually setting the different network sizes.
            action_space: The action space of the environment used.

        Raises:
            ValueError: If `action_space` is neither Discrete nor Box.
        """
        super().__init__(name="actor")

        self.model_size = model_size
        self.action_space = action_space

        # The EMA decay variables used for the [Percentile(R, 95%) - Percentile(R, 5%)]
        # diff to scale value targets for the actor loss.
        # NaN-initialized so the loss code can detect "nothing stored yet" and
        # assign directly on the first update instead of EMA-blending.
        self.ema_value_target_pct5 = tf.Variable(
            np.nan, trainable=False, name="value_target_pct5"
        )
        self.ema_value_target_pct95 = tf.Variable(
            np.nan, trainable=False, name="value_target_pct95"
        )

        # For discrete actions, use a single MLP that computes logits.
        if isinstance(self.action_space, Discrete):
            self.mlp = MLP(
                model_size=self.model_size,
                output_layer_size=self.action_space.n,
                name="actor_mlp",
            )
        # For cont. actions, use separate MLPs for Gaussian mean and stddev.
        # TODO (sven): In the author's original code repo, this is NOT the case,
        # inputs are pushed through a shared MLP, then only the two output linear
        # layers are separate for std- and mean logits.
        elif isinstance(action_space, Box):
            output_layer_size = np.prod(action_space.shape)
            self.mlp = MLP(
                model_size=self.model_size,
                output_layer_size=output_layer_size,
                name="actor_mlp_mean",
            )
            self.std_mlp = MLP(
                model_size=self.model_size,
                output_layer_size=output_layer_size,
                name="actor_mlp_std",
            )
        else:
            raise ValueError(f"Invalid action space: {action_space}")

        # Trace self.call.
        # Fix the input signature to [B, gru_units] for h and
        # [B, num_categoricals, num_classes] for z so tf.function traces only once.
        dl_type = tf.keras.mixed_precision.global_policy().compute_dtype or tf.float32
        self.call = tf.function(
            input_signature=[
                tf.TensorSpec(shape=[None, get_gru_units(model_size)], dtype=dl_type),
                tf.TensorSpec(
                    shape=[
                        None,
                        get_num_z_categoricals(model_size),
                        get_num_z_classes(model_size),
                    ],
                    dtype=dl_type,
                ),
            ]
        )(self.call)

    def call(self, h, z):
        """Performs a forward pass through this policy network.

        Args:
            h: The deterministic hidden state of the sequence model. [B, dim(h)].
            z: The stochastic discrete representations of the original
                observation input. [B, num_categoricals, num_classes].

        Returns:
            Tuple of (sampled action, action distribution parameters).
        """
        # Flatten last two dims of z.
        assert len(z.shape) == 3
        z_shape = tf.shape(z)
        z = tf.reshape(z, shape=(z_shape[0], -1))
        assert len(z.shape) == 2
        out = tf.concat([h, z], axis=-1)
        # Restore the statically-known feature dimension (lost by the dynamic
        # reshape above) so downstream dense layers can build.
        out.set_shape(
            [
                None,
                (
                    get_num_z_categoricals(self.model_size)
                    * get_num_z_classes(self.model_size)
                    + get_gru_units(self.model_size)
                ),
            ]
        )
        # Send h-cat-z through MLP.
        action_logits = tf.cast(self.mlp(out), tf.float32)

        if isinstance(self.action_space, Discrete):
            action_probs = tf.nn.softmax(action_logits)

            # Add the unimix weighting (1% uniform) to the probs.
            # See [1]: "Unimix categoricals: We parameterize the categorical
            # distributions for the world model representations and dynamics, as well as
            # for the actor network, as mixtures of 1% uniform and 99% neural network
            # output to ensure a minimal amount of probability mass on every class and
            # thus keep log probabilities and KL divergences well behaved."
            action_probs = 0.99 * action_probs + 0.01 * (1.0 / self.action_space.n)

            # Danijar's code does: distr = [Distr class](logits=tf.log(probs)).
            # Not sure why we don't directly use the already available probs instead.
            action_logits = tf.math.log(action_probs)

            # Distribution parameters are the log(probs) directly.
            distr_params = action_logits
            distr = self.get_action_dist_object(distr_params)

            # Straight-through gradient trick: the one-hot sample itself is
            # non-differentiable; adding (probs - stop_grad(probs)) keeps the
            # forward value identical while routing gradients through the probs.
            action = tf.stop_gradient(distr.sample()) + (
                action_probs - tf.stop_gradient(action_probs)
            )

        elif isinstance(self.action_space, Box):
            # Send h-cat-z through MLP to compute stddev logits for Normal dist
            std_logits = tf.cast(self.std_mlp(out), tf.float32)
            # minstd, maxstd taken from [1] from configs.yaml
            minstd = 0.1
            maxstd = 1.0

            # Distribution parameters are the squashed std_logits and the tanh'd
            # mean logits.
            # squash std_logits from (-inf, inf) to (minstd, maxstd)
            std_logits = (maxstd - minstd) * tf.sigmoid(std_logits + 2.0) + minstd
            mean_logits = tf.tanh(action_logits)

            distr_params = tf.concat([mean_logits, std_logits], axis=-1)
            distr = self.get_action_dist_object(distr_params)

            action = distr.sample()

        return action, distr_params

    def get_action_dist_object(self, action_dist_params_T_B):
        """Helper method to create an action distribution object from (T, B, ..) params.

        Args:
            action_dist_params_T_B: The time-major action distribution parameters.
                This could be simply the logits (discrete) or a to-be-split-in-2
                tensor for mean and stddev (continuous).

        Returns:
            The tfp action distribution object, from which one can sample, compute
            log probs, entropy, etc..

        Raises:
            ValueError: If `self.action_space` is neither Discrete nor Box.
        """
        if isinstance(self.action_space, gym.spaces.Discrete):
            # Create the distribution object using the unimix'd logits.
            distr = tfp.distributions.OneHotCategorical(
                logits=action_dist_params_T_B,
                dtype=tf.float32,
            )

        elif isinstance(self.action_space, gym.spaces.Box):
            # Compute Normal distribution from action_logits and std_logits
            loc, scale = tf.split(action_dist_params_T_B, 2, axis=-1)
            distr = tfp.distributions.Normal(loc=loc, scale=scale)

            # If action_space is a box with multiple dims, make individual dims
            # independent.
            distr = tfp.distributions.Independent(distr, len(self.action_space.shape))

        else:
            raise ValueError(f"Action space {self.action_space} not supported!")

        return distr
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/cnn_atari.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ [1] Mastering Diverse Domains through World Models - 2023
3
+ D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap
4
+ https://arxiv.org/pdf/2301.04104v1.pdf
5
+ """
6
+ from typing import Optional
7
+
8
+ from ray.rllib.algorithms.dreamerv3.utils import get_cnn_multiplier
9
+ from ray.rllib.utils.framework import try_import_tf
10
+
11
+ _, tf, _ = try_import_tf()
12
+
13
+
14
+ class CNNAtari(tf.keras.Model):
15
+ """An image encoder mapping 64x64 RGB images via 4 CNN layers into a 1D space."""
16
+
17
+ def __init__(
18
+ self,
19
+ *,
20
+ model_size: Optional[str] = "XS",
21
+ cnn_multiplier: Optional[int] = None,
22
+ ):
23
+ """Initializes a CNNAtari instance.
24
+
25
+ Args:
26
+ model_size: The "Model Size" used according to [1] Appendix B.
27
+ Use None for manually setting the `cnn_multiplier`.
28
+ cnn_multiplier: Optional override for the additional factor used to multiply
29
+ the number of filters with each CNN layer. Starting with
30
+ 1 * `cnn_multiplier` filters in the first CNN layer, the number of
31
+ filters then increases via `2*cnn_multiplier`, `4*cnn_multiplier`, till
32
+ `8*cnn_multiplier`.
33
+ """
34
+ super().__init__(name="image_encoder")
35
+
36
+ cnn_multiplier = get_cnn_multiplier(model_size, override=cnn_multiplier)
37
+
38
+ # See appendix C in [1]:
39
+ # "We use a similar network architecture but employ layer normalization and
40
+ # SiLU as the activation function. For better framework support, we use
41
+ # same-padded convolutions with stride 2 and kernel size 3 instead of
42
+ # valid-padded convolutions with larger kernels ..."
43
+ # HOWEVER: In Danijar's DreamerV3 repo, kernel size=4 is used, so we use it
44
+ # here, too.
45
+ self.conv_layers = [
46
+ tf.keras.layers.Conv2D(
47
+ filters=1 * cnn_multiplier,
48
+ kernel_size=4,
49
+ strides=(2, 2),
50
+ padding="same",
51
+ # No bias or activation due to layernorm.
52
+ activation=None,
53
+ use_bias=False,
54
+ ),
55
+ tf.keras.layers.Conv2D(
56
+ filters=2 * cnn_multiplier,
57
+ kernel_size=4,
58
+ strides=(2, 2),
59
+ padding="same",
60
+ # No bias or activation due to layernorm.
61
+ activation=None,
62
+ use_bias=False,
63
+ ),
64
+ tf.keras.layers.Conv2D(
65
+ filters=4 * cnn_multiplier,
66
+ kernel_size=4,
67
+ strides=(2, 2),
68
+ padding="same",
69
+ # No bias or activation due to layernorm.
70
+ activation=None,
71
+ use_bias=False,
72
+ ),
73
+ # .. until output is 4 x 4 x [num_filters].
74
+ tf.keras.layers.Conv2D(
75
+ filters=8 * cnn_multiplier,
76
+ kernel_size=4,
77
+ strides=(2, 2),
78
+ padding="same",
79
+ # No bias or activation due to layernorm.
80
+ activation=None,
81
+ use_bias=False,
82
+ ),
83
+ ]
84
+ self.layer_normalizations = []
85
+ for _ in range(len(self.conv_layers)):
86
+ self.layer_normalizations.append(tf.keras.layers.LayerNormalization())
87
+ # -> 4 x 4 x num_filters -> now flatten.
88
+ self.flatten_layer = tf.keras.layers.Flatten(data_format="channels_last")
89
+
90
+ @tf.function(
91
+ input_signature=[
92
+ tf.TensorSpec(
93
+ shape=[None, 64, 64, 3],
94
+ dtype=tf.keras.mixed_precision.global_policy().compute_dtype
95
+ or tf.float32,
96
+ )
97
+ ]
98
+ )
99
+ def call(self, inputs):
100
+ """Performs a forward pass through the CNN Atari encoder.
101
+
102
+ Args:
103
+ inputs: The image inputs of shape (B, 64, 64, 3).
104
+ """
105
+ # [B, h, w] -> grayscale.
106
+ if len(inputs.shape) == 3:
107
+ inputs = tf.expand_dims(inputs, -1)
108
+ out = inputs
109
+ for conv_2d, layer_norm in zip(self.conv_layers, self.layer_normalizations):
110
+ out = tf.nn.silu(layer_norm(inputs=conv_2d(out)))
111
+ assert out.shape[1] == 4 and out.shape[2] == 4
112
+ return self.flatten_layer(out)
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/conv_transpose_atari.py ADDED
@@ -0,0 +1,187 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ [1] Mastering Diverse Domains through World Models - 2023
3
+ D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap
4
+ https://arxiv.org/pdf/2301.04104v1.pdf
5
+
6
+ [2] Mastering Atari with Discrete World Models - 2021
7
+ D. Hafner, T. Lillicrap, M. Norouzi, J. Ba
8
+ https://arxiv.org/pdf/2010.02193.pdf
9
+ """
10
+ from typing import Optional
11
+
12
+ import numpy as np
13
+
14
+ from ray.rllib.algorithms.dreamerv3.utils import (
15
+ get_cnn_multiplier,
16
+ get_gru_units,
17
+ get_num_z_categoricals,
18
+ get_num_z_classes,
19
+ )
20
+ from ray.rllib.utils.framework import try_import_tf
21
+
22
+ _, tf, _ = try_import_tf()
23
+
24
+
25
+ class ConvTransposeAtari(tf.keras.Model):
26
+ """A Conv2DTranspose decoder to generate Atari images from a latent space.
27
+
28
+ Wraps an initial single linear layer with a stack of 4 Conv2DTranspose layers (with
29
+ layer normalization) and a diag Gaussian, from which we then sample the final image.
30
+ Sampling is done with a fixed stddev=1.0 and using the mean values coming from the
31
+ last Conv2DTranspose layer.
32
+ """
33
+
34
+ def __init__(
35
+ self,
36
+ *,
37
+ model_size: Optional[str] = "XS",
38
+ cnn_multiplier: Optional[int] = None,
39
+ gray_scaled: bool,
40
+ ):
41
+ """Initializes a ConvTransposeAtari instance.
42
+
43
+ Args:
44
+ model_size: The "Model Size" used according to [1] Appendinx B.
45
+ Use None for manually setting the `cnn_multiplier`.
46
+ cnn_multiplier: Optional override for the additional factor used to multiply
47
+ the number of filters with each CNN transpose layer. Starting with
48
+ 8 * `cnn_multiplier` filters in the first CNN transpose layer, the
49
+ number of filters then decreases via `4*cnn_multiplier`,
50
+ `2*cnn_multiplier`, till `1*cnn_multiplier`.
51
+ gray_scaled: Whether the last Conv2DTranspose layer's output has only 1
52
+ color channel (gray_scaled=True) or 3 RGB channels (gray_scaled=False).
53
+ """
54
+ super().__init__(name="image_decoder")
55
+
56
+ self.model_size = model_size
57
+ cnn_multiplier = get_cnn_multiplier(self.model_size, override=cnn_multiplier)
58
+
59
+ # The shape going into the first Conv2DTranspose layer.
60
+ # We start with a 4x4 channels=8 "image".
61
+ self.input_dims = (4, 4, 8 * cnn_multiplier)
62
+
63
+ self.gray_scaled = gray_scaled
64
+
65
+ # See appendix B in [1]:
66
+ # "The decoder starts with a dense layer, followed by reshaping
67
+ # to 4 × 4 × C and then inverts the encoder architecture. ..."
68
+ self.dense_layer = tf.keras.layers.Dense(
69
+ units=int(np.prod(self.input_dims)),
70
+ activation=None,
71
+ use_bias=True,
72
+ )
73
+ # Inverse conv2d stack. See cnn_atari.py for corresponding Conv2D stack.
74
+ self.conv_transpose_layers = [
75
+ tf.keras.layers.Conv2DTranspose(
76
+ filters=4 * cnn_multiplier,
77
+ kernel_size=4,
78
+ strides=(2, 2),
79
+ padding="same",
80
+ # No bias or activation due to layernorm.
81
+ activation=None,
82
+ use_bias=False,
83
+ ),
84
+ tf.keras.layers.Conv2DTranspose(
85
+ filters=2 * cnn_multiplier,
86
+ kernel_size=4,
87
+ strides=(2, 2),
88
+ padding="same",
89
+ # No bias or activation due to layernorm.
90
+ activation=None,
91
+ use_bias=False,
92
+ ),
93
+ tf.keras.layers.Conv2DTranspose(
94
+ filters=1 * cnn_multiplier,
95
+ kernel_size=4,
96
+ strides=(2, 2),
97
+ padding="same",
98
+ # No bias or activation due to layernorm.
99
+ activation=None,
100
+ use_bias=False,
101
+ ),
102
+ ]
103
+ # Create one LayerNorm layer for each of the Conv2DTranspose layers.
104
+ self.layer_normalizations = []
105
+ for _ in range(len(self.conv_transpose_layers)):
106
+ self.layer_normalizations.append(tf.keras.layers.LayerNormalization())
107
+
108
+ # Important! No activation or layer norm for last layer as the outputs of
109
+ # this one go directly into the diag-gaussian as parameters.
110
+ self.output_conv2d_transpose = tf.keras.layers.Conv2DTranspose(
111
+ filters=1 if self.gray_scaled else 3,
112
+ kernel_size=4,
113
+ strides=(2, 2),
114
+ padding="same",
115
+ activation=None,
116
+ use_bias=True, # Last layer does use bias (b/c has no LayerNorm).
117
+ )
118
+ # .. until output is 64 x 64 x 3 (or 1 for self.gray_scaled=True).
119
+
120
+ # Trace self.call.
121
+ dl_type = tf.keras.mixed_precision.global_policy().compute_dtype or tf.float32
122
+ self.call = tf.function(
123
+ input_signature=[
124
+ tf.TensorSpec(shape=[None, get_gru_units(model_size)], dtype=dl_type),
125
+ tf.TensorSpec(
126
+ shape=[
127
+ None,
128
+ get_num_z_categoricals(model_size),
129
+ get_num_z_classes(model_size),
130
+ ],
131
+ dtype=dl_type,
132
+ ),
133
+ ]
134
+ )(self.call)
135
+
136
+ def call(self, h, z):
137
+ """Performs a forward pass through the Conv2D transpose decoder.
138
+
139
+ Args:
140
+ h: The deterministic hidden state of the sequence model.
141
+ z: The sequence of stochastic discrete representations of the original
142
+ observation input. Note: `z` is not used for the dynamics predictor
143
+ model (which predicts z from h).
144
+ """
145
+ # Flatten last two dims of z.
146
+ assert len(z.shape) == 3
147
+ z_shape = tf.shape(z)
148
+ z = tf.reshape(z, shape=(z_shape[0], -1))
149
+ assert len(z.shape) == 2
150
+ input_ = tf.concat([h, z], axis=-1)
151
+ input_.set_shape(
152
+ [
153
+ None,
154
+ (
155
+ get_num_z_categoricals(self.model_size)
156
+ * get_num_z_classes(self.model_size)
157
+ + get_gru_units(self.model_size)
158
+ ),
159
+ ]
160
+ )
161
+
162
+ # Feed through initial dense layer to get the right number of input nodes
163
+ # for the first conv2dtranspose layer.
164
+ out = self.dense_layer(input_)
165
+ # Reshape to image format.
166
+ out = tf.reshape(out, shape=(-1,) + self.input_dims)
167
+
168
+ # Pass through stack of Conv2DTransport layers (and layer norms).
169
+ for conv_transpose_2d, layer_norm in zip(
170
+ self.conv_transpose_layers, self.layer_normalizations
171
+ ):
172
+ out = tf.nn.silu(layer_norm(inputs=conv_transpose_2d(out)))
173
+ # Last output conv2d-transpose layer:
174
+ out = self.output_conv2d_transpose(out)
175
+ out += 0.5 # See Danijar's code
176
+ out_shape = tf.shape(out)
177
+
178
+ # Interpret output as means of a diag-Gaussian with std=1.0:
179
+ # From [2]:
180
+ # "Distributions: The image predictor outputs the mean of a diagonal Gaussian
181
+ # likelihood with unit variance, ..."
182
+
183
+ # Reshape `out` for the diagonal multi-variate Gaussian (each pixel is its own
184
+ # independent (b/c diagonal co-variance matrix) variable).
185
+ loc = tf.reshape(out, shape=(out_shape[0], -1))
186
+
187
+ return loc
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/components/vector_decoder.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ [1] Mastering Diverse Domains through World Models - 2023
3
+ D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap
4
+ https://arxiv.org/pdf/2301.04104v1.pdf
5
+ """
6
+ import gymnasium as gym
7
+
8
+ from ray.rllib.algorithms.dreamerv3.tf.models.components.mlp import MLP
9
+ from ray.rllib.algorithms.dreamerv3.utils import (
10
+ get_gru_units,
11
+ get_num_z_categoricals,
12
+ get_num_z_classes,
13
+ )
14
+ from ray.rllib.utils.framework import try_import_tf
15
+
16
+ _, tf, _ = try_import_tf()
17
+
18
+
19
class VectorDecoder(tf.keras.Model):
    """A simple vector decoder to reproduce non-image (1D vector) observations.

    Wraps an MLP for mean parameter computations and a Gaussian distribution,
    from which we then sample using these mean values and a fixed stddev of 1.0.
    """

    def __init__(
        self,
        *,
        model_size: str = "XS",
        observation_space: gym.Space,
    ):
        """Initializes a VectorDecoder instance.

        Args:
            model_size: The "Model Size" used according to [1] Appendinx B.
                Determines the exact size of the underlying MLP.
            observation_space: The observation space to decode back into. This must
                be a Box of shape (d,), where d >= 1.
        """
        super().__init__(name="vector_decoder")

        self.model_size = model_size

        # Only flat (rank-1) Box observation spaces are supported.
        assert (
            isinstance(observation_space, gym.spaces.Box)
            and len(observation_space.shape) == 1
        )

        self.mlp = MLP(
            model_size=model_size,
            output_layer_size=observation_space.shape[0],
        )

        # Trace self.call with a fixed input signature (single trace for
        # [B, gru_units] h and [B, num_categoricals, num_classes] z).
        dl_type = tf.keras.mixed_precision.global_policy().compute_dtype or tf.float32
        self.call = tf.function(
            input_signature=[
                tf.TensorSpec(shape=[None, get_gru_units(model_size)], dtype=dl_type),
                tf.TensorSpec(
                    shape=[
                        None,
                        get_num_z_categoricals(model_size),
                        get_num_z_classes(model_size),
                    ],
                    dtype=dl_type,
                ),
            ]
        )(self.call)

    def call(self, h, z):
        """Performs a forward pass through the vector encoder.

        Args:
            h: The deterministic hidden state of the sequence model. [B, dim(h)].
            z: The stochastic discrete representations of the original
                observation input. [B, num_categoricals, num_classes].

        Returns:
            The predicted observation means (no sampling), shape (B, d).
        """
        # Flatten last two dims of z.
        assert len(z.shape) == 3
        z = tf.reshape(z, shape=(tf.shape(z)[0], -1))
        assert len(z.shape) == 2
        hz = tf.concat([h, z], axis=-1)
        # Restore the statically known feature dim lost by the dynamic reshape.
        flat_dim = (
            get_num_z_categoricals(self.model_size)
            * get_num_z_classes(self.model_size)
            + get_gru_units(self.model_size)
        )
        hz.set_shape([None, flat_dim])

        # Send h-cat-z through MLP to get mean values of diag gaussian and
        # return only the predicted observations (mean, no sample).
        return self.mlp(hz)
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/critic_network.py ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ [1] Mastering Diverse Domains through World Models - 2023
3
+ D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap
4
+ https://arxiv.org/pdf/2301.04104v1.pdf
5
+ """
6
+ from ray.rllib.algorithms.dreamerv3.tf.models.components.mlp import MLP
7
+ from ray.rllib.algorithms.dreamerv3.tf.models.components.reward_predictor_layer import (
8
+ RewardPredictorLayer,
9
+ )
10
+ from ray.rllib.algorithms.dreamerv3.utils import (
11
+ get_gru_units,
12
+ get_num_z_categoricals,
13
+ get_num_z_classes,
14
+ )
15
+ from ray.rllib.utils.framework import try_import_tf
16
+
17
+ _, tf, _ = try_import_tf()
18
+
19
+
20
class CriticNetwork(tf.keras.Model):
    """The critic network described in [1], predicting values for policy learning.

    Contains a copy of itself (EMA net) for weight regularization.
    The EMA net is updated after each train step via EMA (using the `ema_decay`
    parameter and the actual critic's weights). The EMA net is NOT used for target
    computations (we use the actual critic for that), its only purpose is to compute a
    weights regularizer term for the critic's loss such that the actual critic does not
    move too quickly.
    """

    def __init__(
        self,
        *,
        model_size: str = "XS",
        num_buckets: int = 255,
        lower_bound: float = -20.0,
        upper_bound: float = 20.0,
        ema_decay: float = 0.98,
    ):
        """Initializes a CriticNetwork instance.

        Args:
            model_size: The "Model Size" used according to [1] Appendinx B.
                Use None for manually setting the different network sizes.
            num_buckets: The number of buckets to create. Note that the number of
                possible symlog'd outcomes from the used distribution is
                `num_buckets` + 1:
                lower_bound --bucket-- o[1] --bucket-- o[2] ... --bucket-- upper_bound
                o=outcomes
                lower_bound=o[0]
                upper_bound=o[num_buckets]
            lower_bound: The symlog'd lower bound for a possible reward value.
                Note that a value of -20.0 here already allows individual (actual env)
                rewards to be as low as -400M. Buckets will be created between
                `lower_bound` and `upper_bound`.
            upper_bound: The symlog'd upper bound for a possible reward value.
                Note that a value of +20.0 here already allows individual (actual env)
                rewards to be as high as 400M. Buckets will be created between
                `lower_bound` and `upper_bound`.
            ema_decay: The weight to use for updating the weights of the critic's copy
                vs the actual critic. After each training update, the EMA copy of the
                critic gets updated according to:
                ema_net=(`ema_decay`*ema_net) + (1.0-`ema_decay`)*critic_net
                The EMA copy of the critic is used inside the critic loss function only
                to produce a regularizer term against the current critic's weights, NOT
                to compute any target values.
        """
        super().__init__(name="critic")

        self.model_size = model_size
        self.ema_decay = ema_decay

        # "Fast" critic network(s) (mlp + reward-pred-layer). This is the network
        # we actually train with our critic loss.
        # IMPORTANT: We also use this to compute the return-targets, BUT we regularize
        # the critic loss term such that the weights of this fast critic stay close
        # to the EMA weights (see below).
        self.mlp = MLP(
            model_size=self.model_size,
            output_layer_size=None,
        )
        self.return_layer = RewardPredictorLayer(
            num_buckets=num_buckets,
            lower_bound=lower_bound,
            upper_bound=upper_bound,
        )

        # Weights-EMA (EWMA) containing networks for critic loss (similar to a
        # target net, BUT not used to compute anything, just for the
        # weights regularizer term inside the critic loss).
        self.mlp_ema = MLP(
            model_size=self.model_size,
            output_layer_size=None,
            trainable=False,
        )
        self.return_layer_ema = RewardPredictorLayer(
            num_buckets=num_buckets,
            lower_bound=lower_bound,
            upper_bound=upper_bound,
            trainable=False,
        )

        # Trace self.call with a fixed input signature: [B, gru_units] h,
        # [B, num_categoricals, num_classes] z, and a scalar bool `use_ema`.
        dl_type = tf.keras.mixed_precision.global_policy().compute_dtype or tf.float32
        self.call = tf.function(
            input_signature=[
                tf.TensorSpec(shape=[None, get_gru_units(model_size)], dtype=dl_type),
                tf.TensorSpec(
                    shape=[
                        None,
                        get_num_z_categoricals(model_size),
                        get_num_z_classes(model_size),
                    ],
                    dtype=dl_type,
                ),
                tf.TensorSpec(shape=[], dtype=tf.bool),
            ]
        )(self.call)

    def call(self, h, z, use_ema):
        """Performs a forward pass through the critic network.

        Args:
            h: The deterministic hidden state of the sequence model. [B, dim(h)].
            z: The stochastic discrete representations of the original
                observation input. [B, num_categoricals, num_classes].
            use_ema: Whether to use the EMA-copy of the critic instead of the actual
                critic to perform this computation.
        """
        # Flatten last two dims of z.
        assert len(z.shape) == 3
        z = tf.reshape(z, shape=(tf.shape(z)[0], -1))
        assert len(z.shape) == 2
        hz = tf.concat([h, z], axis=-1)
        # Restore the statically known feature dim lost by the dynamic reshape.
        flat_dim = (
            get_num_z_categoricals(self.model_size)
            * get_num_z_classes(self.model_size)
            + get_gru_units(self.model_size)
        )
        hz.set_shape([None, flat_dim])

        # Pick the EMA-copy or the fast critic and send h-cat-z through it.
        # Returns expected return OR (expected return, probs of bucket values).
        if use_ema:
            return self.return_layer_ema(self.mlp_ema(hz))
        else:
            return self.return_layer(self.mlp(hz))

    def _get_var_pairs(self):
        """Returns the list of (fast-critic var, EMA-copy var) pairs.

        Fast vars are the trainable variables of the critic's MLP and return
        layer; EMA vars are the (non-trainable) variables of their EMA copies.
        """
        fast_vars = self.mlp.trainable_variables + self.return_layer.trainable_variables
        ema_vars = self.mlp_ema.variables + self.return_layer_ema.variables
        assert len(fast_vars) == len(ema_vars) and len(fast_vars) > 0
        return list(zip(fast_vars, ema_vars))

    def init_ema(self) -> None:
        """Initializes the EMA-copy of the critic from the critic's weights.

        After calling this method, the two networks have identical weights.
        """
        for fast_var, ema_var in self._get_var_pairs():
            assert fast_var is not ema_var
            ema_var.assign(fast_var)

    def update_ema(self) -> None:
        """Updates the EMA-copy of the critic according to the update formula:

        ema_net=(`ema_decay`*ema_net) + (1.0-`ema_decay`)*critic_net
        """
        for fast_var, ema_var in self._get_var_pairs():
            ema_var.assign(
                self.ema_decay * ema_var + (1.0 - self.ema_decay) * fast_var
            )
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/disagree_networks.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ [1] Mastering Diverse Domains through World Models - 2023
3
+ D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap
4
+ https://arxiv.org/pdf/2301.04104v1.pdf
5
+ """
6
+
7
+ from ray.rllib.algorithms.dreamerv3.tf.models.components.mlp import MLP
8
+ from ray.rllib.algorithms.dreamerv3.tf.models.components.representation_layer import (
9
+ RepresentationLayer,
10
+ )
11
+ from ray.rllib.utils.framework import try_import_tf, try_import_tfp
12
+
13
+ _, tf, _ = try_import_tf()
14
+ tfp = try_import_tfp()
15
+
16
+
17
class DisagreeNetworks(tf.keras.Model):
    """Predict the RSSM's z^(t+1), given h(t), z^(t), and a(t).

    Disagreement (stddev) between the N networks in this model on what the next z^ would
    be are used to produce intrinsic rewards for enhanced, curiosity-based exploration.

    TODO
    """

    def __init__(self, *, num_networks, model_size, intrinsic_rewards_scale):
        """Initializes a DisagreeNetworks instance.

        Args:
            num_networks: Number of independent (MLP + representation layer)
                predictor nets whose disagreement is measured.
            model_size: The "Model Size" used according to [1] Appendix B.
            intrinsic_rewards_scale: Scale factor applied to the computed
                disagreement before returning it as intrinsic reward.
        """
        super().__init__(name="disagree_networks")

        self.model_size = model_size
        self.num_networks = num_networks
        self.intrinsic_rewards_scale = intrinsic_rewards_scale

        # N independent predictor nets: each is an MLP trunk followed by a
        # representation layer producing z-probabilities.
        self.mlps = [
            MLP(
                model_size=self.model_size,
                output_layer_size=None,
                trainable=True,
            )
            for _ in range(self.num_networks)
        ]
        self.representation_layers = [
            RepresentationLayer(model_size=self.model_size, name="disagree")
            for _ in range(self.num_networks)
        ]

    def call(self, inputs, z, a, training=None):
        """Keras entry point; delegates to `forward_train` (h passed as `inputs`)."""
        return self.forward_train(a=a, h=inputs, z=z)

    def compute_intrinsic_rewards(self, h, z, a):
        """Computes intrinsic rewards from the disagreement between the N nets.

        Args:
            h: The deterministic hidden states. [B, dim(h)].
            z: The stochastic discrete representations.
            a: The actions taken.

        Returns:
            Dict with "rewards_intrinsic" (scaled disagreement per batch item)
            and "forward_train_outs" (the raw forward pass outputs).
        """
        forward_train_outs = self.forward_train(a=a, h=h, z=z)
        batch_size = tf.shape(h)[0]

        # Intrinsic rewards are computed as:
        # Stddev (between the different nets) of the 32x32 discrete, stochastic
        # probabilities. Meaning that if the larger the disagreement
        # (stddev) between the nets on what the probabilities for the different
        # classes should be, the higher the intrinsic reward.
        probs_per_net = forward_train_outs["z_predicted_probs_N_HxB"]
        num_nets = len(probs_per_net)
        stacked_probs = tf.stack(probs_per_net, axis=0)
        # Flatten z-dims (num_categoricals x num_classes).
        stacked_probs = tf.reshape(stacked_probs, shape=(num_nets, batch_size, -1))

        # Compute stddevs over all disagree nets (axis=0), then mean over the
        # folded ([num categoricals] x [num classes]) axis.
        disagreement = tf.reduce_mean(
            tf.math.reduce_std(stacked_probs, axis=0),
            axis=-1,
        )
        # TEST:
        disagreement -= tf.reduce_mean(disagreement)
        # END TEST
        return {
            "rewards_intrinsic": disagreement * self.intrinsic_rewards_scale,
            "forward_train_outs": forward_train_outs,
        }

    def forward_train(self, a, h, z):
        """Runs all N predictor nets on stop-gradient'ed concat([h, z, a]).

        Args:
            a: The actions. [HxB, ...].
            h: The deterministic hidden states. [HxB, dim(h)].
            z: The stochastic discrete representations (z-dims get folded).

        Returns:
            Dict with key "z_predicted_probs_N_HxB": a list (len N) of predicted
            z-probability tensors, one per disagree net.
        """
        folded_size = tf.shape(h)[0]
        # Fold z-dims.
        z = tf.reshape(z, shape=(folded_size, -1))
        # Concat all input components (h, z, and a); gradients must not flow
        # back into the world model from the disagree nets.
        net_inputs = tf.stop_gradient(tf.concat([h, z, a], axis=-1))

        z_predicted_probs_N_HxB = [
            repr_layer(mlp(net_inputs))[1]  # [0]=sample; [1]=returned probs
            for mlp, repr_layer in zip(self.mlps, self.representation_layers)
        ]
        # shape=(N, HxB, [num categoricals], [num classes]); N=number of disagree nets.
        # HxB -> folded horizon_H x batch_size_B (from dreamed data).

        return {"z_predicted_probs_N_HxB": z_predicted_probs_N_HxB}
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/dreamer_model.py ADDED
@@ -0,0 +1,606 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ [1] Mastering Diverse Domains through World Models - 2023
3
+ D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap
4
+ https://arxiv.org/pdf/2301.04104v1.pdf
5
+ """
6
+ import re
7
+
8
+ import gymnasium as gym
9
+ import numpy as np
10
+
11
+ from ray.rllib.algorithms.dreamerv3.tf.models.disagree_networks import DisagreeNetworks
12
+ from ray.rllib.algorithms.dreamerv3.tf.models.actor_network import ActorNetwork
13
+ from ray.rllib.algorithms.dreamerv3.tf.models.critic_network import CriticNetwork
14
+ from ray.rllib.algorithms.dreamerv3.tf.models.world_model import WorldModel
15
+ from ray.rllib.algorithms.dreamerv3.utils import (
16
+ get_gru_units,
17
+ get_num_z_categoricals,
18
+ get_num_z_classes,
19
+ )
20
+ from ray.rllib.utils.framework import try_import_tf
21
+ from ray.rllib.utils.tf_utils import inverse_symlog
22
+
23
+ _, tf, _ = try_import_tf()
24
+
25
+
26
+ class DreamerModel(tf.keras.Model):
27
+ """The main tf-keras model containing all necessary components for DreamerV3.
28
+
29
+ Includes:
30
+ - The world model with encoder, decoder, sequence-model (RSSM), dynamics
31
+ (generates prior z-state), and "posterior" model (generates posterior z-state).
32
+ Predicts env dynamics and produces dreamed trajectories for actor- and critic
33
+ learning.
34
+ - The actor network (policy).
35
+ - The critic network for value function prediction.
36
+ """
37
+
38
    def __init__(
        self,
        *,
        model_size: str = "XS",
        action_space: gym.Space,
        world_model: WorldModel,
        actor: ActorNetwork,
        critic: CriticNetwork,
        horizon: int,
        gamma: float,
        use_curiosity: bool = False,
        intrinsic_rewards_scale: float = 0.1,
    ):
        """Initializes a DreamerModel instance.

        Args:
            model_size: The "Model Size" used according to [1] Appendix B.
                Use None for manually setting the different network sizes.
            action_space: The action space of the environment used.
            world_model: The WorldModel component.
            actor: The ActorNetwork component.
            critic: The CriticNetwork component.
            horizon: The dream horizon to use when creating dreamed trajectories.
            gamma: The discount factor; used in `dream_trajectory` to turn the
                predicted `continue` flags into per-timestep loss weights.
            use_curiosity: Whether to compute disagreement-based intrinsic
                rewards via an ensemble of `DisagreeNetworks`.
            intrinsic_rewards_scale: Scale for the intrinsic rewards (only
                used if `use_curiosity` is True).
        """
        super().__init__(name="dreamer_model")

        self.model_size = model_size
        self.action_space = action_space
        self.use_curiosity = use_curiosity

        self.world_model = world_model
        self.actor = actor
        self.critic = critic

        self.horizon = horizon
        self.gamma = gamma
        # Compute dtype follows the global keras mixed-precision policy
        # (falls back to float32 if no policy is configured).
        self._comp_dtype = (
            tf.keras.mixed_precision.global_policy().compute_dtype or tf.float32
        )

        self.disagree_nets = None
        if self.use_curiosity:
            self.disagree_nets = DisagreeNetworks(
                num_networks=8,
                model_size=self.model_size,
                intrinsic_rewards_scale=intrinsic_rewards_scale,
            )

        # Pre-trace `dream_trajectory` with a fixed input signature
        # (start_states dict of h: (B, gru_units) and
        # z: (B, num_categoricals, num_classes), plus a bool (B,) tensor for
        # `start_is_terminated`) to avoid tf.function retracing.
        self.dream_trajectory = tf.function(
            input_signature=[
                {
                    "h": tf.TensorSpec(
                        shape=[
                            None,
                            get_gru_units(self.model_size),
                        ],
                        dtype=self._comp_dtype,
                    ),
                    "z": tf.TensorSpec(
                        shape=[
                            None,
                            get_num_z_categoricals(self.model_size),
                            get_num_z_classes(self.model_size),
                        ],
                        dtype=self._comp_dtype,
                    ),
                },
                tf.TensorSpec(shape=[None], dtype=tf.bool),
            ]
        )(self.dream_trajectory)
108
+
109
    def call(
        self,
        inputs,
        observations,
        actions,
        is_first,
        start_is_terminated_BxT,
        gamma,
    ):
        """Main call method for building this model in order to generate its variables.

        Note: This method should NOT be used by users directly. Its purpose is only to
        perform all forward passes necessary to define all variables of the DreamerV3.

        Note: `inputs` and `gamma` are accepted for keras-`call` interface
        compatibility but are not used inside this method.
        """

        # Forward passes through all models are enough to build all trainable and
        # non-trainable variables:

        # World model.
        results = self.world_model.forward_train(
            observations,
            actions,
            is_first,
        )
        # Actor (builds actor variables; action-dist params are discarded).
        _, distr_params = self.actor(
            h=results["h_states_BxT"],
            z=results["z_posterior_states_BxT"],
        )
        # Critic.
        values, _ = self.critic(
            h=results["h_states_BxT"],
            z=results["z_posterior_states_BxT"],
            use_ema=tf.convert_to_tensor(False),
        )

        # Dream pipeline (builds everything reached via dreaming, e.g. the
        # EMA critic and - if enabled - the disagreement nets).
        dream_data = self.dream_trajectory(
            start_states={
                "h": results["h_states_BxT"],
                "z": results["z_posterior_states_BxT"],
            },
            start_is_terminated=start_is_terminated_BxT,
        )

        return {
            "world_model_fwd": results,
            "dream_data": dream_data,
            "actions": actions,
            "values": values,
        }
160
+
161
    @tf.function
    def forward_inference(self, observations, previous_states, is_first, training=None):
        """Performs a (non-exploring) action computation step given obs and states.

        Note that all input data should not have a time rank (only a batch dimension).

        Args:
            observations: The current environment observation with shape (B, ...).
            previous_states: Dict with keys `a`, `h`, and `z` used as input to the RSSM
                to produce the next h-state, from which then to compute the action
                using the actor network. All values in the dict should have shape
                (B, ...) (no time rank).
            is_first: Batch of is_first flags. These should be True if a new episode
                has been started at the current timestep (meaning `observations` is the
                reset observation from the environment).
            training: Unused; present for keras API compatibility.

        Returns:
            Tuple of (actions, next-states dict with keys `h`, `z`, and `a`).
        """
        # Perform one step in the world model (starting from `previous_state` and
        # using the observations to yield a current (posterior) state).
        states = self.world_model.forward_inference(
            observations=observations,
            previous_states=previous_states,
            is_first=is_first,
        )
        # Compute action using our actor network and the current states.
        _, distr_params = self.actor(h=states["h"], z=states["z"])
        # Use the mode of the distribution (Discrete=argmax, Normal=mean),
        # i.e. deterministic/greedy action selection (no exploration).
        distr = self.actor.get_action_dist_object(distr_params)
        actions = distr.mode()
        return actions, {"h": states["h"], "z": states["z"], "a": actions}
190
+
191
    @tf.function
    def forward_exploration(
        self, observations, previous_states, is_first, training=None
    ):
        """Performs an exploratory action computation step given obs and states.

        Note that all input data should not have a time rank (only a batch dimension).

        Args:
            observations: The current environment observation with shape (B, ...).
            previous_states: Dict with keys `a`, `h`, and `z` used as input to the RSSM
                to produce the next h-state, from which then to compute the action
                using the actor network. All values in the dict should have shape
                (B, ...) (no time rank).
            is_first: Batch of is_first flags. These should be True if a new episode
                has been started at the current timestep (meaning `observations` is the
                reset observation from the environment).
            training: Unused; present for keras API compatibility.

        Returns:
            Tuple of (actions, next-states dict with keys `h`, `z`, and `a`).
        """
        # Perform one step in the world model (starting from `previous_state` and
        # using the observations to yield a current (posterior) state).
        states = self.world_model.forward_inference(
            observations=observations,
            previous_states=previous_states,
            is_first=is_first,
        )
        # Compute action using our actor network and the current states.
        # Unlike `forward_inference`, the action here is the actor's (stochastic)
        # sample, which provides the exploration.
        actions, _ = self.actor(h=states["h"], z=states["z"])
        return actions, {"h": states["h"], "z": states["z"], "a": actions}
219
+
220
+ def forward_train(self, observations, actions, is_first):
221
+ """Performs a training forward pass given observations and actions.
222
+
223
+ Note that all input data must have a time rank (batch-major: [B, T, ...]).
224
+
225
+ Args:
226
+ observations: The environment observations with shape (B, T, ...). Thus,
227
+ the batch has B rows of T timesteps each. Note that it's ok to have
228
+ episode boundaries (is_first=True) within a batch row. DreamerV3 will
229
+ simply insert an initial state before these locations and continue the
230
+ sequence modelling (with the RSSM). Hence, there will be no zero
231
+ padding.
232
+ actions: The actions actually taken in the environment with shape
233
+ (B, T, ...). See `observations` docstring for details on how B and T are
234
+ handled.
235
+ is_first: Batch of is_first flags. These should be True:
236
+ - if a new episode has been started at the current timestep (meaning
237
+ `observations` is the reset observation from the environment).
238
+ - in each batch row at T=0 (first timestep of each of the B batch
239
+ rows), regardless of whether the actual env had an episode boundary
240
+ there or not.
241
+ """
242
+ return self.world_model.forward_train(
243
+ observations=observations,
244
+ actions=actions,
245
+ is_first=is_first,
246
+ )
247
+
248
    @tf.function
    def get_initial_state(self):
        """Returns the (current) initial state of the dreamer model (a, h-, z-states).

        An initial state is generated using the previous action, the tanh of the
        (learned) h-state variable and the dynamics predictor (or "prior net") to
        compute z^0 from h0. In this last step, it is important that we do NOT sample
        the z^-state (as we would usually do during dreaming), but rather take the mode
        (argmax, then one-hot again).

        Returns:
            Dict with keys `h`, `z` (from the world model) and `a` (a zeroed
            one-hot/flat action vector of batch size 1).
        """
        # h- and z-states come from the world model's learned initial state.
        states = self.world_model.get_initial_state()

        # Initial action is the all-zeros "one hot" vector (Discrete) or a
        # zeroed flat action vector (continuous spaces).
        action_dim = (
            self.action_space.n
            if isinstance(self.action_space, gym.spaces.Discrete)
            else np.prod(self.action_space.shape)
        )
        states["a"] = tf.zeros(
            (
                1,
                action_dim,
            ),
            dtype=tf.keras.mixed_precision.global_policy().compute_dtype or tf.float32,
        )
        return states
273
+
274
    def dream_trajectory(self, start_states, start_is_terminated):
        """Dreams trajectories of length H from batch of h- and z-states.

        Note that incoming data will have the shapes (BxT, ...), where the original
        batch- and time-dimensions are already folded together. Beginning from this
        new batch dim (BxT), we will unroll `timesteps_H` timesteps in a time-major
        fashion, such that the dreamed data will have shape (H, BxT, ...).

        Args:
            start_states: Dict of `h` and `z` states in the shape of (B, ...) and
                (B, num_categoricals, num_classes), respectively, as
                computed by a train forward pass. From each individual h-/z-state pair
                in the given batch, we will branch off a dreamed trajectory of len
                `timesteps_H`.
            start_is_terminated: Float flags of shape (B,) indicating whether the
                first timesteps of each batch row is already a terminated timestep
                (given by the actual environment).

        Returns:
            Dict of time-major (H+1, BxT, ...) dreamed tensors: h-states, prior
            z-states, rewards, continues, actions (plus dist params and - for
            Discrete spaces - int actions), value estimates (plus symlog'd
            logits and EMA values), and per-timestep dream loss weights. If
            curiosity is enabled, also intrinsic rewards (t1..H) and the
            disagree-nets' forward-train outputs.
        """
        # Dreamed actions (one-hot encoded for discrete actions).
        a_dreamed_t0_to_H = []
        a_dreamed_dist_params_t0_to_H = []

        h = start_states["h"]
        z = start_states["z"]

        # GRU outputs.
        h_states_t0_to_H = [h]
        # Dynamics model outputs.
        z_states_prior_t0_to_H = [z]

        # Compute `a` using actor network (already the first step uses a dreamed action,
        # not a sampled one).
        a, a_dist_params = self.actor(
            # We have to stop the gradients through the states. B/c we are using a
            # differentiable Discrete action distribution (straight through gradients
            # with `a = stop_gradient(sample(probs)) + probs - stop_gradient(probs)`,
            # we otherwise would add dependencies of the `-log(pi(a|s))` REINFORCE loss
            # term on actions further back in the trajectory.
            h=tf.stop_gradient(h),
            z=tf.stop_gradient(z),
        )
        a_dreamed_t0_to_H.append(a)
        a_dreamed_dist_params_t0_to_H.append(a_dist_params)

        for i in range(self.horizon):
            # Move one step in the dream using the RSSM.
            h = self.world_model.sequence_model(a=a, h=h, z=z)
            h_states_t0_to_H.append(h)

            # Compute prior z using dynamics model (no observations available
            # while dreaming).
            z, _ = self.world_model.dynamics_predictor(h=h)
            z_states_prior_t0_to_H.append(z)

            # Compute `a` using actor network.
            a, a_dist_params = self.actor(
                h=tf.stop_gradient(h),
                z=tf.stop_gradient(z),
            )
            a_dreamed_t0_to_H.append(a)
            a_dreamed_dist_params_t0_to_H.append(a_dist_params)

        h_states_H_B = tf.stack(h_states_t0_to_H, axis=0)  # (T, B, ...)
        h_states_HxB = tf.reshape(h_states_H_B, [-1] + h_states_H_B.shape.as_list()[2:])

        z_states_prior_H_B = tf.stack(z_states_prior_t0_to_H, axis=0)  # (T, B, ...)
        z_states_prior_HxB = tf.reshape(
            z_states_prior_H_B, [-1] + z_states_prior_H_B.shape.as_list()[2:]
        )

        a_dreamed_H_B = tf.stack(a_dreamed_t0_to_H, axis=0)  # (T, B, ...)
        a_dreamed_dist_params_H_B = tf.stack(a_dreamed_dist_params_t0_to_H, axis=0)

        # Compute r using reward predictor (predictions are in symlog space and
        # inverted back to the real reward scale here).
        r_dreamed_HxB, _ = self.world_model.reward_predictor(
            h=h_states_HxB, z=z_states_prior_HxB
        )
        r_dreamed_H_B = tf.reshape(
            inverse_symlog(r_dreamed_HxB), shape=[self.horizon + 1, -1]
        )

        # Compute intrinsic rewards.
        if self.use_curiosity:
            results_HxB = self.disagree_nets.compute_intrinsic_rewards(
                h=h_states_HxB,
                z=z_states_prior_HxB,
                a=tf.reshape(a_dreamed_H_B, [-1] + a_dreamed_H_B.shape.as_list()[2:]),
            )
            # TODO (sven): Wrong? -> Cut out last timestep as we always predict z-states
            # for the NEXT timestep and derive ri (for the NEXT timestep) from the
            # disagreement between our N disagreee nets.
            r_intrinsic_H_B = tf.reshape(
                results_HxB["rewards_intrinsic"], shape=[self.horizon + 1, -1]
            )[
                1:
            ]  # cut out first ts instead
            curiosity_forward_train_outs = results_HxB["forward_train_outs"]
            del results_HxB

        # Compute continues using continue predictor.
        c_dreamed_HxB, _ = self.world_model.continue_predictor(
            h=h_states_HxB,
            z=z_states_prior_HxB,
        )
        c_dreamed_H_B = tf.reshape(c_dreamed_HxB, [self.horizon + 1, -1])
        # Force-set first `continue` flags to False iff `start_is_terminated`.
        # Note: This will cause the loss-weights for this row in the batch to be
        # completely zero'd out. In general, we don't use dreamed data past any
        # predicted (or actual first) continue=False flags.
        c_dreamed_H_B = tf.concat(
            [
                1.0
                - tf.expand_dims(
                    tf.cast(start_is_terminated, tf.float32),
                    0,
                ),
                c_dreamed_H_B[1:],
            ],
            axis=0,
        )

        # Loss weights for each individual dreamed timestep. Zero-out all timesteps
        # that lie past continue=False flags. B/c our world model does NOT learn how
        # to skip terminal/reset episode boundaries, dreamed data crossing such a
        # boundary should not be used for critic/actor learning either.
        # (The division by gamma makes the very first step's weight equal to its
        # continue flag rather than gamma * continue.)
        dream_loss_weights_H_B = (
            tf.math.cumprod(self.gamma * c_dreamed_H_B, axis=0) / self.gamma
        )

        # Compute the value estimates (main critic, not the EMA copy).
        v, v_symlog_dreamed_logits_HxB = self.critic(
            h=h_states_HxB,
            z=z_states_prior_HxB,
            use_ema=False,
        )
        v_dreamed_HxB = inverse_symlog(v)
        v_dreamed_H_B = tf.reshape(v_dreamed_HxB, shape=[self.horizon + 1, -1])

        # Also compute values with the EMA (slow-moving) critic weights.
        v_symlog_dreamed_ema_HxB, _ = self.critic(
            h=h_states_HxB,
            z=z_states_prior_HxB,
            use_ema=True,
        )
        v_symlog_dreamed_ema_H_B = tf.reshape(
            v_symlog_dreamed_ema_HxB, shape=[self.horizon + 1, -1]
        )

        ret = {
            "h_states_t0_to_H_BxT": h_states_H_B,
            "z_states_prior_t0_to_H_BxT": z_states_prior_H_B,
            "rewards_dreamed_t0_to_H_BxT": r_dreamed_H_B,
            "continues_dreamed_t0_to_H_BxT": c_dreamed_H_B,
            "actions_dreamed_t0_to_H_BxT": a_dreamed_H_B,
            "actions_dreamed_dist_params_t0_to_H_BxT": a_dreamed_dist_params_H_B,
            "values_dreamed_t0_to_H_BxT": v_dreamed_H_B,
            "values_symlog_dreamed_logits_t0_to_HxBxT": v_symlog_dreamed_logits_HxB,
            "v_symlog_dreamed_ema_t0_to_H_BxT": v_symlog_dreamed_ema_H_B,
            # Loss weights for critic- and actor losses.
            "dream_loss_weights_t0_to_H_BxT": dream_loss_weights_H_B,
        }

        if self.use_curiosity:
            ret["rewards_intrinsic_t1_to_H_B"] = r_intrinsic_H_B
            ret.update(curiosity_forward_train_outs)

        if isinstance(self.action_space, gym.spaces.Discrete):
            ret["actions_ints_dreamed_t0_to_H_B"] = tf.argmax(a_dreamed_H_B, axis=-1)

        return ret
442
+
443
+ def dream_trajectory_with_burn_in(
444
+ self,
445
+ *,
446
+ start_states,
447
+ timesteps_burn_in: int,
448
+ timesteps_H: int,
449
+ observations, # [B, >=timesteps_burn_in]
450
+ actions, # [B, timesteps_burn_in (+timesteps_H)?]
451
+ use_sampled_actions_in_dream: bool = False,
452
+ use_random_actions_in_dream: bool = False,
453
+ ):
454
+ """Dreams trajectory from N initial observations and initial states.
455
+
456
+ Note: This is only used for reporting and debugging, not for actual world-model
457
+ or policy training.
458
+
459
+ Args:
460
+ start_states: The batch of start states (dicts with `a`, `h`, and `z` keys)
461
+ to begin dreaming with. These are used to compute the first h-state
462
+ using the sequence model.
463
+ timesteps_burn_in: For how many timesteps should be use the posterior
464
+ z-states (computed by the posterior net and actual observations from
465
+ the env)?
466
+ timesteps_H: For how many timesteps should we dream using the prior
467
+ z-states (computed by the dynamics (prior) net and h-states only)?
468
+ Note that the total length of the returned trajectories will
469
+ be `timesteps_burn_in` + `timesteps_H`.
470
+ observations: The batch (B, T, ...) of observations (to be used only during
471
+ burn-in over `timesteps_burn_in` timesteps).
472
+ actions: The batch (B, T, ...) of actions to use during a) burn-in over the
473
+ first `timesteps_burn_in` timesteps and - possibly - b) during
474
+ actual dreaming, iff use_sampled_actions_in_dream=True.
475
+ If applicable, actions must already be one-hot'd.
476
+ use_sampled_actions_in_dream: If True, instead of using our actor network
477
+ to compute fresh actions, we will use the one provided via the `actions`
478
+ argument. Note that in the latter case, the `actions` time dimension
479
+ must be at least `timesteps_burn_in` + `timesteps_H` long.
480
+ use_random_actions_in_dream: Whether to use randomly sampled actions in the
481
+ dream. Note that this does not apply to the burn-in phase, during which
482
+ we will always use the actions given in the `actions` argument.
483
+ """
484
+ assert not (use_sampled_actions_in_dream and use_random_actions_in_dream)
485
+
486
+ B = observations.shape[0]
487
+
488
+ # Produce initial N internal posterior states (burn-in) using the given
489
+ # observations:
490
+ states = start_states
491
+ for i in range(timesteps_burn_in):
492
+ states = self.world_model.forward_inference(
493
+ observations=observations[:, i],
494
+ previous_states=states,
495
+ is_first=tf.fill((B,), 1.0 if i == 0 else 0.0),
496
+ )
497
+ states["a"] = actions[:, i]
498
+
499
+ # Start producing the actual dream, using prior states and either the given
500
+ # actions, dreamed, or random ones.
501
+ h_states_t0_to_H = [states["h"]]
502
+ z_states_prior_t0_to_H = [states["z"]]
503
+ a_t0_to_H = [states["a"]]
504
+
505
+ for j in range(timesteps_H):
506
+ # Compute next h using sequence model.
507
+ h = self.world_model.sequence_model(
508
+ a=states["a"],
509
+ h=states["h"],
510
+ z=states["z"],
511
+ )
512
+ h_states_t0_to_H.append(h)
513
+ # Compute z from h, using the dynamics model (we don't have an actual
514
+ # observation at this timestep).
515
+ z, _ = self.world_model.dynamics_predictor(h=h)
516
+ z_states_prior_t0_to_H.append(z)
517
+
518
+ # Compute next dreamed action or use sampled one or random one.
519
+ if use_sampled_actions_in_dream:
520
+ a = actions[:, timesteps_burn_in + j]
521
+ elif use_random_actions_in_dream:
522
+ if isinstance(self.action_space, gym.spaces.Discrete):
523
+ a = tf.random.randint((B,), 0, self.action_space.n, tf.int64)
524
+ a = tf.one_hot(
525
+ a,
526
+ depth=self.action_space.n,
527
+ dtype=tf.keras.mixed_precision.global_policy().compute_dtype
528
+ or tf.float32,
529
+ )
530
+ # TODO: Support cont. action spaces with bound other than 0.0 and 1.0.
531
+ else:
532
+ a = tf.random.uniform(
533
+ shape=(B,) + self.action_space.shape,
534
+ dtype=self.action_space.dtype,
535
+ )
536
+ else:
537
+ a, _ = self.actor(h=h, z=z)
538
+ a_t0_to_H.append(a)
539
+
540
+ states = {"h": h, "z": z, "a": a}
541
+
542
+ # Fold time-rank for upcoming batch-predictions (no sequences needed anymore).
543
+ h_states_t0_to_H_B = tf.stack(h_states_t0_to_H, axis=0)
544
+ h_states_t0_to_HxB = tf.reshape(
545
+ h_states_t0_to_H_B, shape=[-1] + h_states_t0_to_H_B.shape.as_list()[2:]
546
+ )
547
+
548
+ z_states_prior_t0_to_H_B = tf.stack(z_states_prior_t0_to_H, axis=0)
549
+ z_states_prior_t0_to_HxB = tf.reshape(
550
+ z_states_prior_t0_to_H_B,
551
+ shape=[-1] + z_states_prior_t0_to_H_B.shape.as_list()[2:],
552
+ )
553
+
554
+ a_t0_to_H_B = tf.stack(a_t0_to_H, axis=0)
555
+
556
+ # Compute o using decoder.
557
+ o_dreamed_t0_to_HxB = self.world_model.decoder(
558
+ h=h_states_t0_to_HxB,
559
+ z=z_states_prior_t0_to_HxB,
560
+ )
561
+ if self.world_model.symlog_obs:
562
+ o_dreamed_t0_to_HxB = inverse_symlog(o_dreamed_t0_to_HxB)
563
+
564
+ # Compute r using reward predictor.
565
+ r_dreamed_t0_to_HxB, _ = self.world_model.reward_predictor(
566
+ h=h_states_t0_to_HxB,
567
+ z=z_states_prior_t0_to_HxB,
568
+ )
569
+ r_dreamed_t0_to_HxB = inverse_symlog(r_dreamed_t0_to_HxB)
570
+ # Compute continues using continue predictor.
571
+ c_dreamed_t0_to_HxB, _ = self.world_model.continue_predictor(
572
+ h=h_states_t0_to_HxB,
573
+ z=z_states_prior_t0_to_HxB,
574
+ )
575
+
576
+ # Return everything as time-major (H, B, ...), where H is the timesteps dreamed
577
+ # (NOT burn-in'd) and B is a batch dimension (this might or might not include
578
+ # an original time dimension from the real env, from all of which we then branch
579
+ # out our dream trajectories).
580
+ ret = {
581
+ "h_states_t0_to_H_BxT": h_states_t0_to_H_B,
582
+ "z_states_prior_t0_to_H_BxT": z_states_prior_t0_to_H_B,
583
+ # Unfold time-ranks in predictions.
584
+ "observations_dreamed_t0_to_H_BxT": tf.reshape(
585
+ o_dreamed_t0_to_HxB, [-1, B] + list(observations.shape)[2:]
586
+ ),
587
+ "rewards_dreamed_t0_to_H_BxT": tf.reshape(r_dreamed_t0_to_HxB, (-1, B)),
588
+ "continues_dreamed_t0_to_H_BxT": tf.reshape(c_dreamed_t0_to_HxB, (-1, B)),
589
+ }
590
+
591
+ # Figure out action key (random, sampled from env, dreamed?).
592
+ if use_sampled_actions_in_dream:
593
+ key = "actions_sampled_t0_to_H_BxT"
594
+ elif use_random_actions_in_dream:
595
+ key = "actions_random_t0_to_H_BxT"
596
+ else:
597
+ key = "actions_dreamed_t0_to_H_BxT"
598
+ ret[key] = a_t0_to_H_B
599
+
600
+ # Also provide int-actions, if discrete action space.
601
+ if isinstance(self.action_space, gym.spaces.Discrete):
602
+ ret[re.sub("^actions_", "actions_ints_", key)] = tf.argmax(
603
+ a_t0_to_H_B, axis=-1
604
+ )
605
+
606
+ return ret
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/dreamerv3/tf/models/world_model.py ADDED
@@ -0,0 +1,407 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ [1] Mastering Diverse Domains through World Models - 2023
3
+ D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap
4
+ https://arxiv.org/pdf/2301.04104v1.pdf
5
+ """
6
+ from typing import Optional
7
+
8
+ import gymnasium as gym
9
+ import tree # pip install dm_tree
10
+
11
+ from ray.rllib.algorithms.dreamerv3.tf.models.components.continue_predictor import (
12
+ ContinuePredictor,
13
+ )
14
+ from ray.rllib.algorithms.dreamerv3.tf.models.components.dynamics_predictor import (
15
+ DynamicsPredictor,
16
+ )
17
+ from ray.rllib.algorithms.dreamerv3.tf.models.components.mlp import MLP
18
+ from ray.rllib.algorithms.dreamerv3.tf.models.components.representation_layer import (
19
+ RepresentationLayer,
20
+ )
21
+ from ray.rllib.algorithms.dreamerv3.tf.models.components.reward_predictor import (
22
+ RewardPredictor,
23
+ )
24
+ from ray.rllib.algorithms.dreamerv3.tf.models.components.sequence_model import (
25
+ SequenceModel,
26
+ )
27
+ from ray.rllib.algorithms.dreamerv3.utils import get_gru_units
28
+ from ray.rllib.utils.framework import try_import_tf
29
+ from ray.rllib.utils.tf_utils import symlog
30
+
31
+
32
+ _, tf, _ = try_import_tf()
33
+
34
+
35
+ class WorldModel(tf.keras.Model):
36
+ """WorldModel component of [1] w/ encoder, decoder, RSSM, reward/cont. predictors.
37
+
38
+ See eq. 3 of [1] for all components and their respective in- and outputs.
39
+ Note that in the paper, the "encoder" includes both the raw encoder plus the
40
+ "posterior net", which produces posterior z-states from observations and h-states.
41
+
42
+ Note: The "internal state" of the world model always consists of:
43
+ The actions `a` (initially, this is a zeroed-out action), `h`-states (deterministic,
44
+ continuous), and `z`-states (stochastic, discrete).
45
+ There are two versions of z-states: "posterior" for world model training and "prior"
46
+ for creating the dream data.
47
+
48
+ Initial internal state values (`a`, `h`, and `z`) are inserted where ever a new
49
+ episode starts within a batch row OR at the beginning of each train batch's B rows,
50
+ regardless of whether there was an actual episode boundary or not. Thus, internal
51
+ states are not required to be stored in or retrieved from the replay buffer AND
52
+ retrieved batches from the buffer must not be zero padded.
53
+
54
+ Initial `a` is the zero "one hot" action, e.g. [0.0, 0.0] for Discrete(2), initial
55
+ `h` is a separate learned variable, and initial `z` are computed by the "dynamics"
56
+ (or "prior") net, using only the initial-h state as input.
57
+ """
58
+
59
    def __init__(
        self,
        *,
        model_size: str = "XS",
        observation_space: gym.Space,
        action_space: gym.Space,
        batch_length_T: int = 64,
        encoder: tf.keras.Model,
        decoder: tf.keras.Model,
        num_gru_units: Optional[int] = None,
        symlog_obs: bool = True,
    ):
        """Initializes a WorldModel instance.

        Args:
            model_size: The "Model Size" used according to [1] Appendix B.
                Use None for manually setting the different network sizes.
            observation_space: The observation space of the environment used.
            action_space: The action space of the environment used.
            batch_length_T: The length (T) of the sequences used for training. The
                actual shape of the input data (e.g. rewards) is then: [B, T, ...],
                where B is the "batch size", T is the "batch length" (this arg) and
                "..." is the dimension of the data (e.g. (64, 64, 3) for Atari image
                observations). Note that a single row (within a batch) may contain data
                from different episodes, but an already on-going episode is always
                finished, before a new one starts within the same row.
            encoder: The encoder Model taking observations as inputs and
                outputting a 1D latent vector that will be used as input into the
                posterior net (z-posterior state generating layer). Inputs are symlogged
                if inputs are NOT images. For images, we use normalization between -1.0
                and 1.0 (x / 128 - 1.0)
            decoder: The decoder Model taking h- and z-states as inputs and generating
                a (possibly symlogged) predicted observation. Note that for images,
                the last decoder layer produces the exact, normalized pixel values
                (not a Gaussian as described in [1]!).
            num_gru_units: The number of GRU units to use. If None, use
                `model_size` to figure out this parameter.
            symlog_obs: Whether to predict decoded observations in symlog space.
                This should be False for image based observations.
                According to the paper [1] Appendix E: "NoObsSymlog: This ablation
                removes the symlog encoding of inputs to the world model and also
                changes the symlog MSE loss in the decoder to a simple MSE loss.
                *Because symlog encoding is only used for vector observations*, this
                ablation is equivalent to DreamerV3 on purely image-based environments".
        """
        super().__init__(name="world_model")

        self.model_size = model_size
        self.batch_length_T = batch_length_T
        self.symlog_obs = symlog_obs
        self.observation_space = observation_space
        self.action_space = action_space
        # Compute dtype follows the global keras mixed-precision policy
        # (falls back to float32 if no policy is configured).
        self._comp_dtype = (
            tf.keras.mixed_precision.global_policy().compute_dtype or tf.float32
        )

        # Encoder (latent 1D vector generator) (xt -> lt).
        self.encoder = encoder

        # Posterior predictor consisting of an MLP and a RepresentationLayer:
        # [ht, lt] -> zt.
        self.posterior_mlp = MLP(
            model_size=self.model_size,
            output_layer_size=None,
            # In Danijar's code, the posterior predictor only has a single layer,
            # no matter the model size:
            num_dense_layers=1,
            name="posterior_mlp",
        )
        # The (posterior) z-state generating layer.
        self.posterior_representation_layer = RepresentationLayer(
            model_size=self.model_size,
        )

        # Dynamics (prior z-state) predictor: ht -> z^t
        self.dynamics_predictor = DynamicsPredictor(model_size=self.model_size)

        # GRU for the RSSM: [at, ht, zt] -> ht+1
        self.num_gru_units = get_gru_units(
            model_size=self.model_size,
            override=num_gru_units,
        )
        # Initial h-state variable (learnt).
        # -> tanh(self.initial_h) -> deterministic state
        # Use our Dynamics predictor for initial stochastic state, BUT with greedy
        # (mode) instead of sampling.
        self.initial_h = tf.Variable(
            tf.zeros(shape=(self.num_gru_units,)),
            trainable=True,
            name="initial_h",
        )
        # The actual sequence model containing the GRU layer.
        self.sequence_model = SequenceModel(
            model_size=self.model_size,
            action_space=self.action_space,
            num_gru_units=self.num_gru_units,
        )

        # Reward Predictor: [ht, zt] -> rt.
        self.reward_predictor = RewardPredictor(model_size=self.model_size)
        # Continue Predictor: [ht, zt] -> ct.
        self.continue_predictor = ContinuePredictor(model_size=self.model_size)

        # Decoder: [ht, zt] -> x^t.
        self.decoder = decoder

        # Trace self.call.
        # Signature: observations (B, T, obs-shape), actions (B, T, act-dim;
        # one-hot size for Discrete spaces), is_first (B, T) bool.
        self.forward_train = tf.function(
            input_signature=[
                tf.TensorSpec(shape=[None, None] + list(self.observation_space.shape)),
                tf.TensorSpec(
                    shape=[None, None]
                    + (
                        [self.action_space.n]
                        if isinstance(action_space, gym.spaces.Discrete)
                        else list(self.action_space.shape)
                    )
                ),
                tf.TensorSpec(shape=[None, None], dtype=tf.bool),
            ]
        )(self.forward_train)
180
+
181
+ @tf.function
182
+ def get_initial_state(self):
183
+ """Returns the (current) initial state of the world model (h- and z-states).
184
+
185
+ An initial state is generated using the tanh of the (learned) h-state variable
186
+ and the dynamics predictor (or "prior net") to compute z^0 from h0. In this last
187
+ step, it is important that we do NOT sample the z^-state (as we would usually
188
+ do during dreaming), but rather take the mode (argmax, then one-hot again).
189
+ """
190
+ h = tf.expand_dims(tf.math.tanh(tf.cast(self.initial_h, self._comp_dtype)), 0)
191
+ # Use the mode, NOT a sample for the initial z-state.
192
+ _, z_probs = self.dynamics_predictor(h)
193
+ z = tf.argmax(z_probs, axis=-1)
194
+ z = tf.one_hot(z, depth=z_probs.shape[-1], dtype=self._comp_dtype)
195
+
196
+ return {"h": h, "z": z}
197
+
198
    def forward_inference(self, observations, previous_states, is_first, training=None):
        """Performs a forward step for inference (e.g. environment stepping).

        Works analogous to `forward_train`, except that all inputs are provided
        for a single timestep in the shape of [B, ...] (no time dimension!).

        Args:
            observations: The batch (B, ...) of observations to be passed through
                the encoder network to yield the inputs to the representation layer
                (which then can compute the z-states).
            previous_states: A dict with `h`, `z`, and `a` keys mapping to the
                respective previous states/actions. All of the shape (B, ...), no time
                rank.
            is_first: The batch (B) of `is_first` flags.
            training: Unused; present only for Keras call-signature compatibility.

        Returns:
            A dict with keys "h" (the next deterministic state, as predicted by the
            sequence model) and "z" (the posterior stochastic state, sampled from
            `observations` conditioned on the new h).
        """
        observations = tf.cast(observations, self._comp_dtype)

        # Tile the (batch-size 1) learnt initial state up to the actual batch size.
        initial_states = tree.map_structure(
            lambda s: tf.repeat(s, tf.shape(observations)[0], axis=0),
            self.get_initial_state(),
        )

        # Rows flagged is_first start a new episode: replace their previous h-/z-
        # states with the learnt initial state (zero out, then add the init values).
        previous_h = self._mask(previous_states["h"], 1.0 - is_first)  # zero out
        previous_h = previous_h + self._mask(initial_states["h"], is_first)  # add init

        previous_z = self._mask(previous_states["z"], 1.0 - is_first)  # zero out
        previous_z = previous_z + self._mask(initial_states["z"], is_first)  # add init

        # Zero out actions (no special learnt initial state).
        previous_a = self._mask(previous_states["a"], 1.0 - is_first)

        # Compute new states.
        h = self.sequence_model(a=previous_a, h=previous_h, z=previous_z)
        z = self.compute_posterior_z(observations=observations, initial_h=h)

        return {"h": h, "z": z}
238
+
239
    def forward_train(self, observations, actions, is_first):
        """Performs a forward step for training.

        1) Forwards all observations [B, T, ...] through the encoder network to yield
        o_processed[B, T, ...].
        2) Uses initial state (h0/z^0/a0[B, 0, ...]) and sequence model (RSSM) to
        compute the first internal state (h1 and z^1).
        3) Uses action a[B, 1, ...], z[B, 1, ...] and h[B, 1, ...] to compute the
        next h-state (h[B, 2, ...]), etc..
        4) Repeats 2) and 3) until t=T.
        5) Uses all h[B, T, ...] and z[B, T, ...] to compute predicted/reconstructed
        observations, rewards, and continue signals.
        6) Returns predictions from 5) along with all z-states z[B, T, ...] and
        the final h-state (h[B, ...] for t=T).

        Should we encounter is_first=True flags in the middle of a batch row (somewhere
        within an ongoing sequence of length T), we insert this world model's initial
        state again (zero-action, learned init h-state, and prior-computed z^) and
        simply continue (no zero-padding).

        Args:
            observations: The batch (B, T, ...) of observations to be passed through
                the encoder network to yield the inputs to the representation layer
                (which then can compute the posterior z-states).
            actions: The batch (B, T, ...) of actions to be used in combination with
                h-states and computed z-states to yield the next h-states.
            is_first: The batch (B, T) of `is_first` flags.

        Returns:
            A dict of loss-ready tensors, each with batch and time ranks already
            folded into one ([B*T, ...]): the (possibly symlog'd) input observations
            and the decoder's reconstruction means, reward logits/values, continue
            distribution/samples, all h-states, the sampled posterior z-states with
            their probs, and the prior z-probs.
        """
        if self.symlog_obs:
            observations = symlog(observations)

        # Compute bare encoder outs (not z; this is done later with involvement of the
        # sequence model and the h-states).
        # Fold time dimension for CNN pass.
        shape = tf.shape(observations)
        B, T = shape[0], shape[1]
        observations = tf.reshape(
            observations, shape=tf.concat([[-1], shape[2:]], axis=0)
        )

        encoder_out = self.encoder(tf.cast(observations, self._comp_dtype))
        # Unfold time dimension.
        encoder_out = tf.reshape(
            encoder_out, shape=tf.concat([[B, T], tf.shape(encoder_out)[1:]], axis=0)
        )
        # Make time major for faster upcoming loop.
        encoder_out = tf.transpose(
            encoder_out, perm=[1, 0] + list(range(2, len(encoder_out.shape.as_list())))
        )
        # encoder_out=[T, B, ...]

        # Tile the (batch-size 1) learnt initial state up to batch size B.
        initial_states = tree.map_structure(
            lambda s: tf.repeat(s, B, axis=0), self.get_initial_state()
        )

        # Make actions and `is_first` time-major.
        # `tf.shape(actions).shape.as_list()[0]` is the (static) rank of `actions`.
        actions = tf.transpose(
            tf.cast(actions, self._comp_dtype),
            perm=[1, 0] + list(range(2, tf.shape(actions).shape.as_list()[0])),
        )
        is_first = tf.transpose(tf.cast(is_first, self._comp_dtype), perm=[1, 0])

        # Loop through the T-axis of our samples and perform one computation step at
        # a time. This is necessary because the sequence model's output (h(t+1)) depends
        # on the current z(t), but z(t) depends on the current sequence model's output
        # h(t).
        z_t0_to_T = [initial_states["z"]]
        z_posterior_probs = []
        z_prior_probs = []
        h_t0_to_T = [initial_states["h"]]
        for t in range(self.batch_length_T):
            # If first, mask it with initial state/actions.
            h_tm1 = self._mask(h_t0_to_T[-1], 1.0 - is_first[t])  # zero out
            h_tm1 = h_tm1 + self._mask(initial_states["h"], is_first[t])  # add init

            z_tm1 = self._mask(z_t0_to_T[-1], 1.0 - is_first[t])  # zero out
            z_tm1 = z_tm1 + self._mask(initial_states["z"], is_first[t])  # add init

            # Zero out actions (no special learnt initial state).
            # NOTE(review): at t=0 this reads actions[-1] (wrap-around to the LAST
            # timestep's action); presumably is_first[0] is always set at sequence
            # starts so this wrapped value gets zeroed out — confirm with the
            # episode/batch sampling code.
            a_tm1 = self._mask(actions[t - 1], 1.0 - is_first[t])

            # Perform one RSSM (sequence model) step to get the current h.
            h_t = self.sequence_model(a=a_tm1, h=h_tm1, z=z_tm1)
            h_t0_to_T.append(h_t)

            posterior_mlp_input = tf.concat([encoder_out[t], h_t], axis=-1)
            repr_input = self.posterior_mlp(posterior_mlp_input)
            # Draw one z-sample (z(t)) and also get the z-distribution for dynamics and
            # representation loss computations.
            z_t, z_probs = self.posterior_representation_layer(repr_input)
            # z_t=[B, num_categoricals, num_classes]
            z_posterior_probs.append(z_probs)
            z_t0_to_T.append(z_t)

            # Compute the predicted z_t (z^) using the dynamics model.
            _, z_probs = self.dynamics_predictor(h_t)
            z_prior_probs.append(z_probs)

        # Stack at time dimension to yield: [B, T, ...].
        # Note: index 0 (the initial state) is dropped; only t=1..T is returned.
        h_t1_to_T = tf.stack(h_t0_to_T[1:], axis=1)
        z_t1_to_T = tf.stack(z_t0_to_T[1:], axis=1)

        # Fold time axis to retrieve the final (loss ready) Independent distribution
        # (over `num_categoricals` Categoricals).
        z_posterior_probs = tf.stack(z_posterior_probs, axis=1)
        z_posterior_probs = tf.reshape(
            z_posterior_probs,
            shape=[-1] + z_posterior_probs.shape.as_list()[2:],
        )
        # Fold time axis to retrieve the final (loss ready) Independent distribution
        # (over `num_categoricals` Categoricals).
        z_prior_probs = tf.stack(z_prior_probs, axis=1)
        z_prior_probs = tf.reshape(
            z_prior_probs,
            shape=[-1] + z_prior_probs.shape.as_list()[2:],
        )

        # Fold time dimension for parallelization of all dependent predictions:
        # observations (reproduction via decoder), rewards, continues.
        h_BxT = tf.reshape(h_t1_to_T, shape=[-1] + h_t1_to_T.shape.as_list()[2:])
        z_BxT = tf.reshape(z_t1_to_T, shape=[-1] + z_t1_to_T.shape.as_list()[2:])

        # Decoder means are cast to float32 (loss computation precision), regardless
        # of the mixed-precision compute dtype.
        obs_distribution_means = tf.cast(self.decoder(h=h_BxT, z=z_BxT), tf.float32)

        # Compute (predicted) reward distributions.
        rewards, reward_logits = self.reward_predictor(h=h_BxT, z=z_BxT)

        # Compute (predicted) continue distributions.
        continues, continue_distribution = self.continue_predictor(h=h_BxT, z=z_BxT)

        # Return outputs for loss computation.
        # Note that all shapes are [BxT, ...] (time axis already folded).
        return {
            # Obs.
            "sampled_obs_symlog_BxT": observations,
            "obs_distribution_means_BxT": obs_distribution_means,
            # Rewards.
            "reward_logits_BxT": reward_logits,
            "rewards_BxT": rewards,
            # Continues.
            "continue_distribution_BxT": continue_distribution,
            "continues_BxT": continues,
            # Deterministic, continuous h-states (t1 to T).
            "h_states_BxT": h_BxT,
            # Sampled, discrete posterior z-states and their probs (t1 to T).
            "z_posterior_states_BxT": z_BxT,
            "z_posterior_probs_BxT": z_posterior_probs,
            # Probs of the prior z-states (t1 to T).
            "z_prior_probs_BxT": z_prior_probs,
        }
389
+
390
+ def compute_posterior_z(self, observations, initial_h):
391
+ # Compute bare encoder outputs (not including z, which is computed in next step
392
+ # with involvement of the previous output (initial_h) of the sequence model).
393
+ # encoder_outs=[B, ...]
394
+ if self.symlog_obs:
395
+ observations = symlog(observations)
396
+ encoder_out = self.encoder(observations)
397
+ # Concat encoder outs with the h-states.
398
+ posterior_mlp_input = tf.concat([encoder_out, initial_h], axis=-1)
399
+ # Compute z.
400
+ repr_input = self.posterior_mlp(posterior_mlp_input)
401
+ # Draw a z-sample.
402
+ z_t, _ = self.posterior_representation_layer(repr_input)
403
+ return z_t
404
+
405
+ @staticmethod
406
+ def _mask(value, mask):
407
+ return tf.einsum("b...,b->b...", value, tf.cast(mask, value.dtype))
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/ppo/__init__.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Public entry points for RLlib's PPO algorithm.

Re-exports the PPO algorithm/config plus the (old-API-stack) TF and Torch
policy classes so users can import them directly from this package.
"""

from ray.rllib.algorithms.ppo.ppo import PPOConfig, PPO
from ray.rllib.algorithms.ppo.ppo_tf_policy import PPOTF1Policy, PPOTF2Policy
from ray.rllib.algorithms.ppo.ppo_torch_policy import PPOTorchPolicy

__all__ = [
    "PPO",
    "PPOConfig",
    # @OldAPIStack
    "PPOTF1Policy",
    "PPOTF2Policy",
    "PPOTorchPolicy",
]
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/ppo/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (568 Bytes). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/ppo/__pycache__/default_ppo_rl_module.cpython-311.pyc ADDED
Binary file (3.41 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/ppo/__pycache__/ppo.cpython-311.pyc ADDED
Binary file (23.6 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/ppo/__pycache__/ppo_catalog.cpython-311.pyc ADDED
Binary file (8.73 kB). View file