Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +1 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/__pycache__/algorithm_config.cpython-311.pyc +3 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/algorithms/__init__.py +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/algorithms/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/algorithms/__pycache__/vpg_custom_algorithm.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/algorithms/classes/__init__.py +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/algorithms/classes/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/algorithms/classes/__pycache__/vpg.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/algorithms/classes/vpg.py +176 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/algorithms/vpg_custom_algorithm.py +117 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/__init__.py +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/__pycache__/continue_training_from_checkpoint.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/__pycache__/onnx_tf.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/__pycache__/onnx_torch.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/__pycache__/onnx_torch_lstm.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/cartpole_dqn_export.py +77 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/change_config_during_training.py +246 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/checkpoint_by_custom_criteria.py +146 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/continue_training_from_checkpoint.py +268 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/onnx_tf.py +91 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/onnx_torch.py +79 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/onnx_torch_lstm.py +136 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/restore_1_of_n_agents_from_checkpoint.py +171 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/curiosity/__init__.py +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/curiosity/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/curiosity/__pycache__/count_based_curiosity.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/curiosity/__pycache__/euclidian_distance_based_curiosity.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/curiosity/__pycache__/intrinsic_curiosity_model_based_curiosity.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/curiosity/count_based_curiosity.py +137 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/curiosity/euclidian_distance_based_curiosity.py +127 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/curiosity/intrinsic_curiosity_model_based_curiosity.py +313 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/ray_tune/__init__.py +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/ray_tune/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/ray_tune/__pycache__/custom_experiment.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/ray_tune/__pycache__/custom_logger.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/ray_tune/__pycache__/custom_progress_reporter.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/ray_tune/custom_experiment.py +183 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/ray_tune/custom_logger.py +137 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/ray_tune/custom_progress_reporter.py +119 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/__init__.py +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/__pycache__/action_masking_rl_module.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/__pycache__/custom_cnn_rl_module.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/__pycache__/custom_lstm_rl_module.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/__pycache__/migrate_modelv2_to_new_api_stack_by_config.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/__pycache__/migrate_modelv2_to_new_api_stack_by_policy_checkpoint.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/__pycache__/pretraining_single_agent_training_multi_agent.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/action_masking_rl_module.py +127 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__init__.py +10 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__pycache__/__init__.cpython-311.pyc +0 -0
.gitattributes
CHANGED
|
@@ -177,3 +177,4 @@ tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/_
|
|
| 177 |
.venv/lib/python3.11/site-packages/ray/_private/__pycache__/test_utils.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
|
| 178 |
.venv/lib/python3.11/site-packages/ray/_private/thirdparty/pynvml/__pycache__/pynvml.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
|
| 179 |
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/__pycache__/algorithm.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 177 |
.venv/lib/python3.11/site-packages/ray/_private/__pycache__/test_utils.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
|
| 178 |
.venv/lib/python3.11/site-packages/ray/_private/thirdparty/pynvml/__pycache__/pynvml.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
|
| 179 |
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/__pycache__/algorithm.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
|
| 180 |
+
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/__pycache__/algorithm_config.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/__pycache__/algorithm_config.cpython-311.pyc
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e9c834df6257a1158af124427e411086f3fcc8eb2ea4c080f29143c4a418c67c
|
| 3 |
+
size 250369
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/algorithms/__init__.py
ADDED
|
File without changes
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/algorithms/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (202 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/algorithms/__pycache__/vpg_custom_algorithm.cpython-311.pyc
ADDED
|
Binary file (4.87 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/algorithms/classes/__init__.py
ADDED
|
File without changes
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/algorithms/classes/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (210 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/algorithms/classes/__pycache__/vpg.cpython-311.pyc
ADDED
|
Binary file (9.03 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/algorithms/classes/vpg.py
ADDED
|
@@ -0,0 +1,176 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import tree # pip install dm_tree
|
| 2 |
+
|
| 3 |
+
from ray.rllib.algorithms import Algorithm
|
| 4 |
+
from ray.rllib.algorithms.algorithm_config import AlgorithmConfig, NotProvided
|
| 5 |
+
from ray.rllib.core.rl_module.rl_module import RLModuleSpec
|
| 6 |
+
from ray.rllib.utils.annotations import override
|
| 7 |
+
from ray.rllib.utils.metrics import (
|
| 8 |
+
ENV_RUNNER_RESULTS,
|
| 9 |
+
ENV_RUNNER_SAMPLING_TIMER,
|
| 10 |
+
LEARNER_RESULTS,
|
| 11 |
+
LEARNER_UPDATE_TIMER,
|
| 12 |
+
NUM_ENV_STEPS_SAMPLED_LIFETIME,
|
| 13 |
+
SYNCH_WORKER_WEIGHTS_TIMER,
|
| 14 |
+
TIMERS,
|
| 15 |
+
)
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class VPGConfig(AlgorithmConfig):
|
| 19 |
+
"""A simple VPG (vanilla policy gradient) algorithm w/o value function support.
|
| 20 |
+
|
| 21 |
+
Use for testing purposes only!
|
| 22 |
+
|
| 23 |
+
This Algorithm should use the VPGTorchLearner and VPGTorchRLModule
|
| 24 |
+
"""
|
| 25 |
+
|
| 26 |
+
# A test setting to activate metrics on mean weights.
|
| 27 |
+
report_mean_weights: bool = True
|
| 28 |
+
|
| 29 |
+
def __init__(self, algo_class=None):
|
| 30 |
+
super().__init__(algo_class=algo_class or VPG)
|
| 31 |
+
|
| 32 |
+
# VPG specific settings.
|
| 33 |
+
self.num_episodes_per_train_batch = 10
|
| 34 |
+
# Note that we don't have to set this here, because we tell the EnvRunners
|
| 35 |
+
# explicitly to sample entire episodes. However, for good measure, we change
|
| 36 |
+
# this setting here either way.
|
| 37 |
+
self.batch_mode = "complete_episodes"
|
| 38 |
+
|
| 39 |
+
# VPG specific defaults (from AlgorithmConfig).
|
| 40 |
+
self.num_env_runners = 1
|
| 41 |
+
|
| 42 |
+
@override(AlgorithmConfig)
|
| 43 |
+
def training(
|
| 44 |
+
self, *, num_episodes_per_train_batch=NotProvided, **kwargs
|
| 45 |
+
) -> "VPGConfig":
|
| 46 |
+
"""Sets the training related configuration.
|
| 47 |
+
|
| 48 |
+
Args:
|
| 49 |
+
num_episodes_per_train_batch: The number of complete episodes per train
|
| 50 |
+
batch. VPG requires entire episodes to be sampled from the EnvRunners.
|
| 51 |
+
For environments with varying episode lengths, this leads to varying
|
| 52 |
+
batch sizes (in timesteps) as well possibly causing slight learning
|
| 53 |
+
instabilities. However, for simplicity reasons, we stick to collecting
|
| 54 |
+
always exactly n episodes per training update.
|
| 55 |
+
|
| 56 |
+
Returns:
|
| 57 |
+
This updated AlgorithmConfig object.
|
| 58 |
+
"""
|
| 59 |
+
# Pass kwargs onto super's `training()` method.
|
| 60 |
+
super().training(**kwargs)
|
| 61 |
+
|
| 62 |
+
if num_episodes_per_train_batch is not NotProvided:
|
| 63 |
+
self.num_episodes_per_train_batch = num_episodes_per_train_batch
|
| 64 |
+
|
| 65 |
+
return self
|
| 66 |
+
|
| 67 |
+
@override(AlgorithmConfig)
|
| 68 |
+
def get_default_rl_module_spec(self):
|
| 69 |
+
if self.framework_str == "torch":
|
| 70 |
+
from ray.rllib.examples.rl_modules.classes.vpg_torch_rlm import (
|
| 71 |
+
VPGTorchRLModule,
|
| 72 |
+
)
|
| 73 |
+
|
| 74 |
+
spec = RLModuleSpec(
|
| 75 |
+
module_class=VPGTorchRLModule,
|
| 76 |
+
model_config={"hidden_dim": 64},
|
| 77 |
+
)
|
| 78 |
+
else:
|
| 79 |
+
raise ValueError(f"Unsupported framework: {self.framework_str}")
|
| 80 |
+
|
| 81 |
+
return spec
|
| 82 |
+
|
| 83 |
+
@override(AlgorithmConfig)
|
| 84 |
+
def get_default_learner_class(self):
|
| 85 |
+
if self.framework_str == "torch":
|
| 86 |
+
from ray.rllib.examples.learners.classes.vpg_torch_learner import (
|
| 87 |
+
VPGTorchLearner,
|
| 88 |
+
)
|
| 89 |
+
|
| 90 |
+
return VPGTorchLearner
|
| 91 |
+
else:
|
| 92 |
+
raise ValueError(f"Unsupported framework: {self.framework_str}")
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
class VPG(Algorithm):
|
| 96 |
+
@classmethod
|
| 97 |
+
@override(Algorithm)
|
| 98 |
+
def get_default_config(cls) -> AlgorithmConfig:
|
| 99 |
+
return VPGConfig()
|
| 100 |
+
|
| 101 |
+
@override(Algorithm)
|
| 102 |
+
def training_step(self) -> None:
|
| 103 |
+
"""Override of the training_step method of `Algorithm`.
|
| 104 |
+
|
| 105 |
+
Runs the following steps per call:
|
| 106 |
+
- Sample B timesteps (B=train batch size). Note that we don't sample complete
|
| 107 |
+
episodes due to simplicity. For an actual VPG algo, due to the loss computation,
|
| 108 |
+
you should always sample only completed episodes.
|
| 109 |
+
- Send the collected episodes to the VPG LearnerGroup for model updating.
|
| 110 |
+
- Sync the weights from LearnerGroup to all EnvRunners.
|
| 111 |
+
"""
|
| 112 |
+
# Sample.
|
| 113 |
+
with self.metrics.log_time((TIMERS, ENV_RUNNER_SAMPLING_TIMER)):
|
| 114 |
+
episodes, env_runner_results = self._sample_episodes()
|
| 115 |
+
# Merge results from n parallel sample calls into self's metrics logger.
|
| 116 |
+
self.metrics.merge_and_log_n_dicts(env_runner_results, key=ENV_RUNNER_RESULTS)
|
| 117 |
+
|
| 118 |
+
# Just for demonstration purposes, log the number of time steps sampled in this
|
| 119 |
+
# `training_step` round.
|
| 120 |
+
# Mean over a window of 100:
|
| 121 |
+
self.metrics.log_value(
|
| 122 |
+
"episode_timesteps_sampled_mean_win100",
|
| 123 |
+
sum(map(len, episodes)),
|
| 124 |
+
reduce="mean",
|
| 125 |
+
window=100,
|
| 126 |
+
)
|
| 127 |
+
# Exponential Moving Average (EMA) with coeff=0.1:
|
| 128 |
+
self.metrics.log_value(
|
| 129 |
+
"episode_timesteps_sampled_ema",
|
| 130 |
+
sum(map(len, episodes)),
|
| 131 |
+
ema_coeff=0.1, # <- weight of new value; weight of old avg=1.0-ema_coeff
|
| 132 |
+
)
|
| 133 |
+
|
| 134 |
+
# Update model.
|
| 135 |
+
with self.metrics.log_time((TIMERS, LEARNER_UPDATE_TIMER)):
|
| 136 |
+
learner_results = self.learner_group.update_from_episodes(
|
| 137 |
+
episodes=episodes,
|
| 138 |
+
timesteps={
|
| 139 |
+
NUM_ENV_STEPS_SAMPLED_LIFETIME: (
|
| 140 |
+
self.metrics.peek(
|
| 141 |
+
(ENV_RUNNER_RESULTS, NUM_ENV_STEPS_SAMPLED_LIFETIME)
|
| 142 |
+
)
|
| 143 |
+
),
|
| 144 |
+
},
|
| 145 |
+
)
|
| 146 |
+
# Merge results from m parallel update calls into self's metrics logger.
|
| 147 |
+
self.metrics.merge_and_log_n_dicts(learner_results, key=LEARNER_RESULTS)
|
| 148 |
+
|
| 149 |
+
# Sync weights.
|
| 150 |
+
with self.metrics.log_time((TIMERS, SYNCH_WORKER_WEIGHTS_TIMER)):
|
| 151 |
+
self.env_runner_group.sync_weights(
|
| 152 |
+
from_worker_or_learner_group=self.learner_group,
|
| 153 |
+
inference_only=True,
|
| 154 |
+
)
|
| 155 |
+
|
| 156 |
+
def _sample_episodes(self):
|
| 157 |
+
# How many episodes to sample from each EnvRunner?
|
| 158 |
+
num_episodes_per_env_runner = self.config.num_episodes_per_train_batch // (
|
| 159 |
+
self.config.num_env_runners or 1
|
| 160 |
+
)
|
| 161 |
+
# Send parallel remote requests to sample and get the metrics.
|
| 162 |
+
sampled_data = self.env_runner_group.foreach_env_runner(
|
| 163 |
+
# Return tuple of [episodes], [metrics] from each EnvRunner.
|
| 164 |
+
lambda env_runner: (
|
| 165 |
+
env_runner.sample(num_episodes=num_episodes_per_env_runner),
|
| 166 |
+
env_runner.get_metrics(),
|
| 167 |
+
),
|
| 168 |
+
# Loop over remote EnvRunners' `sample()` method in parallel or use the
|
| 169 |
+
# local EnvRunner if there aren't any remote ones.
|
| 170 |
+
local_env_runner=self.env_runner_group.num_remote_workers() <= 0,
|
| 171 |
+
)
|
| 172 |
+
# Return one list of episodes and a list of metrics dicts (one per EnvRunner).
|
| 173 |
+
episodes = tree.flatten([s[0] for s in sampled_data])
|
| 174 |
+
stats_dicts = [s[1] for s in sampled_data]
|
| 175 |
+
|
| 176 |
+
return episodes, stats_dicts
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/algorithms/vpg_custom_algorithm.py
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Example of how to write a custom Algorithm.
|
| 2 |
+
|
| 3 |
+
This is an end-to-end example for how to implement a custom Algorithm, including
|
| 4 |
+
a matching AlgorithmConfig class and Learner class. There is no particular RLModule API
|
| 5 |
+
needed for this algorithm, which means that any TorchRLModule returning actions
|
| 6 |
+
or action distribution parameters suffices.
|
| 7 |
+
|
| 8 |
+
The RK algorithm implemented here is "vanilla policy gradient" (VPG) in its simplest
|
| 9 |
+
form, without a value function baseline.
|
| 10 |
+
|
| 11 |
+
See the actual VPG algorithm class here:
|
| 12 |
+
https://github.com/ray-project/ray/blob/master/rllib/examples/algorithms/classes/vpg.py
|
| 13 |
+
|
| 14 |
+
The Learner class the algorithm uses by default (if the user doesn't specify a custom
|
| 15 |
+
Learner):
|
| 16 |
+
https://github.com/ray-project/ray/blob/master/rllib/examples/learners/classes/vpg_torch_learner.py # noqa
|
| 17 |
+
|
| 18 |
+
And the RLModule class the algorithm uses by default (if the user doesn't specify a
|
| 19 |
+
custom RLModule):
|
| 20 |
+
https://github.com/ray-project/ray/blob/master/rllib/examples/rl_modules/classes/vpg_torch_rlm.py # noqa
|
| 21 |
+
|
| 22 |
+
This example shows:
|
| 23 |
+
- how to subclass the AlgorithmConfig base class to implement a custom algorithm's.
|
| 24 |
+
config class.
|
| 25 |
+
- how to subclass the Algorithm base class to implement a custom Algorithm,
|
| 26 |
+
including its `training_step` method.
|
| 27 |
+
- how to subclass the TorchLearner base class to implement a custom Learner with
|
| 28 |
+
loss function, overriding `compute_loss_for_module` and
|
| 29 |
+
`after_gradient_based_update`.
|
| 30 |
+
- how to define a default RLModule used by the algorithm in case the user
|
| 31 |
+
doesn't bring their own custom RLModule. The VPG algorithm doesn't require any
|
| 32 |
+
specific RLModule APIs, so any RLModule returning actions or action distribution
|
| 33 |
+
inputs suffices.
|
| 34 |
+
|
| 35 |
+
We compute a plain policy gradient loss without value function baseline.
|
| 36 |
+
The experiment shows that even with such a simple setup, our custom algorithm is still
|
| 37 |
+
able to successfully learn CartPole-v1.
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
How to run this script
|
| 41 |
+
----------------------
|
| 42 |
+
`python [script file name].py --enable-new-api-stack`
|
| 43 |
+
|
| 44 |
+
For debugging, use the following additional command line options
|
| 45 |
+
`--no-tune --num-env-runners=0`
|
| 46 |
+
which should allow you to set breakpoints anywhere in the RLlib code and
|
| 47 |
+
have the execution stop there for inspection and debugging.
|
| 48 |
+
|
| 49 |
+
For logging to your WandB account, use:
|
| 50 |
+
`--wandb-key=[your WandB API key] --wandb-project=[some project name]
|
| 51 |
+
--wandb-run-name=[optional: WandB run name (within the defined project)]`
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
Results to expect
|
| 55 |
+
-----------------
|
| 56 |
+
With some fine-tuning of the learning rate, the batch size, and maybe the
|
| 57 |
+
number of env runners and number of envs per env runner, you should see decent
|
| 58 |
+
learning behavior on the CartPole-v1 environment:
|
| 59 |
+
|
| 60 |
+
+-----------------------------+------------+--------+------------------+
|
| 61 |
+
| Trial name | status | iter | total time (s) |
|
| 62 |
+
| | | | |
|
| 63 |
+
|-----------------------------+------------+--------+------------------+
|
| 64 |
+
| VPG_CartPole-v1_2973e_00000 | TERMINATED | 451 | 59.5184 |
|
| 65 |
+
+-----------------------------+------------+--------+------------------+
|
| 66 |
+
+-----------------------+------------------------+------------------------+
|
| 67 |
+
| episode_return_mean | num_env_steps_sample | ...env_steps_sampled |
|
| 68 |
+
| | d_lifetime | _lifetime_throughput |
|
| 69 |
+
|-----------------------+------------------------+------------------------|
|
| 70 |
+
| 250.52 | 415787 | 7428.98 |
|
| 71 |
+
+-----------------------+------------------------+------------------------+
|
| 72 |
+
"""
|
| 73 |
+
|
| 74 |
+
from ray.rllib.examples.algorithms.classes.vpg import VPGConfig
|
| 75 |
+
from ray.rllib.utils.test_utils import (
|
| 76 |
+
add_rllib_example_script_args,
|
| 77 |
+
run_rllib_example_script_experiment,
|
| 78 |
+
)
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
parser = add_rllib_example_script_args(
|
| 82 |
+
default_reward=250.0,
|
| 83 |
+
default_iters=1000,
|
| 84 |
+
default_timesteps=750000,
|
| 85 |
+
)
|
| 86 |
+
parser.set_defaults(enable_new_api_stack=True)
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
if __name__ == "__main__":
|
| 90 |
+
args = parser.parse_args()
|
| 91 |
+
|
| 92 |
+
base_config = (
|
| 93 |
+
VPGConfig()
|
| 94 |
+
.environment("CartPole-v1")
|
| 95 |
+
.training(
|
| 96 |
+
# The only VPG-specific setting. How many episodes per train batch?
|
| 97 |
+
num_episodes_per_train_batch=10,
|
| 98 |
+
# Set other config parameters.
|
| 99 |
+
lr=0.0005,
|
| 100 |
+
# Note that you don't have to set any specific Learner class, because
|
| 101 |
+
# our custom Algorithm already defines the default Learner class to use
|
| 102 |
+
# through its `get_default_learner_class` method, which returns
|
| 103 |
+
# `VPGTorchLearner`.
|
| 104 |
+
# learner_class=VPGTorchLearner,
|
| 105 |
+
)
|
| 106 |
+
# Increase the number of EnvRunners (default is 1 for VPG)
|
| 107 |
+
# or the number of envs per EnvRunner.
|
| 108 |
+
.env_runners(num_env_runners=2, num_envs_per_env_runner=1)
|
| 109 |
+
# Plug in your own RLModule class. VPG doesn't require any specific
|
| 110 |
+
# RLModule APIs, so any RLModule returning `actions` or `action_dist_inputs`
|
| 111 |
+
# from the forward methods works ok.
|
| 112 |
+
# .rl_module(
|
| 113 |
+
# rl_module_spec=RLModuleSpec(module_class=...),
|
| 114 |
+
# )
|
| 115 |
+
)
|
| 116 |
+
|
| 117 |
+
run_rllib_example_script_experiment(base_config, args)
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/__init__.py
ADDED
|
File without changes
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/__pycache__/continue_training_from_checkpoint.cpython-311.pyc
ADDED
|
Binary file (13.2 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/__pycache__/onnx_tf.cpython-311.pyc
ADDED
|
Binary file (3.67 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/__pycache__/onnx_torch.cpython-311.pyc
ADDED
|
Binary file (3.04 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/__pycache__/onnx_torch_lstm.cpython-311.pyc
ADDED
|
Binary file (5.85 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/cartpole_dqn_export.py
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python
|
| 2 |
+
|
| 3 |
+
# @OldAPIStack
|
| 4 |
+
|
| 5 |
+
import numpy as np
|
| 6 |
+
import os
|
| 7 |
+
import ray
|
| 8 |
+
|
| 9 |
+
from ray.rllib.policy.policy import Policy
|
| 10 |
+
from ray.rllib.utils.framework import try_import_tf
|
| 11 |
+
from ray.tune.registry import get_trainable_cls
|
| 12 |
+
|
| 13 |
+
tf1, tf, tfv = try_import_tf()
|
| 14 |
+
|
| 15 |
+
ray.init()
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def train_and_export_policy_and_model(algo_name, num_steps, model_dir, ckpt_dir):
|
| 19 |
+
cls = get_trainable_cls(algo_name)
|
| 20 |
+
config = cls.get_default_config()
|
| 21 |
+
config.api_stack(
|
| 22 |
+
enable_rl_module_and_learner=False,
|
| 23 |
+
enable_env_runner_and_connector_v2=False,
|
| 24 |
+
)
|
| 25 |
+
# This Example is only for tf.
|
| 26 |
+
config.framework("tf")
|
| 27 |
+
# Set exporting native (DL-framework) model files to True.
|
| 28 |
+
config.export_native_model_files = True
|
| 29 |
+
config.env = "CartPole-v1"
|
| 30 |
+
alg = config.build()
|
| 31 |
+
for _ in range(num_steps):
|
| 32 |
+
alg.train()
|
| 33 |
+
|
| 34 |
+
# Export Policy checkpoint.
|
| 35 |
+
alg.export_policy_checkpoint(ckpt_dir)
|
| 36 |
+
# Export tensorflow keras Model for online serving
|
| 37 |
+
alg.export_policy_model(model_dir)
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def restore_saved_model(export_dir):
|
| 41 |
+
signature_key = (
|
| 42 |
+
tf1.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
|
| 43 |
+
)
|
| 44 |
+
g = tf1.Graph()
|
| 45 |
+
with g.as_default():
|
| 46 |
+
with tf1.Session(graph=g) as sess:
|
| 47 |
+
meta_graph_def = tf1.saved_model.load(
|
| 48 |
+
sess, [tf1.saved_model.tag_constants.SERVING], export_dir
|
| 49 |
+
)
|
| 50 |
+
print("Model restored!")
|
| 51 |
+
print("Signature Def Information:")
|
| 52 |
+
print(meta_graph_def.signature_def[signature_key])
|
| 53 |
+
print("You can inspect the model using TensorFlow SavedModel CLI.")
|
| 54 |
+
print("https://www.tensorflow.org/guide/saved_model")
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
def restore_policy_from_checkpoint(export_dir):
|
| 58 |
+
# Load the model from the checkpoint.
|
| 59 |
+
policy = Policy.from_checkpoint(export_dir)
|
| 60 |
+
# Perform a dummy (CartPole) forward pass.
|
| 61 |
+
test_obs = np.array([0.1, 0.2, 0.3, 0.4])
|
| 62 |
+
results = policy.compute_single_action(test_obs)
|
| 63 |
+
# Check results for correctness.
|
| 64 |
+
assert len(results) == 3
|
| 65 |
+
assert results[0].shape == () # pure single action (int)
|
| 66 |
+
assert results[1] == [] # RNN states
|
| 67 |
+
assert results[2]["action_dist_inputs"].shape == (2,) # categorical inputs
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
if __name__ == "__main__":
|
| 71 |
+
algo = "PPO"
|
| 72 |
+
model_dir = os.path.join(ray._private.utils.get_user_temp_dir(), "model_export_dir")
|
| 73 |
+
ckpt_dir = os.path.join(ray._private.utils.get_user_temp_dir(), "ckpt_export_dir")
|
| 74 |
+
num_steps = 1
|
| 75 |
+
train_and_export_policy_and_model(algo, num_steps, model_dir, ckpt_dir)
|
| 76 |
+
restore_saved_model(model_dir)
|
| 77 |
+
restore_policy_from_checkpoint(ckpt_dir)
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/change_config_during_training.py
ADDED
|
@@ -0,0 +1,246 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Example showing how to continue training an Algorithm with a changed config.
|
| 2 |
+
|
| 3 |
+
Use the setup shown in this script if you want to continue a prior experiment, but
|
| 4 |
+
would also like to change some of the config values you originally used.
|
| 5 |
+
|
| 6 |
+
This example:
|
| 7 |
+
- runs a single- or multi-agent CartPole experiment (for multi-agent, we use
|
| 8 |
+
different learning rates) thereby checkpointing the state of the Algorithm every n
|
| 9 |
+
iterations. The config used is hereafter called "1st config".
|
| 10 |
+
- stops the experiment due to some episode return being achieved.
|
| 11 |
+
- just for testing purposes, restores the entire algorithm from the latest
|
| 12 |
+
checkpoint and checks, whether the state of the restored algo exactly match the
|
| 13 |
+
state of the previously saved one.
|
| 14 |
+
- then changes the original config used (learning rate and other settings) and
|
| 15 |
+
continues training with the restored algorithm and the changed config until a
|
| 16 |
+
final episode return is reached. The new config is hereafter called "2nd config".
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
How to run this script
|
| 20 |
+
----------------------
|
| 21 |
+
`python [script file name].py --enable-new-api-stack --num-agents=[0 or 2]
|
| 22 |
+
--stop-reward-first-config=[return at which the algo on 1st config should stop training]
|
| 23 |
+
--stop-reward=[the final return to achieve after restoration from the checkpoint with
|
| 24 |
+
the 2nd config]
|
| 25 |
+
`
|
| 26 |
+
|
| 27 |
+
For debugging, use the following additional command line options
|
| 28 |
+
`--no-tune --num-env-runners=0`
|
| 29 |
+
which should allow you to set breakpoints anywhere in the RLlib code and
|
| 30 |
+
have the execution stop there for inspection and debugging.
|
| 31 |
+
|
| 32 |
+
For logging to your WandB account, use:
|
| 33 |
+
`--wandb-key=[your WandB API key] --wandb-project=[some project name]
|
| 34 |
+
--wandb-run-name=[optional: WandB run name (within the defined project)]`
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
Results to expect
|
| 38 |
+
-----------------
|
| 39 |
+
First, you should see the initial tune.Tuner do it's thing:
|
| 40 |
+
|
| 41 |
+
Trial status: 1 RUNNING
|
| 42 |
+
Current time: 2024-06-03 12:03:39. Total running time: 30s
|
| 43 |
+
Logical resource usage: 3.0/12 CPUs, 0/0 GPUs
|
| 44 |
+
╭────────────────────────────────────────────────────────────────────────
|
| 45 |
+
│ Trial name status iter total time (s)
|
| 46 |
+
├────────────────────────────────────────────────────────────────────────
|
| 47 |
+
│ PPO_CartPole-v1_7b1eb_00000 RUNNING 6 16.265
|
| 48 |
+
╰────────────────────────────────────────────────────────────────────────
|
| 49 |
+
───────────────────────────────────────────────────────────────────────╮
|
| 50 |
+
..._sampled_lifetime ..._trained_lifetime ...episodes_lifetime │
|
| 51 |
+
───────────────────────────────────────────────────────────────────────┤
|
| 52 |
+
24000 24000 340 │
|
| 53 |
+
───────────────────────────────────────────────────────────────────────╯
|
| 54 |
+
...
|
| 55 |
+
|
| 56 |
+
The experiment stops at an average episode return of `--stop-reward-first-config`.
|
| 57 |
+
|
| 58 |
+
After the validation of the last checkpoint, a new experiment is started from
|
| 59 |
+
scratch, but with the RLlib callback restoring the Algorithm right after
|
| 60 |
+
initialization using the previous checkpoint. This new experiment then runs
|
| 61 |
+
until `--stop-reward` is reached.
|
| 62 |
+
|
| 63 |
+
Trial status: 1 RUNNING
|
| 64 |
+
Current time: 2024-06-03 12:05:00. Total running time: 1min 0s
|
| 65 |
+
Logical resource usage: 3.0/12 CPUs, 0/0 GPUs
|
| 66 |
+
╭────────────────────────────────────────────────────────────────────────
|
| 67 |
+
│ Trial name status iter total time (s)
|
| 68 |
+
├────────────────────────────────────────────────────────────────────────
|
| 69 |
+
│ PPO_CartPole-v1_7b1eb_00000 RUNNING 23 14.8372
|
| 70 |
+
╰────────────────────────────────────────────────────────────────────────
|
| 71 |
+
─────────────────────────────────────────────────────────��─────────────╮
|
| 72 |
+
..._sampled_lifetime ..._trained_lifetime ...episodes_lifetime │
|
| 73 |
+
───────────────────────────────────────────────────────────────────────┤
|
| 74 |
+
109078 109078 531 │
|
| 75 |
+
───────────────────────────────────────────────────────────────────────╯
|
| 76 |
+
|
| 77 |
+
And if you are using the `--as-test` option, you should see a finel message:
|
| 78 |
+
|
| 79 |
+
```
|
| 80 |
+
`env_runners/episode_return_mean` of 450.0 reached! ok
|
| 81 |
+
```
|
| 82 |
+
"""
|
| 83 |
+
from ray.rllib.algorithms.ppo import PPOConfig
|
| 84 |
+
from ray.rllib.algorithms.algorithm_config import AlgorithmConfig
|
| 85 |
+
from ray.rllib.core import DEFAULT_MODULE_ID
|
| 86 |
+
from ray.rllib.examples.envs.classes.multi_agent import MultiAgentCartPole
|
| 87 |
+
from ray.rllib.policy.policy import PolicySpec
|
| 88 |
+
from ray.rllib.utils.metrics import (
|
| 89 |
+
ENV_RUNNER_RESULTS,
|
| 90 |
+
EPISODE_RETURN_MEAN,
|
| 91 |
+
LEARNER_RESULTS,
|
| 92 |
+
)
|
| 93 |
+
from ray.rllib.utils.numpy import convert_to_numpy
|
| 94 |
+
from ray.rllib.utils.test_utils import (
|
| 95 |
+
add_rllib_example_script_args,
|
| 96 |
+
check,
|
| 97 |
+
run_rllib_example_script_experiment,
|
| 98 |
+
)
|
| 99 |
+
from ray.tune.registry import register_env
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
parser = add_rllib_example_script_args(
|
| 103 |
+
default_reward=450.0, default_timesteps=10000000, default_iters=2000
|
| 104 |
+
)
|
| 105 |
+
parser.add_argument(
|
| 106 |
+
"--stop-reward-first-config",
|
| 107 |
+
type=float,
|
| 108 |
+
default=150.0,
|
| 109 |
+
help="Mean episode return after which the Algorithm on the first config should "
|
| 110 |
+
"stop training.",
|
| 111 |
+
)
|
| 112 |
+
# By default, set `args.checkpoint_freq` to 1 and `args.checkpoint_at_end` to True.
|
| 113 |
+
parser.set_defaults(
|
| 114 |
+
enable_new_api_stack=True,
|
| 115 |
+
checkpoint_freq=1,
|
| 116 |
+
checkpoint_at_end=True,
|
| 117 |
+
)
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
if __name__ == "__main__":
|
| 121 |
+
args = parser.parse_args()
|
| 122 |
+
|
| 123 |
+
register_env(
|
| 124 |
+
"ma_cart", lambda cfg: MultiAgentCartPole({"num_agents": args.num_agents})
|
| 125 |
+
)
|
| 126 |
+
|
| 127 |
+
# Simple generic config.
|
| 128 |
+
base_config = (
|
| 129 |
+
PPOConfig()
|
| 130 |
+
.environment("CartPole-v1" if args.num_agents == 0 else "ma_cart")
|
| 131 |
+
.training(lr=0.0001)
|
| 132 |
+
# TODO (sven): Tune throws a weird error inside the "log json" callback
|
| 133 |
+
# when running with this option. The `perf` key in the result dict contains
|
| 134 |
+
# binary data (instead of just 2 float values for mem and cpu usage).
|
| 135 |
+
# .experimental(_use_msgpack_checkpoints=True)
|
| 136 |
+
)
|
| 137 |
+
|
| 138 |
+
# Setup multi-agent, if required.
|
| 139 |
+
if args.num_agents > 0:
|
| 140 |
+
base_config.multi_agent(
|
| 141 |
+
policies={
|
| 142 |
+
f"p{aid}": PolicySpec(
|
| 143 |
+
config=AlgorithmConfig.overrides(
|
| 144 |
+
lr=5e-5
|
| 145 |
+
* (aid + 1), # agent 1 has double the learning rate as 0.
|
| 146 |
+
)
|
| 147 |
+
)
|
| 148 |
+
for aid in range(args.num_agents)
|
| 149 |
+
},
|
| 150 |
+
policy_mapping_fn=lambda aid, *a, **kw: f"p{aid}",
|
| 151 |
+
)
|
| 152 |
+
|
| 153 |
+
# Define some stopping criterion. Note that this criterion is an avg episode return
|
| 154 |
+
# to be reached.
|
| 155 |
+
metric = f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}"
|
| 156 |
+
stop = {metric: args.stop_reward_first_config}
|
| 157 |
+
|
| 158 |
+
tuner_results = run_rllib_example_script_experiment(
|
| 159 |
+
base_config,
|
| 160 |
+
args,
|
| 161 |
+
stop=stop,
|
| 162 |
+
keep_ray_up=True,
|
| 163 |
+
)
|
| 164 |
+
|
| 165 |
+
# Perform a very quick test to make sure our algo (upon restoration) did not lose
|
| 166 |
+
# its ability to perform well in the env.
|
| 167 |
+
# - Extract the best checkpoint.
|
| 168 |
+
best_result = tuner_results.get_best_result(metric=metric, mode="max")
|
| 169 |
+
assert (
|
| 170 |
+
best_result.metrics[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN]
|
| 171 |
+
>= args.stop_reward_first_config
|
| 172 |
+
)
|
| 173 |
+
best_checkpoint_path = best_result.checkpoint.path
|
| 174 |
+
|
| 175 |
+
# Rebuild the algorithm (just for testing purposes).
|
| 176 |
+
test_algo = base_config.build()
|
| 177 |
+
# Load algo's state from the best checkpoint.
|
| 178 |
+
test_algo.restore_from_path(best_checkpoint_path)
|
| 179 |
+
# Perform some checks on the restored state.
|
| 180 |
+
assert test_algo.training_iteration > 0
|
| 181 |
+
# Evaluate on the restored algorithm.
|
| 182 |
+
test_eval_results = test_algo.evaluate()
|
| 183 |
+
assert (
|
| 184 |
+
test_eval_results[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN]
|
| 185 |
+
>= args.stop_reward_first_config
|
| 186 |
+
), test_eval_results[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN]
|
| 187 |
+
# Train one iteration to make sure, the performance does not collapse (e.g. due
|
| 188 |
+
# to the optimizer weights not having been restored properly).
|
| 189 |
+
test_results = test_algo.train()
|
| 190 |
+
assert (
|
| 191 |
+
test_results[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN]
|
| 192 |
+
>= args.stop_reward_first_config
|
| 193 |
+
), test_results[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN]
|
| 194 |
+
# Stop the test algorithm again.
|
| 195 |
+
test_algo.stop()
|
| 196 |
+
|
| 197 |
+
# Make sure the algorithm gets restored from a checkpoint right after
|
| 198 |
+
# initialization. Note that this includes all subcomponents of the algorithm,
|
| 199 |
+
# including the optimizer states in the LearnerGroup/Learner actors.
|
| 200 |
+
def on_algorithm_init(algorithm, **kwargs):
|
| 201 |
+
module_p0 = algorithm.get_module("p0")
|
| 202 |
+
weight_before = convert_to_numpy(next(iter(module_p0.parameters())))
|
| 203 |
+
|
| 204 |
+
algorithm.restore_from_path(best_checkpoint_path)
|
| 205 |
+
|
| 206 |
+
# Make sure weights were restored (changed).
|
| 207 |
+
weight_after = convert_to_numpy(next(iter(module_p0.parameters())))
|
| 208 |
+
check(weight_before, weight_after, false=True)
|
| 209 |
+
|
| 210 |
+
# Change the config.
|
| 211 |
+
(
|
| 212 |
+
base_config
|
| 213 |
+
# Make sure the algorithm gets restored upon initialization.
|
| 214 |
+
.callbacks(on_algorithm_init=on_algorithm_init)
|
| 215 |
+
# Change training parameters considerably.
|
| 216 |
+
.training(
|
| 217 |
+
lr=0.0003,
|
| 218 |
+
train_batch_size=5000,
|
| 219 |
+
grad_clip=100.0,
|
| 220 |
+
gamma=0.996,
|
| 221 |
+
num_epochs=6,
|
| 222 |
+
vf_loss_coeff=0.01,
|
| 223 |
+
)
|
| 224 |
+
# Make multi-CPU/GPU.
|
| 225 |
+
.learners(num_learners=2)
|
| 226 |
+
# Use more env runners and more envs per env runner.
|
| 227 |
+
.env_runners(num_env_runners=3, num_envs_per_env_runner=5)
|
| 228 |
+
)
|
| 229 |
+
|
| 230 |
+
# Update the stopping criterion to the final target return per episode.
|
| 231 |
+
stop = {metric: args.stop_reward}
|
| 232 |
+
|
| 233 |
+
# Run a new experiment with the (RLlib) callback `on_algorithm_init` restoring
|
| 234 |
+
# from the best checkpoint.
|
| 235 |
+
# Note that the new experiment starts again from iteration=0 (unlike when you
|
| 236 |
+
# use `tune.Tuner.restore()` after a crash or interrupted trial).
|
| 237 |
+
tuner_results = run_rllib_example_script_experiment(base_config, args, stop=stop)
|
| 238 |
+
|
| 239 |
+
# Assert that we have continued training with a different learning rate.
|
| 240 |
+
assert (
|
| 241 |
+
tuner_results[0].metrics[LEARNER_RESULTS][DEFAULT_MODULE_ID][
|
| 242 |
+
"default_optimizer_learning_rate"
|
| 243 |
+
]
|
| 244 |
+
== base_config.lr
|
| 245 |
+
== 0.0003
|
| 246 |
+
)
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/checkpoint_by_custom_criteria.py
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Example extracting a checkpoint from n trials using one or more custom criteria.
|
| 2 |
+
|
| 3 |
+
This example:
|
| 4 |
+
- runs a CartPole experiment with three different learning rates (three tune
|
| 5 |
+
"trials"). During the experiment, for each trial, we create a checkpoint at each
|
| 6 |
+
iteration.
|
| 7 |
+
- at the end of the experiment, we compare the trials and pick the one that
|
| 8 |
+
performed best, based on the criterion: Lowest episode count per single iteration
|
| 9 |
+
(for CartPole, a low episode count means the episodes are very long and thus the
|
| 10 |
+
reward is also very high).
|
| 11 |
+
- from that best trial (with the lowest episode count), we then pick those
|
| 12 |
+
checkpoints that a) have the lowest policy loss (good) and b) have the highest value
|
| 13 |
+
function loss (bad).
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
How to run this script
|
| 17 |
+
----------------------
|
| 18 |
+
`python [script file name].py --enable-new-api-stack`
|
| 19 |
+
|
| 20 |
+
For debugging, use the following additional command line options
|
| 21 |
+
`--no-tune --num-env-runners=0`
|
| 22 |
+
which should allow you to set breakpoints anywhere in the RLlib code and
|
| 23 |
+
have the execution stop there for inspection and debugging.
|
| 24 |
+
|
| 25 |
+
For logging to your WandB account, use:
|
| 26 |
+
`--wandb-key=[your WandB API key] --wandb-project=[some project name]
|
| 27 |
+
--wandb-run-name=[optional: WandB run name (within the defined project)]`
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
Results to expect
|
| 31 |
+
-----------------
|
| 32 |
+
In the console output, you can see the performance of the three different learning
|
| 33 |
+
rates used here:
|
| 34 |
+
|
| 35 |
+
+-----------------------------+------------+-----------------+--------+--------+
|
| 36 |
+
| Trial name | status | loc | lr | iter |
|
| 37 |
+
|-----------------------------+------------+-----------------+--------+--------+
|
| 38 |
+
| PPO_CartPole-v1_d7dbe_00000 | TERMINATED | 127.0.0.1:98487 | 0.01 | 17 |
|
| 39 |
+
| PPO_CartPole-v1_d7dbe_00001 | TERMINATED | 127.0.0.1:98488 | 0.001 | 8 |
|
| 40 |
+
| PPO_CartPole-v1_d7dbe_00002 | TERMINATED | 127.0.0.1:98489 | 0.0001 | 9 |
|
| 41 |
+
+-----------------------------+------------+-----------------+--------+--------+
|
| 42 |
+
|
| 43 |
+
+------------------+-------+----------+----------------------+----------------------+
|
| 44 |
+
| total time (s) | ts | reward | episode_reward_max | episode_reward_min |
|
| 45 |
+
|------------------+-------+----------+----------------------+----------------------+
|
| 46 |
+
| 28.1068 | 39797 | 151.11 | 500 | 12 |
|
| 47 |
+
| 13.304 | 18728 | 158.91 | 500 | 15 |
|
| 48 |
+
| 14.8848 | 21069 | 167.36 | 500 | 13 |
|
| 49 |
+
+------------------+-------+----------+----------------------+----------------------+
|
| 50 |
+
|
| 51 |
+
+--------------------+
|
| 52 |
+
| episode_len_mean |
|
| 53 |
+
|--------------------|
|
| 54 |
+
| 151.11 |
|
| 55 |
+
| 158.91 |
|
| 56 |
+
| 167.36 |
|
| 57 |
+
+--------------------+
|
| 58 |
+
"""
|
| 59 |
+
|
| 60 |
+
from ray import tune
|
| 61 |
+
from ray.rllib.core import DEFAULT_MODULE_ID
|
| 62 |
+
from ray.rllib.utils.metrics import (
|
| 63 |
+
ENV_RUNNER_RESULTS,
|
| 64 |
+
EPISODE_RETURN_MEAN,
|
| 65 |
+
LEARNER_RESULTS,
|
| 66 |
+
)
|
| 67 |
+
from ray.rllib.utils.test_utils import (
|
| 68 |
+
add_rllib_example_script_args,
|
| 69 |
+
run_rllib_example_script_experiment,
|
| 70 |
+
)
|
| 71 |
+
from ray.tune.registry import get_trainable_cls
|
| 72 |
+
|
| 73 |
+
parser = add_rllib_example_script_args(
|
| 74 |
+
default_reward=450.0, default_timesteps=100000, default_iters=200
|
| 75 |
+
)
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
if __name__ == "__main__":
|
| 79 |
+
args = parser.parse_args()
|
| 80 |
+
|
| 81 |
+
# Force-set `args.checkpoint_freq` to 1.
|
| 82 |
+
args.checkpoint_freq = 1
|
| 83 |
+
|
| 84 |
+
# Simple generic config.
|
| 85 |
+
base_config = (
|
| 86 |
+
get_trainable_cls(args.algo)
|
| 87 |
+
.get_default_config()
|
| 88 |
+
.environment("CartPole-v1")
|
| 89 |
+
# Run 3 trials, each w/ a different learning rate.
|
| 90 |
+
.training(lr=tune.grid_search([0.01, 0.001, 0.0001]), train_batch_size=2341)
|
| 91 |
+
)
|
| 92 |
+
# Run tune for some iterations and generate checkpoints.
|
| 93 |
+
results = run_rllib_example_script_experiment(base_config, args)
|
| 94 |
+
|
| 95 |
+
# Get the best of the 3 trials by using some metric.
|
| 96 |
+
# NOTE: Choosing the min `episodes_this_iter` automatically picks the trial
|
| 97 |
+
# with the best performance (over the entire run (scope="all")):
|
| 98 |
+
# The fewer episodes, the longer each episode lasted, the more reward we
|
| 99 |
+
# got each episode.
|
| 100 |
+
# Setting scope to "last", "last-5-avg", or "last-10-avg" will only compare
|
| 101 |
+
# (using `mode=min|max`) the average values of the last 1, 5, or 10
|
| 102 |
+
# iterations with each other, respectively.
|
| 103 |
+
# Setting scope to "avg" will compare (using `mode`=min|max) the average
|
| 104 |
+
# values over the entire run.
|
| 105 |
+
metric = "env_runners/num_episodes"
|
| 106 |
+
# notice here `scope` is `all`, meaning for each trial,
|
| 107 |
+
# all results (not just the last one) will be examined.
|
| 108 |
+
best_result = results.get_best_result(metric=metric, mode="min", scope="all")
|
| 109 |
+
value_best_metric = best_result.metrics_dataframe[metric].min()
|
| 110 |
+
best_return_best = best_result.metrics_dataframe[
|
| 111 |
+
f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}"
|
| 112 |
+
].max()
|
| 113 |
+
print(
|
| 114 |
+
f"Best trial was the one with lr={best_result.metrics['config']['lr']}. "
|
| 115 |
+
f"Reached lowest episode count ({value_best_metric}) in a single iteration and "
|
| 116 |
+
f"an average return of {best_return_best}."
|
| 117 |
+
)
|
| 118 |
+
|
| 119 |
+
# Confirm, we picked the right trial.
|
| 120 |
+
|
| 121 |
+
assert (
|
| 122 |
+
value_best_metric
|
| 123 |
+
== results.get_dataframe(filter_metric=metric, filter_mode="min")[metric].min()
|
| 124 |
+
)
|
| 125 |
+
|
| 126 |
+
# Get the best checkpoints from the trial, based on different metrics.
|
| 127 |
+
# Checkpoint with the lowest policy loss value:
|
| 128 |
+
if args.enable_new_api_stack:
|
| 129 |
+
policy_loss_key = f"{LEARNER_RESULTS}/{DEFAULT_MODULE_ID}/policy_loss"
|
| 130 |
+
else:
|
| 131 |
+
policy_loss_key = "info/learner/default_policy/learner_stats/policy_loss"
|
| 132 |
+
best_result = results.get_best_result(metric=policy_loss_key, mode="min")
|
| 133 |
+
ckpt = best_result.checkpoint
|
| 134 |
+
lowest_policy_loss = best_result.metrics_dataframe[policy_loss_key].min()
|
| 135 |
+
print(f"Checkpoint w/ lowest policy loss ({lowest_policy_loss}): {ckpt}")
|
| 136 |
+
|
| 137 |
+
# Checkpoint with the highest value-function loss:
|
| 138 |
+
if args.enable_new_api_stack:
|
| 139 |
+
vf_loss_key = f"{LEARNER_RESULTS}/{DEFAULT_MODULE_ID}/vf_loss"
|
| 140 |
+
else:
|
| 141 |
+
vf_loss_key = "info/learner/default_policy/learner_stats/vf_loss"
|
| 142 |
+
best_result = results.get_best_result(metric=vf_loss_key, mode="max")
|
| 143 |
+
ckpt = best_result.checkpoint
|
| 144 |
+
highest_value_fn_loss = best_result.metrics_dataframe[vf_loss_key].max()
|
| 145 |
+
print(f"Checkpoint w/ highest value function loss: {ckpt}")
|
| 146 |
+
print(f"Highest value function loss: {highest_value_fn_loss}")
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/continue_training_from_checkpoint.py
ADDED
|
@@ -0,0 +1,268 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Example showing how to restore an Algorithm from a checkpoint and resume training.
|
| 2 |
+
|
| 3 |
+
Use the setup shown in this script if your experiments tend to crash after some time,
|
| 4 |
+
and you would therefore like to make your setup more robust and fault-tolerant.
|
| 5 |
+
|
| 6 |
+
This example:
|
| 7 |
+
- runs a single- or multi-agent CartPole experiment (for multi-agent, we use
|
| 8 |
+
different learning rates) thereby checkpointing the state of the Algorithm every n
|
| 9 |
+
iterations.
|
| 10 |
+
- stops the experiment due to an expected crash in the algorithm's main process
|
| 11 |
+
after a certain number of iterations.
|
| 12 |
+
- just for testing purposes, restores the entire algorithm from the latest
|
| 13 |
+
checkpoint and checks whether the state of the restored algo exactly matches the
|
| 14 |
+
state of the crashed one.
|
| 15 |
+
- then continues training with the restored algorithm until the desired final
|
| 16 |
+
episode return is reached.
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
How to run this script
|
| 20 |
+
----------------------
|
| 21 |
+
`python [script file name].py --enable-new-api-stack --num-agents=[0 or 2]
|
| 22 |
+
--stop-reward-crash=[the episode return after which the algo should crash]
|
| 23 |
+
--stop-reward=[the final episode return to achieve after(!) restoration from the
|
| 24 |
+
checkpoint]
|
| 25 |
+
`
|
| 26 |
+
|
| 27 |
+
For debugging, use the following additional command line options
|
| 28 |
+
`--no-tune --num-env-runners=0`
|
| 29 |
+
which should allow you to set breakpoints anywhere in the RLlib code and
|
| 30 |
+
have the execution stop there for inspection and debugging.
|
| 31 |
+
|
| 32 |
+
For logging to your WandB account, use:
|
| 33 |
+
`--wandb-key=[your WandB API key] --wandb-project=[some project name]
|
| 34 |
+
--wandb-run-name=[optional: WandB run name (within the defined project)]`
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
Results to expect
|
| 38 |
+
-----------------
|
| 39 |
+
First, you should see the initial tune.Tuner do its thing:
|
| 40 |
+
|
| 41 |
+
Trial status: 1 RUNNING
|
| 42 |
+
Current time: 2024-06-03 12:03:39. Total running time: 30s
|
| 43 |
+
Logical resource usage: 3.0/12 CPUs, 0/0 GPUs
|
| 44 |
+
╭────────────────────────────────────────────────────────────────────────
|
| 45 |
+
│ Trial name status iter total time (s)
|
| 46 |
+
├────────────────────────────────────────────────────────────────────────
|
| 47 |
+
│ PPO_CartPole-v1_7b1eb_00000 RUNNING 6 15.362
|
| 48 |
+
╰────────────────────────────────────────────────────────────────────────
|
| 49 |
+
───────────────────────────────────────────────────────────────────────╮
|
| 50 |
+
..._sampled_lifetime ..._trained_lifetime ...episodes_lifetime │
|
| 51 |
+
───────────────────────────────────────────────────────────────────────┤
|
| 52 |
+
24000 24000 340 │
|
| 53 |
+
───────────────────────────────────────────────────────────────────────╯
|
| 54 |
+
...
|
| 55 |
+
|
| 56 |
+
then, you should see the experiment crashing as soon as the `--stop-reward-crash`
|
| 57 |
+
has been reached:
|
| 58 |
+
|
| 59 |
+
```RuntimeError: Intended crash after reaching trigger return.```
|
| 60 |
+
|
| 61 |
+
At some point, the experiment should resume exactly where it left off (using
|
| 62 |
+
the checkpoint and restored Tuner):
|
| 63 |
+
|
| 64 |
+
Trial status: 1 RUNNING
|
| 65 |
+
Current time: 2024-06-03 12:05:00. Total running time: 1min 0s
|
| 66 |
+
Logical resource usage: 3.0/12 CPUs, 0/0 GPUs
|
| 67 |
+
╭────────────────────────────────────────────────────────────────────────
|
| 68 |
+
│ Trial name status iter total time (s)
|
| 69 |
+
├────────────────────────────────────────────────────────────────────────
|
| 70 |
+
│ PPO_CartPole-v1_7b1eb_00000 RUNNING 27 66.1451
|
| 71 |
+
╰────────────────────────────────────────────────────────────────────────
|
| 72 |
+
───────────────────────────────────────────────────────────────────────╮
|
| 73 |
+
..._sampled_lifetime ..._trained_lifetime ...episodes_lifetime │
|
| 74 |
+
───────────────────────────────────────────────────────────────────────┤
|
| 75 |
+
108000 108000 531 │
|
| 76 |
+
───────────────────────────────────────────────────────────────────────╯
|
| 77 |
+
|
| 78 |
+
And if you are using the `--as-test` option, you should see a final message:
|
| 79 |
+
|
| 80 |
+
```
|
| 81 |
+
`env_runners/episode_return_mean` of 500.0 reached! ok
|
| 82 |
+
```
|
| 83 |
+
"""
|
| 84 |
+
import re
|
| 85 |
+
import time
|
| 86 |
+
|
| 87 |
+
from ray import train, tune
|
| 88 |
+
from ray.rllib.algorithms.algorithm_config import AlgorithmConfig
|
| 89 |
+
from ray.rllib.callbacks.callbacks import RLlibCallback
|
| 90 |
+
from ray.rllib.examples.envs.classes.multi_agent import MultiAgentCartPole
|
| 91 |
+
from ray.rllib.policy.policy import PolicySpec
|
| 92 |
+
from ray.rllib.utils.metrics import (
|
| 93 |
+
ENV_RUNNER_RESULTS,
|
| 94 |
+
EPISODE_RETURN_MEAN,
|
| 95 |
+
)
|
| 96 |
+
from ray.rllib.utils.test_utils import (
|
| 97 |
+
add_rllib_example_script_args,
|
| 98 |
+
check_learning_achieved,
|
| 99 |
+
)
|
| 100 |
+
from ray.tune.registry import get_trainable_cls, register_env
|
| 101 |
+
from ray.air.integrations.wandb import WandbLoggerCallback
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
parser = add_rllib_example_script_args(
|
| 105 |
+
default_reward=500.0, default_timesteps=10000000, default_iters=2000
|
| 106 |
+
)
|
| 107 |
+
parser.add_argument(
|
| 108 |
+
"--stop-reward-crash",
|
| 109 |
+
type=float,
|
| 110 |
+
default=200.0,
|
| 111 |
+
help="Mean episode return after which the Algorithm should crash.",
|
| 112 |
+
)
|
| 113 |
+
# By default, set `args.checkpoint_freq` to 1 and `args.checkpoint_at_end` to True.
|
| 114 |
+
parser.set_defaults(checkpoint_freq=1, checkpoint_at_end=True)
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
class CrashAfterNIters(RLlibCallback):
|
| 118 |
+
"""Callback that makes the algo crash after a certain avg. return is reached."""
|
| 119 |
+
|
| 120 |
+
def __init__(self):
|
| 121 |
+
super().__init__()
|
| 122 |
+
# We have to delay crashing by one iteration just so the checkpoint still
|
| 123 |
+
# gets created by Tune after(!) we have reached the trigger avg. return.
|
| 124 |
+
self._should_crash = False
|
| 125 |
+
|
| 126 |
+
def on_train_result(self, *, algorithm, metrics_logger, result, **kwargs):
|
| 127 |
+
# We had already reached the mean-return to crash, the last checkpoint written
|
| 128 |
+
# (the one from the previous iteration) should yield that exact avg. return.
|
| 129 |
+
if self._should_crash:
|
| 130 |
+
raise RuntimeError("Intended crash after reaching trigger return.")
|
| 131 |
+
# Reached crashing criterion, crash on next iteration.
|
| 132 |
+
elif result[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN] >= args.stop_reward_crash:
|
| 133 |
+
print(
|
| 134 |
+
"Reached trigger return of "
|
| 135 |
+
f"{result[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN]}"
|
| 136 |
+
)
|
| 137 |
+
self._should_crash = True
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
if __name__ == "__main__":
|
| 141 |
+
args = parser.parse_args()
|
| 142 |
+
|
| 143 |
+
register_env(
|
| 144 |
+
"ma_cart", lambda cfg: MultiAgentCartPole({"num_agents": args.num_agents})
|
| 145 |
+
)
|
| 146 |
+
|
| 147 |
+
# Simple generic config.
|
| 148 |
+
config = (
|
| 149 |
+
get_trainable_cls(args.algo)
|
| 150 |
+
.get_default_config()
|
| 151 |
+
.api_stack(
|
| 152 |
+
enable_rl_module_and_learner=args.enable_new_api_stack,
|
| 153 |
+
enable_env_runner_and_connector_v2=args.enable_new_api_stack,
|
| 154 |
+
)
|
| 155 |
+
.environment("CartPole-v1" if args.num_agents == 0 else "ma_cart")
|
| 156 |
+
.env_runners(create_env_on_local_worker=True)
|
| 157 |
+
.training(lr=0.0001)
|
| 158 |
+
.callbacks(CrashAfterNIters)
|
| 159 |
+
)
|
| 160 |
+
|
| 161 |
+
# Tune config.
|
| 162 |
+
# Need a WandB callback?
|
| 163 |
+
tune_callbacks = []
|
| 164 |
+
if args.wandb_key:
|
| 165 |
+
project = args.wandb_project or (
|
| 166 |
+
args.algo.lower() + "-" + re.sub("\\W+", "-", str(config.env).lower())
|
| 167 |
+
)
|
| 168 |
+
tune_callbacks.append(
|
| 169 |
+
WandbLoggerCallback(
|
| 170 |
+
api_key=args.wandb_key,
|
| 171 |
+
project=args.wandb_project,
|
| 172 |
+
upload_checkpoints=False,
|
| 173 |
+
**({"name": args.wandb_run_name} if args.wandb_run_name else {}),
|
| 174 |
+
)
|
| 175 |
+
)
|
| 176 |
+
|
| 177 |
+
# Setup multi-agent, if required.
|
| 178 |
+
if args.num_agents > 0:
|
| 179 |
+
config.multi_agent(
|
| 180 |
+
policies={
|
| 181 |
+
f"p{aid}": PolicySpec(
|
| 182 |
+
config=AlgorithmConfig.overrides(
|
| 183 |
+
lr=5e-5
|
| 184 |
+
* (aid + 1), # agent 1 has double the learning rate as 0.
|
| 185 |
+
)
|
| 186 |
+
)
|
| 187 |
+
for aid in range(args.num_agents)
|
| 188 |
+
},
|
| 189 |
+
policy_mapping_fn=lambda aid, *a, **kw: f"p{aid}",
|
| 190 |
+
)
|
| 191 |
+
|
| 192 |
+
# Define some stopping criterion. Note that this criterion is an avg episode return
|
| 193 |
+
# to be reached. The stop criterion does not consider the built-in crash we are
|
| 194 |
+
# triggering through our callback.
|
| 195 |
+
stop = {
|
| 196 |
+
f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": args.stop_reward,
|
| 197 |
+
}
|
| 198 |
+
|
| 199 |
+
# Run tune for some iterations and generate checkpoints.
|
| 200 |
+
tuner = tune.Tuner(
|
| 201 |
+
trainable=config.algo_class,
|
| 202 |
+
param_space=config,
|
| 203 |
+
run_config=train.RunConfig(
|
| 204 |
+
callbacks=tune_callbacks,
|
| 205 |
+
checkpoint_config=train.CheckpointConfig(
|
| 206 |
+
checkpoint_frequency=args.checkpoint_freq,
|
| 207 |
+
checkpoint_at_end=args.checkpoint_at_end,
|
| 208 |
+
),
|
| 209 |
+
stop=stop,
|
| 210 |
+
),
|
| 211 |
+
)
|
| 212 |
+
tuner_results = tuner.fit()
|
| 213 |
+
|
| 214 |
+
# Perform a very quick test to make sure our algo (upon restoration) did not lose
|
| 215 |
+
# its ability to perform well in the env.
|
| 216 |
+
# - Extract the best checkpoint.
|
| 217 |
+
metric = f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}"
|
| 218 |
+
best_result = tuner_results.get_best_result(metric=metric, mode="max")
|
| 219 |
+
assert (
|
| 220 |
+
best_result.metrics[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN]
|
| 221 |
+
>= args.stop_reward_crash
|
| 222 |
+
)
|
| 223 |
+
# - Change our config, such that the restored algo will have an env on the local
|
| 224 |
+
# EnvRunner (to perform evaluation) and won't crash anymore (remove the crashing
|
| 225 |
+
# callback).
|
| 226 |
+
config.callbacks(None)
|
| 227 |
+
# Rebuild the algorithm (just for testing purposes).
|
| 228 |
+
test_algo = config.build()
|
| 229 |
+
# Load algo's state from best checkpoint.
|
| 230 |
+
test_algo.restore(best_result.checkpoint)
|
| 231 |
+
# Perform some checks on the restored state.
|
| 232 |
+
assert test_algo.training_iteration > 0
|
| 233 |
+
# Evaluate on the restored algorithm.
|
| 234 |
+
test_eval_results = test_algo.evaluate()
|
| 235 |
+
assert (
|
| 236 |
+
test_eval_results[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN]
|
| 237 |
+
>= args.stop_reward_crash
|
| 238 |
+
), test_eval_results[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN]
|
| 239 |
+
# Train one iteration to make sure, the performance does not collapse (e.g. due
|
| 240 |
+
# to the optimizer weights not having been restored properly).
|
| 241 |
+
test_results = test_algo.train()
|
| 242 |
+
assert (
|
| 243 |
+
test_results[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN] >= args.stop_reward_crash
|
| 244 |
+
), test_results[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN]
|
| 245 |
+
# Stop the test algorithm again.
|
| 246 |
+
test_algo.stop()
|
| 247 |
+
|
| 248 |
+
# Create a new Tuner from the existing experiment path (which contains the tuner's
|
| 249 |
+
# own checkpoint file). Note that even the WandB logging will be continued without
|
| 250 |
+
# creating a new WandB run name.
|
| 251 |
+
restored_tuner = tune.Tuner.restore(
|
| 252 |
+
path=tuner_results.experiment_path,
|
| 253 |
+
trainable=config.algo_class,
|
| 254 |
+
param_space=config,
|
| 255 |
+
# Important to set this to True b/c the previous trial had failed (due to our
|
| 256 |
+
# `CrashAfterNIters` callback).
|
| 257 |
+
resume_errored=True,
|
| 258 |
+
)
|
| 259 |
+
# Continue the experiment exactly where we left off.
|
| 260 |
+
tuner_results = restored_tuner.fit()
|
| 261 |
+
|
| 262 |
+
# Not sure, whether this is really necessary, but we have observed the WandB
|
| 263 |
+
# logger sometimes not logging some of the last iterations. This sleep here might
|
| 264 |
+
# give it enough time to do so.
|
| 265 |
+
time.sleep(20)
|
| 266 |
+
|
| 267 |
+
if args.as_test:
|
| 268 |
+
check_learning_achieved(tuner_results, args.stop_reward, metric=metric)
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/onnx_tf.py
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# @OldAPIStack
import argparse
import numpy as np
import onnxruntime
import os
import shutil

import ray
import ray.rllib.algorithms.ppo as ppo

parser = argparse.ArgumentParser()

parser.add_argument(
    "--framework",
    choices=["tf", "tf2"],
    default="tf2",
    help="The TF framework specifier (either 'tf' or 'tf2').",
)


if __name__ == "__main__":

    args = parser.parse_args()

    # Build a PPO config on the old API stack (ONNX export is only supported
    # there), using the TF framework selected on the command line.
    config = (
        ppo.PPOConfig()
        .api_stack(
            enable_env_runner_and_connector_v2=False,
            enable_rl_module_and_learner=False,
        )
        .env_runners(num_env_runners=1)
        .framework(args.framework)
    )

    # Wipe any leftover export directory from a previous run.
    outdir = "export_tf"
    if os.path.exists(outdir):
        shutil.rmtree(outdir)

    np.random.seed(1234)

    # Fixed test batch used for both the TF and the ONNX inference below.
    test_data = {
        "obs": np.random.uniform(0, 1.0, size=(10, 4)).astype(np.float32),
    }

    # Start Ray and build the (untrained) PPO Algorithm.
    ray.init()
    algo = config.build(env="CartPole-v1")

    # You could train the model here via:
    # algo.train()

    # Run a forward pass through the native TensorFlow model.
    policy = algo.get_policy()
    result_tf, _ = policy.model(test_data)

    # For static-graph TF, the result is a symbolic tensor that must be
    # evaluated inside the policy's session to get a numpy array.
    if args.framework == "tf":
        with policy.get_session().as_default():
            result_tf = result_tf.eval()

    # Export the policy model to ONNX (opset 11).
    policy.export_model(outdir, onnx=11)
    # Equivalent to:
    # algo.export_policy_model(outdir, onnx=11)

    # Load the exported ONNX model.
    exported_model_file = os.path.join(outdir, "model.onnx")

    # Open an onnxruntime inference session on it.
    session = onnxruntime.InferenceSession(exported_model_file, None)

    # Feed the same batch through ONNX; static-graph exports prefix tensor
    # names with the policy id, so rename the keys accordingly.
    onnx_test_data = {f"default_policy/{k}:0": v for k, v in test_data.items()}

    # tf2 (eager) exports use plain input/output names; tf (static graph)
    # exports use fully-qualified graph tensor names.
    if args.framework == "tf2":
        result_onnx = session.run(["fc_out"], {"observations": test_data["obs"]})
    else:
        result_onnx = session.run(
            ["default_policy/model/fc_out/BiasAdd:0"],
            onnx_test_data,
        )

    # Both forward passes must produce identical outputs.
    print("TENSORFLOW", result_tf)
    print("ONNX", result_onnx)

    assert np.allclose(result_tf, result_onnx), "Model outputs are NOT equal. FAILED"
    print("Model outputs are equal. PASSED")
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/onnx_torch.py
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# @OldAPIStack

from packaging.version import Version
import numpy as np
import ray
import ray.rllib.algorithms.ppo as ppo
import onnxruntime
import os
import shutil
import torch

if __name__ == "__main__":
    # Build a PPO config on the old API stack (ONNX export is only supported
    # there), with the torch framework.
    config = (
        ppo.PPOConfig()
        .api_stack(
            enable_env_runner_and_connector_v2=False,
            enable_rl_module_and_learner=False,
        )
        .env_runners(num_env_runners=1)
        .framework("torch")
    )

    # Wipe any leftover export directory from a previous run.
    outdir = "export_torch"
    if os.path.exists(outdir):
        shutil.rmtree(outdir)

    np.random.seed(1234)

    # Fixed test batch used for both the torch and the ONNX inference below.
    test_data = {
        "obs": np.random.uniform(0, 1.0, size=(10, 4)).astype(np.float32),
        "state_ins": np.array([0.0], dtype=np.float32),
    }

    # Start Ray and build the (untrained) PPO Algorithm.
    ray.init()
    algo = config.build(env="CartPole-v1")

    # You could train the model here
    # algo.train()

    # Run a forward pass through the native torch model.
    policy = algo.get_policy()
    result_pytorch, _ = policy.model(
        {
            "obs": torch.tensor(test_data["obs"]),
        }
    )

    # Detach from the graph and convert to a numpy array for comparison.
    result_pytorch = result_pytorch.detach().numpy()

    # Export the policy model to ONNX (opset 11).
    policy.export_model(outdir, onnx=11)
    # Equivalent to:
    # algo.export_policy_model(outdir, onnx=11)

    # Load the exported ONNX model.
    exported_model_file = os.path.join(outdir, "model.onnx")

    # Open an onnxruntime inference session on it.
    session = onnxruntime.InferenceSession(exported_model_file, None)

    # In torch < 1.9.0 the exporter mixes up the second input/output name,
    # so the state input must be fed under the output's name instead.
    if Version(torch.__version__) < Version("1.9.0"):
        test_data["state_outs"] = test_data.pop("state_ins")

    result_onnx = session.run(["output"], test_data)

    # Both forward passes must produce identical outputs.
    print("PYTORCH", result_pytorch)
    print("ONNX", result_onnx)

    assert np.allclose(
        result_pytorch, result_onnx
    ), "Model outputs are NOT equal. FAILED"
    print("Model outputs are equal. PASSED")
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/onnx_torch_lstm.py
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# @OldAPIStack

import numpy as np
import onnxruntime

import ray
import ray.rllib.algorithms.ppo as ppo
from ray.rllib.utils.framework import try_import_torch
from ray.rllib.utils.test_utils import add_rllib_example_script_args, check
from ray.rllib.utils.torch_utils import convert_to_torch_tensor

torch, _ = try_import_torch()

parser = add_rllib_example_script_args()
parser.set_defaults(num_env_runners=1)


class ONNXCompatibleWrapper(torch.nn.Module):
    """Adapter exposing an RLlib LSTM ModelV2 through a flat-args forward().

    `torch.onnx.export` traces a module whose forward takes plain tensors,
    while RLlib's ModelV2 expects (input_dict, state_list, seq_lens). This
    wrapper flattens that interface.
    """

    def __init__(self, original_model):
        super(ONNXCompatibleWrapper, self).__init__()
        self.original_model = original_model

    def forward(self, a, b0, b1, c):
        # Re-pack the two separate state tensors into the list format
        # expected by the wrapped model's forward method.
        b = [b0, b1]
        ret = self.original_model({"obs": a}, b, c)
        # results, state_out_0, state_out_1
        return ret[0], ret[1][0], ret[1][1]


if __name__ == "__main__":
    args = parser.parse_args()

    assert (
        not args.enable_new_api_stack
    ), "Must NOT set --enable-new-api-stack when running this script!"

    ray.init(local_mode=args.local_mode)

    # Build a PPO config with an LSTM-wrapped default model.
    config = (
        ppo.PPOConfig()
        # ONNX is not supported by RLModule API yet.
        .api_stack(
            enable_rl_module_and_learner=args.enable_new_api_stack,
            enable_env_runner_and_connector_v2=args.enable_new_api_stack,
        )
        .environment("CartPole-v1")
        .env_runners(num_env_runners=args.num_env_runners)
        .training(model={"use_lstm": True})
    )

    B = 3
    T = 5
    LSTM_CELL = 256

    # Input data for a python inference forward call.
    test_data_python = {
        "obs": np.random.uniform(0, 1.0, size=(B * T, 4)).astype(np.float32),
        "state_ins": [
            np.random.uniform(0, 1.0, size=(B, LSTM_CELL)).astype(np.float32),
            np.random.uniform(0, 1.0, size=(B, LSTM_CELL)).astype(np.float32),
        ],
        "seq_lens": np.array([T] * B, np.float32),
    }
    # The same data, keyed by the flat ONNX input names.
    test_data_onnx = {
        "obs": test_data_python["obs"],
        "state_in_0": test_data_python["state_ins"][0],
        "state_in_1": test_data_python["state_ins"][1],
        "seq_lens": test_data_python["seq_lens"],
    }

    # Torch-tensor version of the ONNX inputs, used for tracing the export.
    test_data_onnx_input = convert_to_torch_tensor(test_data_onnx)

    # Build the (untrained) PPO Algorithm.
    algo = config.build()

    # You could train the model here
    # algo.train()

    # Run a forward pass through the native torch model.
    policy = algo.get_policy()
    result_pytorch, _ = policy.model(
        {
            "obs": torch.tensor(test_data_python["obs"]),
        },
        [
            torch.tensor(test_data_python["state_ins"][0]),
            torch.tensor(test_data_python["state_ins"][1]),
        ],
        torch.tensor(test_data_python["seq_lens"]),
    )

    # Detach from the graph and convert to a numpy array for comparison.
    result_pytorch = result_pytorch.detach().numpy()

    # Wrap the ModelV2 so ONNX tracing can handle the extra LSTM `state`
    # in-/outputs and the `seq_lens` input.
    onnx_compatible = ONNXCompatibleWrapper(policy.model)
    exported_model_file = "model.onnx"
    input_names = [
        "obs",
        "state_in_0",
        "state_in_1",
        "seq_lens",
    ]

    # Export the wrapped model to ONNX (opset 11).
    torch.onnx.export(
        onnx_compatible,
        tuple(test_data_onnx_input[n] for n in input_names),
        exported_model_file,
        export_params=True,
        opset_version=11,
        do_constant_folding=True,
        input_names=input_names,
        output_names=[
            "output",
            "state_out_0",
            "state_out_1",
        ],
        dynamic_axes={k: {0: "batch_size"} for k in input_names},
    )
    # Open an onnxruntime inference session and run the same batch.
    session = onnxruntime.InferenceSession(exported_model_file, None)
    result_onnx = session.run(["output"], test_data_onnx)

    # Both forward passes must produce identical outputs.
    print("PYTORCH", result_pytorch)
    print("ONNX", result_onnx[0])

    check(result_pytorch, result_onnx[0])
    print("Model outputs are equal. PASSED")
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/restore_1_of_n_agents_from_checkpoint.py
ADDED
|
@@ -0,0 +1,171 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Example demonstrating how to load module weights for 1 of n agents from a checkpoint.

This example:
    - Runs a multi-agent `Pendulum-v1` experiment with >= 2 policies, p0, p1, etc..
    - Saves a checkpoint of the `MultiRLModule` every `--checkpoint-freq`
    iterations.
    - Stops the experiments after the agents reach a combined return of -800.
    - Picks the best checkpoint by combined return and restores p0 from it.
    - Runs a second experiment with the restored `RLModule` for p0 and
    a fresh `RLModule` for the other policies.
    - Stops the second experiment after the agents reach a combined return of -800.


How to run this script
----------------------
`python [script file name].py --enable-new-api-stack --num-agents=2
--checkpoint-freq=20 --checkpoint-at-end`

Control the number of agents and policies (RLModules) via --num-agents and
--num-policies.

Control the number of checkpoints by setting `--checkpoint-freq` to a value > 0.
Note that the checkpoint frequency is per iteration and this example needs at
least a single checkpoint to load the RLModule weights for policy 0.
If `--checkpoint-at-end` is set, a checkpoint will be saved at the end of the
experiment.

For debugging, use the following additional command line options
`--no-tune --num-env-runners=0`
which should allow you to set breakpoints anywhere in the RLlib code and
have the execution stop there for inspection and debugging.

For logging to your WandB account, use:
`--wandb-key=[your WandB API key] --wandb-project=[some project name]
--wandb-run-name=[optional: WandB run name (within the defined project)]`


Results to expect
-----------------
You should expect a reward of -400.0 eventually being achieved by a simple
single PPO policy. In the second run of the experiment, the MultiRLModule weights
for policy 0 are restored from the checkpoint of the first run. The reward for a
single agent should be -400.0 again, but the training time should be shorter
(around 30 iterations instead of 190) due to the fact that one policy is already
an expert from the get go.
"""

from pathlib import Path

from ray.air.constants import TRAINING_ITERATION
from ray.rllib.algorithms.callbacks import DefaultCallbacks
from ray.rllib.core import (
    COMPONENT_LEARNER,
    COMPONENT_LEARNER_GROUP,
    COMPONENT_RL_MODULE,
)
from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig
from ray.rllib.examples.envs.classes.multi_agent import MultiAgentPendulum
from ray.rllib.utils.metrics import (
    ENV_RUNNER_RESULTS,
    EPISODE_RETURN_MEAN,
    NUM_ENV_STEPS_SAMPLED_LIFETIME,
)
from ray.rllib.utils.numpy import convert_to_numpy
from ray.rllib.utils.test_utils import (
    add_rllib_example_script_args,
    check,
    run_rllib_example_script_experiment,
)
from ray.tune.registry import get_trainable_cls, register_env

parser = add_rllib_example_script_args(
    # Pendulum-v1 sum of 2 agents (each agent reaches -250).
    default_reward=-500.0,
)
parser.set_defaults(
    enable_new_api_stack=True,
    checkpoint_freq=1,
    num_agents=2,
)
# TODO (sven): This arg is currently ignored (hard-set to 2).
parser.add_argument("--num-policies", type=int, default=2)


if __name__ == "__main__":
    args = parser.parse_args()

    # Register our environment with tune.
    if args.num_agents > 1:
        register_env(
            "env",
            lambda _: MultiAgentPendulum(config={"num_agents": args.num_agents}),
        )
    else:
        raise ValueError(
            f"`num_agents` must be > 1, but is {args.num_agents}."
            "Read the script docstring for more information."
        )

    # At least one checkpoint is required to restore policy 0's weights later.
    assert args.checkpoint_freq > 0, (
        "This example requires at least one checkpoint to load the RLModule "
        "weights for policy 0."
    )

    base_config = (
        get_trainable_cls(args.algo)
        .get_default_config()
        .environment("env")
        .training(
            train_batch_size_per_learner=512,
            minibatch_size=64,
            lambda_=0.1,
            gamma=0.95,
            lr=0.0003,
            vf_clip_param=10.0,
        )
        .rl_module(
            model_config=DefaultModelConfig(fcnet_activation="relu"),
        )
    )

    # Add a simple multi-agent setup: one policy per agent, mapped by agent id.
    if args.num_agents > 0:
        base_config.multi_agent(
            policies={f"p{i}" for i in range(args.num_agents)},
            policy_mapping_fn=lambda aid, *a, **kw: f"p{aid}",
        )

    # Augment the base config with further settings and train the agents.
    results = run_rllib_example_script_experiment(base_config, args, keep_ray_up=True)

    # Locate policy 0's RLModule state inside the best checkpoint.
    chkpt_path = results.get_best_result().checkpoint.path
    p_0_module_state_path = (
        Path(chkpt_path)  # <- algorithm's checkpoint dir
        / COMPONENT_LEARNER_GROUP  # <- learner group
        / COMPONENT_LEARNER  # <- learner
        / COMPONENT_RL_MODULE  # <- MultiRLModule
        / "p0"  # <- (single) RLModule
    )

    class LoadP0OnAlgoInitCallback(DefaultCallbacks):
        """Restores p0's weights from the checkpoint when the new algo starts."""

        def on_algorithm_init(self, *, algorithm, **kwargs):
            module_p0 = algorithm.get_module("p0")
            weight_before = convert_to_numpy(next(iter(module_p0.parameters())))
            algorithm.restore_from_path(
                p_0_module_state_path,
                component=(
                    COMPONENT_LEARNER_GROUP
                    + "/"
                    + COMPONENT_LEARNER
                    + "/"
                    + COMPONENT_RL_MODULE
                    + "/p0"
                ),
            )
            # Make sure weights were actually updated by the restore.
            weight_after = convert_to_numpy(next(iter(module_p0.parameters())))
            check(weight_before, weight_after, false=True)

    base_config.callbacks(LoadP0OnAlgoInitCallback)

    # Define stopping criteria for the second run.
    stop = {
        f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": -800.0,
        f"{ENV_RUNNER_RESULTS}/{NUM_ENV_STEPS_SAMPLED_LIFETIME}": 100000,
        TRAINING_ITERATION: 100,
    }

    # Run the experiment again with the restored MultiRLModule.
    run_rllib_example_script_experiment(base_config, args, stop=stop)
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/curiosity/__init__.py
ADDED
|
File without changes
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/curiosity/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (201 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/curiosity/__pycache__/count_based_curiosity.cpython-311.pyc
ADDED
|
Binary file (5.61 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/curiosity/__pycache__/euclidian_distance_based_curiosity.cpython-311.pyc
ADDED
|
Binary file (5.46 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/curiosity/__pycache__/intrinsic_curiosity_model_based_curiosity.cpython-311.pyc
ADDED
|
Binary file (11.1 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/curiosity/count_based_curiosity.py
ADDED
|
@@ -0,0 +1,137 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Example of using a count-based curiosity mechanism to learn in sparse-rewards envs.

This example:
    - demonstrates how to define your own count-based curiosity ConnectorV2 piece
    that computes intrinsic rewards based on simple observation counts and adds these
    intrinsic rewards to the "main" (extrinsic) rewards.
    - shows how this connector piece overrides the main (extrinsic) rewards in the
    episode and thus demonstrates how to do reward shaping in general with RLlib.
    - shows how to plug this connector piece into your algorithm's config.
    - uses Tune and RLlib to learn the env described above and compares 2
    algorithms, one that does use curiosity vs one that does not.

We use a FrozenLake (sparse reward) environment with a map size of 8x8 and a time step
limit of 14 to make it almost impossible for a non-curiosity based policy to learn.


How to run this script
----------------------
`python [script file name].py --enable-new-api-stack`

Use the `--no-curiosity` flag to disable curiosity learning and force your policy
to be trained on the task w/o the use of intrinsic rewards. With this option, the
algorithm should NOT succeed.

For debugging, use the following additional command line options
`--no-tune --num-env-runners=0`
which should allow you to set breakpoints anywhere in the RLlib code and
have the execution stop there for inspection and debugging.

For logging to your WandB account, use:
`--wandb-key=[your WandB API key] --wandb-project=[some project name]
--wandb-run-name=[optional: WandB run name (within the defined project)]`


Results to expect
-----------------
In the console output, you can see that only a PPO policy that uses curiosity can
actually learn.

Policy using count-based curiosity:
+-------------------------------+------------+--------+------------------+
| Trial name                    | status     |   iter |   total time (s) |
|                               |            |        |                  |
|-------------------------------+------------+--------+------------------+
| PPO_FrozenLake-v1_109de_00000 | TERMINATED |     48 |            44.46 |
+-------------------------------+------------+--------+------------------+
+------------------------+-------------------------+------------------------+
|   episode_return_mean  |   num_episodes_lifetime |   num_env_steps_traine |
|                        |                         |             d_lifetime |
|------------------------+-------------------------+------------------------|
|                   0.99 |                   12960 |                 194000 |
+------------------------+-------------------------+------------------------+

Policy NOT using curiosity:
[DOES NOT LEARN AT ALL]
"""
from ray.rllib.connectors.env_to_module import FlattenObservations
from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig
from ray.rllib.examples.connectors.classes.count_based_curiosity import (
    CountBasedCuriosity,
)
from ray.rllib.utils.test_utils import (
    add_rllib_example_script_args,
    run_rllib_example_script_experiment,
)
from ray.tune.registry import get_trainable_cls

parser = add_rllib_example_script_args(
    default_reward=0.99, default_iters=200, default_timesteps=1000000
)
parser.set_defaults(enable_new_api_stack=True)
parser.add_argument(
    "--intrinsic-reward-coeff",
    type=float,
    default=1.0,
    help="The weight with which to multiply intrinsic rewards before adding them to "
    "the extrinsic ones (default is 1.0).",
)
parser.add_argument(
    "--no-curiosity",
    action="store_true",
    help="Whether to NOT use count-based curiosity.",
)

ENV_OPTIONS = {
    "is_slippery": False,
    # Use this hard-to-solve 8x8 map with lots of holes (H) to fall into and only very
    # few valid paths from the starting state (S) to the goal state (G).
    "desc": [
        "SFFHFFFH",
        "FFFHFFFF",
        "FFFHHFFF",
        "FFFFFFFH",
        "HFFHFFFF",
        "HHFHFFHF",
        "FFFHFHHF",
        "FHFFFFFG",
    ],
    # Limit the number of steps the agent is allowed to make in the env to
    # make it almost impossible to learn without (count-based) curiosity.
    "max_episode_steps": 14,
}


if __name__ == "__main__":
    args = parser.parse_args()

    base_config = (
        get_trainable_cls(args.algo)
        .get_default_config()
        .environment(
            "FrozenLake-v1",
            env_config=ENV_OPTIONS,
        )
        .env_runners(
            num_envs_per_env_runner=5,
            # Flatten discrete observations (into one-hot vectors).
            env_to_module_connector=lambda env: FlattenObservations(),
        )
        .training(
            # The main code in this example: We add the `CountBasedCuriosity` connector
            # piece to our Learner connector pipeline.
            # This pipeline is fed with collected episodes (either directly from the
            # EnvRunners in on-policy fashion or from a replay buffer) and converts
            # these episodes into the final train batch. The added piece computes
            # intrinsic rewards based on simple observation counts and add them to
            # the "main" (extrinsic) rewards.
            learner_connector=(
                None if args.no_curiosity else lambda *ags, **kw: CountBasedCuriosity()
            ),
            num_epochs=10,
            vf_loss_coeff=0.01,
        )
        .rl_module(model_config=DefaultModelConfig(vf_share_layers=True))
    )

    run_rllib_example_script_experiment(base_config, args)
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/curiosity/euclidian_distance_based_curiosity.py
ADDED
|
@@ -0,0 +1,127 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Example of a euclidian-distance curiosity mechanism to learn in sparse-rewards envs.
|
| 2 |
+
|
| 3 |
+
This example:
|
| 4 |
+
- demonstrates how to define your own euclidian-distance-based curiosity ConnectorV2
|
| 5 |
+
piece that computes intrinsic rewards based on the delta between incoming
|
| 6 |
+
observations and some set of already stored (prior) observations. Thereby, the
|
| 7 |
+
further away the incoming observation is from the already stored ones, the higher
|
| 8 |
+
its corresponding intrinsic reward.
|
| 9 |
+
- shows how this connector piece adds the intrinsic reward to the corresponding
|
| 10 |
+
"main" (extrinsic) reward and overrides the value in the "rewards" key in the
|
| 11 |
+
episode. It thus demonstrates how to do reward shaping in general with RLlib.
|
| 12 |
+
- shows how to plug this connector piece into your algorithm's config.
|
| 13 |
+
- uses Tune and RLlib to learn the env described above and compares 2
|
| 14 |
+
algorithms, one that does use curiosity vs one that does not.
|
| 15 |
+
|
| 16 |
+
We use the MountainCar-v0 environment, a sparse-reward env that is very hard to learn
|
| 17 |
+
for a regular PPO algorithm.
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
How to run this script
|
| 21 |
+
----------------------
|
| 22 |
+
`python [script file name].py --enable-new-api-stack`
|
| 23 |
+
|
| 24 |
+
Use the `--no-curiosity` flag to disable curiosity learning and force your policy
|
| 25 |
+
to be trained on the task w/o the use of intrinsic rewards. With this option, the
|
| 26 |
+
algorithm should NOT succeed.
|
| 27 |
+
|
| 28 |
+
For debugging, use the following additional command line options
|
| 29 |
+
`--no-tune --num-env-runners=0`
|
| 30 |
+
which should allow you to set breakpoints anywhere in the RLlib code and
|
| 31 |
+
have the execution stop there for inspection and debugging.
|
| 32 |
+
|
| 33 |
+
For logging to your WandB account, use:
|
| 34 |
+
`--wandb-key=[your WandB API key] --wandb-project=[some project name]
|
| 35 |
+
--wandb-run-name=[optional: WandB run name (within the defined project)]`
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
Results to expect
|
| 39 |
+
-----------------
|
| 40 |
+
In the console output, you can see that only a PPO policy that uses curiosity can
|
| 41 |
+
actually learn.
|
| 42 |
+
|
| 43 |
+
Policy using count-based curiosity:
|
| 44 |
+
+-------------------------------+------------+--------+------------------+
|
| 45 |
+
| Trial name | status | iter | total time (s) |
|
| 46 |
+
| | | | |
|
| 47 |
+
|-------------------------------+------------+--------+------------------+
|
| 48 |
+
| PPO_FrozenLake-v1_109de_00000 | TERMINATED | 48 | 44.46 |
|
| 49 |
+
+-------------------------------+------------+--------+------------------+
|
| 50 |
+
+------------------------+-------------------------+------------------------+
|
| 51 |
+
| episode_return_mean | num_episodes_lifetime | num_env_steps_traine |
|
| 52 |
+
| | | d_lifetime |
|
| 53 |
+
|------------------------+-------------------------+------------------------|
|
| 54 |
+
| 0.99 | 12960 | 194000 |
|
| 55 |
+
+------------------------+-------------------------+------------------------+
|
| 56 |
+
|
| 57 |
+
Policy NOT using curiosity:
|
| 58 |
+
[DOES NOT LEARN AT ALL]
|
| 59 |
+
"""
|
| 60 |
+
from ray.rllib.connectors.env_to_module import MeanStdFilter
|
| 61 |
+
from ray.rllib.examples.connectors.classes.euclidian_distance_based_curiosity import (
|
| 62 |
+
EuclidianDistanceBasedCuriosity,
|
| 63 |
+
)
|
| 64 |
+
from ray.rllib.utils.test_utils import (
|
| 65 |
+
add_rllib_example_script_args,
|
| 66 |
+
run_rllib_example_script_experiment,
|
| 67 |
+
)
|
| 68 |
+
from ray.tune.registry import get_trainable_cls
|
| 69 |
+
|
| 70 |
+
# TODO (sven): SB3's PPO learns MountainCar-v0 until a reward of ~-110.
|
| 71 |
+
# We might have to play around some more with different initializations, etc..
|
| 72 |
+
# to get to these results as well.
|
| 73 |
+
parser = add_rllib_example_script_args(
|
| 74 |
+
default_reward=-140.0, default_iters=2000, default_timesteps=1000000
|
| 75 |
+
)
|
| 76 |
+
parser.set_defaults(
|
| 77 |
+
enable_new_api_stack=True,
|
| 78 |
+
num_env_runners=4,
|
| 79 |
+
)
|
| 80 |
+
parser.add_argument(
|
| 81 |
+
"--intrinsic-reward-coeff",
|
| 82 |
+
type=float,
|
| 83 |
+
default=0.0001,
|
| 84 |
+
help="The weight with which to multiply intrinsic rewards before adding them to "
|
| 85 |
+
"the extrinsic ones (default is 0.0001).",
|
| 86 |
+
)
|
| 87 |
+
parser.add_argument(
|
| 88 |
+
"--no-curiosity",
|
| 89 |
+
action="store_true",
|
| 90 |
+
help="Whether to NOT use count-based curiosity.",
|
| 91 |
+
)
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
if __name__ == "__main__":
|
| 95 |
+
args = parser.parse_args()
|
| 96 |
+
|
| 97 |
+
base_config = (
|
| 98 |
+
get_trainable_cls(args.algo)
|
| 99 |
+
.get_default_config()
|
| 100 |
+
.environment("MountainCar-v0")
|
| 101 |
+
.env_runners(
|
| 102 |
+
env_to_module_connector=lambda env: MeanStdFilter(),
|
| 103 |
+
num_envs_per_env_runner=5,
|
| 104 |
+
)
|
| 105 |
+
.training(
|
| 106 |
+
# The main code in this example: We add the
|
| 107 |
+
# `EuclidianDistanceBasedCuriosity` connector piece to our Learner connector
|
| 108 |
+
# pipeline. This pipeline is fed with collected episodes (either directly
|
| 109 |
+
# from the EnvRunners in on-policy fashion or from a replay buffer) and
|
| 110 |
+
# converts these episodes into the final train batch. The added piece
|
| 111 |
+
# computes intrinsic rewards based on simple observation counts and add them
|
| 112 |
+
# to the "main" (extrinsic) rewards.
|
| 113 |
+
learner_connector=(
|
| 114 |
+
None
|
| 115 |
+
if args.no_curiosity
|
| 116 |
+
else lambda *ags, **kw: EuclidianDistanceBasedCuriosity()
|
| 117 |
+
),
|
| 118 |
+
# train_batch_size_per_learner=512,
|
| 119 |
+
grad_clip=20.0,
|
| 120 |
+
entropy_coeff=0.003,
|
| 121 |
+
gamma=0.99,
|
| 122 |
+
lr=0.0002,
|
| 123 |
+
lambda_=0.98,
|
| 124 |
+
)
|
| 125 |
+
)
|
| 126 |
+
|
| 127 |
+
run_rllib_example_script_experiment(base_config, args)
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/curiosity/intrinsic_curiosity_model_based_curiosity.py
ADDED
|
@@ -0,0 +1,313 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Example of implementing and training with an intrinsic curiosity model (ICM).
|
| 2 |
+
|
| 3 |
+
This type of curiosity-based learning trains a simplified model of the environment
|
| 4 |
+
dynamics based on three networks:
|
| 5 |
+
1) Embedding observations into latent space ("feature" network).
|
| 6 |
+
2) Predicting the action, given two consecutive embedded observations
|
| 7 |
+
("inverse" network).
|
| 8 |
+
3) Predicting the next embedded obs, given an obs and action
|
| 9 |
+
("forward" network).
|
| 10 |
+
|
| 11 |
+
The less the ICM is able to predict the actually observed next feature vector,
|
| 12 |
+
given obs and action (through the forwards network), the larger the
|
| 13 |
+
"intrinsic reward", which will be added to the extrinsic reward of the agent.
|
| 14 |
+
|
| 15 |
+
Therefore, if a state transition was unexpected, the agent becomes
|
| 16 |
+
"curious" and will further explore this transition leading to better
|
| 17 |
+
exploration in sparse rewards environments.
|
| 18 |
+
|
| 19 |
+
For more details, see here:
|
| 20 |
+
[1] Curiosity-driven Exploration by Self-supervised Prediction
|
| 21 |
+
Pathak, Agrawal, Efros, and Darrell - UC Berkeley - ICML 2017.
|
| 22 |
+
https://arxiv.org/pdf/1705.05363.pdf
|
| 23 |
+
|
| 24 |
+
This example:
|
| 25 |
+
- demonstrates how to write a custom RLModule, representing the ICM from the paper
|
| 26 |
+
above. Note that this custom RLModule does not belong to any individual agent.
|
| 27 |
+
- demonstrates how to write a custom (PPO) TorchLearner that a) adds the ICM to its
|
| 28 |
+
MultiRLModule, b) trains the regular PPO Policy plus the ICM module, using the
|
| 29 |
+
PPO parent loss and the ICM's RLModule's own loss function.
|
| 30 |
+
|
| 31 |
+
We use a FrozenLake (sparse reward) environment with a custom map size of 12x12 and a
|
| 32 |
+
hard time step limit of 22 to make it almost impossible for a non-curiosity based
|
| 33 |
+
learners to learn a good policy.
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
How to run this script
|
| 37 |
+
----------------------
|
| 38 |
+
`python [script file name].py --enable-new-api-stack`
|
| 39 |
+
|
| 40 |
+
Use the `--no-curiosity` flag to disable curiosity learning and force your policy
|
| 41 |
+
to be trained on the task w/o the use of intrinsic rewards. With this option, the
|
| 42 |
+
algorithm should NOT succeed.
|
| 43 |
+
|
| 44 |
+
For debugging, use the following additional command line options
|
| 45 |
+
`--no-tune --num-env-runners=0`
|
| 46 |
+
which should allow you to set breakpoints anywhere in the RLlib code and
|
| 47 |
+
have the execution stop there for inspection and debugging.
|
| 48 |
+
|
| 49 |
+
For logging to your WandB account, use:
|
| 50 |
+
`--wandb-key=[your WandB API key] --wandb-project=[some project name]
|
| 51 |
+
--wandb-run-name=[optional: WandB run name (within the defined project)]`
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
Results to expect
|
| 55 |
+
-----------------
|
| 56 |
+
In the console output, you can see that only a PPO policy that uses curiosity can
|
| 57 |
+
actually learn.
|
| 58 |
+
|
| 59 |
+
Policy using ICM-based curiosity:
|
| 60 |
+
+-------------------------------+------------+-----------------+--------+
|
| 61 |
+
| Trial name | status | loc | iter |
|
| 62 |
+
|-------------------------------+------------+-----------------+--------+
|
| 63 |
+
| PPO_FrozenLake-v1_52ab2_00000 | TERMINATED | 127.0.0.1:73318 | 392 |
|
| 64 |
+
+-------------------------------+------------+-----------------+--------+
|
| 65 |
+
+------------------+--------+----------+--------------------+
|
| 66 |
+
| total time (s) | ts | reward | episode_len_mean |
|
| 67 |
+
|------------------+--------+----------+--------------------|
|
| 68 |
+
| 236.652 | 786000 | 1.0 | 22.0 |
|
| 69 |
+
+------------------+--------+----------+--------------------+
|
| 70 |
+
|
| 71 |
+
Policy NOT using curiosity:
|
| 72 |
+
[DOES NOT LEARN AT ALL]
|
| 73 |
+
"""
|
| 74 |
+
from collections import defaultdict
|
| 75 |
+
|
| 76 |
+
import numpy as np
|
| 77 |
+
|
| 78 |
+
from ray import tune
|
| 79 |
+
from ray.rllib.algorithms.algorithm_config import AlgorithmConfig
|
| 80 |
+
from ray.rllib.callbacks.callbacks import RLlibCallback
|
| 81 |
+
from ray.rllib.connectors.env_to_module import FlattenObservations
|
| 82 |
+
from ray.rllib.examples.learners.classes.intrinsic_curiosity_learners import (
|
| 83 |
+
DQNTorchLearnerWithCuriosity,
|
| 84 |
+
PPOTorchLearnerWithCuriosity,
|
| 85 |
+
)
|
| 86 |
+
from ray.rllib.core import DEFAULT_MODULE_ID
|
| 87 |
+
from ray.rllib.core.rl_module.multi_rl_module import MultiRLModuleSpec
|
| 88 |
+
from ray.rllib.core.rl_module.rl_module import RLModuleSpec
|
| 89 |
+
from ray.rllib.examples.learners.classes.intrinsic_curiosity_learners import (
|
| 90 |
+
ICM_MODULE_ID,
|
| 91 |
+
)
|
| 92 |
+
from ray.rllib.examples.rl_modules.classes.intrinsic_curiosity_model_rlm import (
|
| 93 |
+
IntrinsicCuriosityModel,
|
| 94 |
+
)
|
| 95 |
+
from ray.rllib.utils.metrics import (
|
| 96 |
+
ENV_RUNNER_RESULTS,
|
| 97 |
+
EPISODE_RETURN_MEAN,
|
| 98 |
+
NUM_ENV_STEPS_SAMPLED_LIFETIME,
|
| 99 |
+
)
|
| 100 |
+
from ray.rllib.utils.test_utils import (
|
| 101 |
+
add_rllib_example_script_args,
|
| 102 |
+
run_rllib_example_script_experiment,
|
| 103 |
+
)
|
| 104 |
+
|
| 105 |
+
parser = add_rllib_example_script_args(
|
| 106 |
+
default_iters=2000,
|
| 107 |
+
default_timesteps=10000000,
|
| 108 |
+
default_reward=0.9,
|
| 109 |
+
)
|
| 110 |
+
parser.set_defaults(enable_new_api_stack=True)
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
class MeasureMaxDistanceToStart(RLlibCallback):
|
| 114 |
+
"""Callback measuring the dist of the agent to its start position in FrozenLake-v1.
|
| 115 |
+
|
| 116 |
+
Makes the naive assumption that the start position ("S") is in the upper left
|
| 117 |
+
corner of the used map.
|
| 118 |
+
Uses the MetricsLogger to record the (euclidian) distance value.
|
| 119 |
+
"""
|
| 120 |
+
|
| 121 |
+
def __init__(self):
|
| 122 |
+
super().__init__()
|
| 123 |
+
self.max_dists = defaultdict(float)
|
| 124 |
+
self.max_dists_lifetime = 0.0
|
| 125 |
+
|
| 126 |
+
def on_episode_step(
|
| 127 |
+
self,
|
| 128 |
+
*,
|
| 129 |
+
episode,
|
| 130 |
+
env_runner,
|
| 131 |
+
metrics_logger,
|
| 132 |
+
env,
|
| 133 |
+
env_index,
|
| 134 |
+
rl_module,
|
| 135 |
+
**kwargs,
|
| 136 |
+
):
|
| 137 |
+
num_rows = env.envs[0].unwrapped.nrow
|
| 138 |
+
num_cols = env.envs[0].unwrapped.ncol
|
| 139 |
+
obs = np.argmax(episode.get_observations(-1))
|
| 140 |
+
row = obs // num_cols
|
| 141 |
+
col = obs % num_rows
|
| 142 |
+
curr_dist = (row**2 + col**2) ** 0.5
|
| 143 |
+
if curr_dist > self.max_dists[episode.id_]:
|
| 144 |
+
self.max_dists[episode.id_] = curr_dist
|
| 145 |
+
|
| 146 |
+
def on_episode_end(
|
| 147 |
+
self,
|
| 148 |
+
*,
|
| 149 |
+
episode,
|
| 150 |
+
env_runner,
|
| 151 |
+
metrics_logger,
|
| 152 |
+
env,
|
| 153 |
+
env_index,
|
| 154 |
+
rl_module,
|
| 155 |
+
**kwargs,
|
| 156 |
+
):
|
| 157 |
+
# Compute current maximum distance across all running episodes
|
| 158 |
+
# (including the just ended one).
|
| 159 |
+
max_dist = max(self.max_dists.values())
|
| 160 |
+
metrics_logger.log_value(
|
| 161 |
+
key="max_dist_travelled_across_running_episodes",
|
| 162 |
+
value=max_dist,
|
| 163 |
+
window=10,
|
| 164 |
+
)
|
| 165 |
+
if max_dist > self.max_dists_lifetime:
|
| 166 |
+
self.max_dists_lifetime = max_dist
|
| 167 |
+
del self.max_dists[episode.id_]
|
| 168 |
+
|
| 169 |
+
def on_sample_end(
|
| 170 |
+
self,
|
| 171 |
+
*,
|
| 172 |
+
env_runner,
|
| 173 |
+
metrics_logger,
|
| 174 |
+
samples,
|
| 175 |
+
**kwargs,
|
| 176 |
+
):
|
| 177 |
+
metrics_logger.log_value(
|
| 178 |
+
key="max_dist_travelled_lifetime",
|
| 179 |
+
value=self.max_dists_lifetime,
|
| 180 |
+
window=1,
|
| 181 |
+
)
|
| 182 |
+
|
| 183 |
+
|
| 184 |
+
if __name__ == "__main__":
|
| 185 |
+
args = parser.parse_args()
|
| 186 |
+
|
| 187 |
+
if args.algo not in ["DQN", "PPO"]:
|
| 188 |
+
raise ValueError(
|
| 189 |
+
"Curiosity example only implemented for either DQN or PPO! See the "
|
| 190 |
+
)
|
| 191 |
+
|
| 192 |
+
base_config = (
|
| 193 |
+
tune.registry.get_trainable_cls(args.algo)
|
| 194 |
+
.get_default_config()
|
| 195 |
+
.environment(
|
| 196 |
+
"FrozenLake-v1",
|
| 197 |
+
env_config={
|
| 198 |
+
# Use a 12x12 map.
|
| 199 |
+
"desc": [
|
| 200 |
+
"SFFFFFFFFFFF",
|
| 201 |
+
"FFFFFFFFFFFF",
|
| 202 |
+
"FFFFFFFFFFFF",
|
| 203 |
+
"FFFFFFFFFFFF",
|
| 204 |
+
"FFFFFFFFFFFF",
|
| 205 |
+
"FFFFFFFFFFFF",
|
| 206 |
+
"FFFFFFFFFFFF",
|
| 207 |
+
"FFFFFFFFFFFF",
|
| 208 |
+
"FFFFFFFFFFFF",
|
| 209 |
+
"FFFFFFFFFFFF",
|
| 210 |
+
"FFFFFFFFFFFF",
|
| 211 |
+
"FFFFFFFFFFFG",
|
| 212 |
+
],
|
| 213 |
+
"is_slippery": False,
|
| 214 |
+
# Limit the number of steps the agent is allowed to make in the env to
|
| 215 |
+
# make it almost impossible to learn without the curriculum.
|
| 216 |
+
"max_episode_steps": 22,
|
| 217 |
+
},
|
| 218 |
+
)
|
| 219 |
+
.callbacks(MeasureMaxDistanceToStart)
|
| 220 |
+
.env_runners(
|
| 221 |
+
num_envs_per_env_runner=5 if args.algo == "PPO" else 1,
|
| 222 |
+
env_to_module_connector=lambda env: FlattenObservations(),
|
| 223 |
+
)
|
| 224 |
+
.training(
|
| 225 |
+
learner_config_dict={
|
| 226 |
+
# Intrinsic reward coefficient.
|
| 227 |
+
"intrinsic_reward_coeff": 0.05,
|
| 228 |
+
# Forward loss weight (vs inverse dynamics loss). Total ICM loss is:
|
| 229 |
+
# L(total ICM) = (
|
| 230 |
+
# `forward_loss_weight` * L(forward)
|
| 231 |
+
# + (1.0 - `forward_loss_weight`) * L(inverse_dyn)
|
| 232 |
+
# )
|
| 233 |
+
"forward_loss_weight": 0.2,
|
| 234 |
+
}
|
| 235 |
+
)
|
| 236 |
+
.rl_module(
|
| 237 |
+
rl_module_spec=MultiRLModuleSpec(
|
| 238 |
+
rl_module_specs={
|
| 239 |
+
# The "main" RLModule (policy) to be trained by our algo.
|
| 240 |
+
DEFAULT_MODULE_ID: RLModuleSpec(
|
| 241 |
+
**(
|
| 242 |
+
{"model_config": {"vf_share_layers": True}}
|
| 243 |
+
if args.algo == "PPO"
|
| 244 |
+
else {}
|
| 245 |
+
),
|
| 246 |
+
),
|
| 247 |
+
# The intrinsic curiosity model.
|
| 248 |
+
ICM_MODULE_ID: RLModuleSpec(
|
| 249 |
+
module_class=IntrinsicCuriosityModel,
|
| 250 |
+
# Only create the ICM on the Learner workers, NOT on the
|
| 251 |
+
# EnvRunners.
|
| 252 |
+
learner_only=True,
|
| 253 |
+
# Configure the architecture of the ICM here.
|
| 254 |
+
model_config={
|
| 255 |
+
"feature_dim": 288,
|
| 256 |
+
"feature_net_hiddens": (256, 256),
|
| 257 |
+
"feature_net_activation": "relu",
|
| 258 |
+
"inverse_net_hiddens": (256, 256),
|
| 259 |
+
"inverse_net_activation": "relu",
|
| 260 |
+
"forward_net_hiddens": (256, 256),
|
| 261 |
+
"forward_net_activation": "relu",
|
| 262 |
+
},
|
| 263 |
+
),
|
| 264 |
+
}
|
| 265 |
+
),
|
| 266 |
+
# Use a different learning rate for training the ICM.
|
| 267 |
+
algorithm_config_overrides_per_module={
|
| 268 |
+
ICM_MODULE_ID: AlgorithmConfig.overrides(lr=0.0005)
|
| 269 |
+
},
|
| 270 |
+
)
|
| 271 |
+
)
|
| 272 |
+
|
| 273 |
+
# Set PPO-specific hyper-parameters.
|
| 274 |
+
if args.algo == "PPO":
|
| 275 |
+
base_config.training(
|
| 276 |
+
num_epochs=6,
|
| 277 |
+
# Plug in the correct Learner class.
|
| 278 |
+
learner_class=PPOTorchLearnerWithCuriosity,
|
| 279 |
+
train_batch_size_per_learner=2000,
|
| 280 |
+
lr=0.0003,
|
| 281 |
+
)
|
| 282 |
+
elif args.algo == "DQN":
|
| 283 |
+
base_config.training(
|
| 284 |
+
# Plug in the correct Learner class.
|
| 285 |
+
learner_class=DQNTorchLearnerWithCuriosity,
|
| 286 |
+
train_batch_size_per_learner=128,
|
| 287 |
+
lr=0.00075,
|
| 288 |
+
replay_buffer_config={
|
| 289 |
+
"type": "PrioritizedEpisodeReplayBuffer",
|
| 290 |
+
"capacity": 500000,
|
| 291 |
+
"alpha": 0.6,
|
| 292 |
+
"beta": 0.4,
|
| 293 |
+
},
|
| 294 |
+
# Epsilon exploration schedule for DQN.
|
| 295 |
+
epsilon=[[0, 1.0], [500000, 0.05]],
|
| 296 |
+
n_step=(3, 5),
|
| 297 |
+
double_q=True,
|
| 298 |
+
dueling=True,
|
| 299 |
+
)
|
| 300 |
+
|
| 301 |
+
success_key = f"{ENV_RUNNER_RESULTS}/max_dist_travelled_across_running_episodes"
|
| 302 |
+
stop = {
|
| 303 |
+
success_key: 12.0,
|
| 304 |
+
f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": args.stop_reward,
|
| 305 |
+
NUM_ENV_STEPS_SAMPLED_LIFETIME: args.stop_timesteps,
|
| 306 |
+
}
|
| 307 |
+
|
| 308 |
+
run_rllib_example_script_experiment(
|
| 309 |
+
base_config,
|
| 310 |
+
args,
|
| 311 |
+
stop=stop,
|
| 312 |
+
success_metric={success_key: stop[success_key]},
|
| 313 |
+
)
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/ray_tune/__init__.py
ADDED
|
File without changes
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/ray_tune/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (200 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/ray_tune/__pycache__/custom_experiment.cpython-311.pyc
ADDED
|
Binary file (8.52 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/ray_tune/__pycache__/custom_logger.cpython-311.pyc
ADDED
|
Binary file (4.82 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/ray_tune/__pycache__/custom_progress_reporter.cpython-311.pyc
ADDED
|
Binary file (4.43 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/ray_tune/custom_experiment.py
ADDED
|
@@ -0,0 +1,183 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Example of a custom Ray Tune experiment wrapping an RLlib Algorithm.
|
| 2 |
+
|
| 3 |
+
You should only use such a customized workflow if the following conditions apply:
|
| 4 |
+
- You know exactly what you are doing :)
|
| 5 |
+
- Configuring an existing RLlib Algorithm (e.g. PPO) via its AlgorithmConfig
|
| 6 |
+
is not sufficient and doesn't allow you to shape the Algorithm into behaving the way
|
| 7 |
+
you'd like. Note that for complex, custom evaluation procedures there are many
|
| 8 |
+
AlgorithmConfig options one can use (for more details, see:
|
| 9 |
+
https://github.com/ray-project/ray/blob/master/rllib/examples/evaluation/custom_evaluation.py). # noqa
|
| 10 |
+
- Subclassing an RLlib Algorithm class and overriding the new class' `training_step`
|
| 11 |
+
method is not sufficient and doesn't allow you to define the algorithm's execution
|
| 12 |
+
logic the way you'd like. See an example here on how to customize the algorithm's
|
| 13 |
+
`training_step()` method:
|
| 14 |
+
https://github.com/ray-project/ray/blob/master/rllib/examples/algorithm/custom_training_step_on_and_off_policy_combined.py # noqa
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
How to run this script
|
| 18 |
+
----------------------
|
| 19 |
+
`python [script file name].py`
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
Results to expect
|
| 23 |
+
-----------------
|
| 24 |
+
You should see the following output (at the end of the experiment) in your console:
|
| 25 |
+
|
| 26 |
+
╭───────────────────────────────────────────────────────────────────────────────────────
|
| 27 |
+
│ Trial name status iter total time (s) ts
|
| 28 |
+
├───────────────────────────────────────────────────────────────────────────────────────
|
| 29 |
+
│ my_experiment_CartPole-v1_77083_00000 TERMINATED 10 36.7799 60000
|
| 30 |
+
╰───────────────────────────────────────────────────────────────────────────────────────
|
| 31 |
+
╭───────────────────────────────────────────────────────╮
|
| 32 |
+
│ reward episode_len_mean episodes_this_iter │
|
| 33 |
+
├───────────────────────────────────────────────────────┤
|
| 34 |
+
│ 254.821 254.821 12 │
|
| 35 |
+
╰───────────────────────────────────────────────────────╯
|
| 36 |
+
evaluation episode returns=[500.0, 500.0, 500.0]
|
| 37 |
+
|
| 38 |
+
Note that evaluation results (on the CartPole-v1 env) should be close to perfect
|
| 39 |
+
(episode return of ~500.0) as we are acting greedily inside the evaluation procedure.
|
| 40 |
+
"""
|
| 41 |
+
from typing import Dict
|
| 42 |
+
|
| 43 |
+
import numpy as np
|
| 44 |
+
from ray import train, tune
|
| 45 |
+
from ray.rllib.algorithms.ppo import PPOConfig
|
| 46 |
+
from ray.rllib.utils.framework import try_import_torch
|
| 47 |
+
from ray.rllib.utils.metrics import NUM_ENV_STEPS_SAMPLED_LIFETIME
|
| 48 |
+
|
| 49 |
+
torch, _ = try_import_torch()
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def my_experiment(config: Dict):
|
| 53 |
+
|
| 54 |
+
# Extract the number of iterations to run from the config.
|
| 55 |
+
train_iterations = config.pop("train-iterations", 2)
|
| 56 |
+
eval_episodes_to_do = config.pop("eval-episodes", 1)
|
| 57 |
+
|
| 58 |
+
config = (
|
| 59 |
+
PPOConfig()
|
| 60 |
+
.update_from_dict(config)
|
| 61 |
+
.api_stack(enable_rl_module_and_learner=True)
|
| 62 |
+
.environment("CartPole-v1")
|
| 63 |
+
)
|
| 64 |
+
|
| 65 |
+
# Train for n iterations with high LR.
|
| 66 |
+
config.training(lr=0.001)
|
| 67 |
+
algo_high_lr = config.build()
|
| 68 |
+
for _ in range(train_iterations):
|
| 69 |
+
train_results = algo_high_lr.train()
|
| 70 |
+
# Add the phase to the result dict.
|
| 71 |
+
train_results["phase"] = 1
|
| 72 |
+
train.report(train_results)
|
| 73 |
+
phase_high_lr_time = train_results[NUM_ENV_STEPS_SAMPLED_LIFETIME]
|
| 74 |
+
checkpoint_training_high_lr = algo_high_lr.save()
|
| 75 |
+
algo_high_lr.stop()
|
| 76 |
+
|
| 77 |
+
# Train for n iterations with low LR.
|
| 78 |
+
config.training(lr=0.00001)
|
| 79 |
+
algo_low_lr = config.build()
|
| 80 |
+
# Load state from the high-lr algo into this one.
|
| 81 |
+
algo_low_lr.restore(checkpoint_training_high_lr)
|
| 82 |
+
for _ in range(train_iterations):
|
| 83 |
+
train_results = algo_low_lr.train()
|
| 84 |
+
# Add the phase to the result dict.
|
| 85 |
+
train_results["phase"] = 2
|
| 86 |
+
# keep time moving forward
|
| 87 |
+
train_results[NUM_ENV_STEPS_SAMPLED_LIFETIME] += phase_high_lr_time
|
| 88 |
+
train.report(train_results)
|
| 89 |
+
|
| 90 |
+
checkpoint_training_low_lr = algo_low_lr.save()
|
| 91 |
+
algo_low_lr.stop()
|
| 92 |
+
|
| 93 |
+
# After training, run a manual evaluation procedure.
|
| 94 |
+
|
| 95 |
+
# Set the number of EnvRunners for collecting training data to 0 (local
|
| 96 |
+
# worker only).
|
| 97 |
+
config.env_runners(num_env_runners=0)
|
| 98 |
+
|
| 99 |
+
eval_algo = config.build()
|
| 100 |
+
# Load state from the low-lr algo into this one.
|
| 101 |
+
eval_algo.restore(checkpoint_training_low_lr)
|
| 102 |
+
# The algo's local worker (SingleAgentEnvRunner) that holds a
|
| 103 |
+
# gym.vector.Env object and an RLModule for computing actions.
|
| 104 |
+
local_env_runner = eval_algo.env_runner
|
| 105 |
+
# Extract the gymnasium env object from the created algo (its local
|
| 106 |
+
# SingleAgentEnvRunner worker). Note that the env in this single-agent
|
| 107 |
+
# case is a gymnasium vector env and that we get its first sub-env here.
|
| 108 |
+
env = local_env_runner.env.unwrapped.envs[0]
|
| 109 |
+
|
| 110 |
+
# The local worker (SingleAgentEnvRunner)
|
| 111 |
+
rl_module = local_env_runner.module
|
| 112 |
+
|
| 113 |
+
# Run a very simple env loop and add up rewards over a single episode.
|
| 114 |
+
obs, infos = env.reset()
|
| 115 |
+
episode_returns = []
|
| 116 |
+
episode_lengths = []
|
| 117 |
+
sum_rewards = length = 0
|
| 118 |
+
num_episodes = 0
|
| 119 |
+
while num_episodes < eval_episodes_to_do:
|
| 120 |
+
# Call the RLModule's `forward_inference()` method to compute an
|
| 121 |
+
# action.
|
| 122 |
+
rl_module_out = rl_module.forward_inference(
|
| 123 |
+
{
|
| 124 |
+
"obs": torch.from_numpy(np.expand_dims(obs, 0)), # <- add B=1
|
| 125 |
+
}
|
| 126 |
+
)
|
| 127 |
+
action_logits = rl_module_out["action_dist_inputs"][0] # <- remove B=1
|
| 128 |
+
action = np.argmax(action_logits.detach().cpu().numpy()) # act greedily
|
| 129 |
+
|
| 130 |
+
# Step the env.
|
| 131 |
+
obs, reward, terminated, truncated, info = env.step(action)
|
| 132 |
+
|
| 133 |
+
# Acculumate stats and reset the env, if necessary.
|
| 134 |
+
sum_rewards += reward
|
| 135 |
+
length += 1
|
| 136 |
+
if terminated or truncated:
|
| 137 |
+
num_episodes += 1
|
| 138 |
+
episode_returns.append(sum_rewards)
|
| 139 |
+
episode_lengths.append(length)
|
| 140 |
+
sum_rewards = length = 0
|
| 141 |
+
obs, infos = env.reset()
|
| 142 |
+
|
| 143 |
+
# Compile evaluation results.
|
| 144 |
+
eval_results = {
|
| 145 |
+
"eval_returns": episode_returns,
|
| 146 |
+
"eval_episode_lengths": episode_lengths,
|
| 147 |
+
}
|
| 148 |
+
# Combine the most recent training results with the just collected
|
| 149 |
+
# evaluation results.
|
| 150 |
+
results = {**train_results, **eval_results}
|
| 151 |
+
# Report everything.
|
| 152 |
+
train.report(results)
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
if __name__ == "__main__":
|
| 156 |
+
base_config = PPOConfig().environment("CartPole-v1").env_runners(num_env_runners=0)
|
| 157 |
+
# Convert to a plain dict for Tune. Note that this is usually not needed, you can
|
| 158 |
+
# pass into the below Tune Tuner any instantiated RLlib AlgorithmConfig object.
|
| 159 |
+
# However, for demonstration purposes, we show here how you can add other, arbitrary
|
| 160 |
+
# keys to the plain config dict and then pass these keys to your custom experiment
|
| 161 |
+
# function.
|
| 162 |
+
config_dict = base_config.to_dict()
|
| 163 |
+
|
| 164 |
+
# Set a Special flag signalling `my_experiment` how many training steps to
|
| 165 |
+
# perform on each: the high learning rate and low learning rate.
|
| 166 |
+
config_dict["train-iterations"] = 5
|
| 167 |
+
# Set a Special flag signalling `my_experiment` how many episodes to evaluate for.
|
| 168 |
+
config_dict["eval-episodes"] = 3
|
| 169 |
+
|
| 170 |
+
training_function = tune.with_resources(
|
| 171 |
+
my_experiment,
|
| 172 |
+
resources=base_config.algo_class.default_resource_request(base_config),
|
| 173 |
+
)
|
| 174 |
+
|
| 175 |
+
tuner = tune.Tuner(
|
| 176 |
+
training_function,
|
| 177 |
+
# Pass in your config dict.
|
| 178 |
+
param_space=config_dict,
|
| 179 |
+
)
|
| 180 |
+
results = tuner.fit()
|
| 181 |
+
best_results = results.get_best_result()
|
| 182 |
+
|
| 183 |
+
print(f"evaluation episode returns={best_results.metrics['eval_returns']}")
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/ray_tune/custom_logger.py
ADDED
|
@@ -0,0 +1,137 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Example showing how to define a custom Logger class for an RLlib Algorithm.
|
| 2 |
+
|
| 3 |
+
The script uses the AlgorithmConfig's `debugging` API to setup the custom Logger:
|
| 4 |
+
|
| 5 |
+
```
|
| 6 |
+
config.debugging(logger_config={
|
| 7 |
+
"type": [some Logger subclass],
|
| 8 |
+
"ctor_arg1", ...,
|
| 9 |
+
"ctor_arg2", ...,
|
| 10 |
+
})
|
| 11 |
+
```
|
| 12 |
+
|
| 13 |
+
All keys other than "type" in the logger_config dict will be passed into the Logger
|
| 14 |
+
class's constructor.
|
| 15 |
+
By default (logger_config=None), RLlib will construct a Ray Tune UnifiedLogger object,
|
| 16 |
+
which logs results to JSON, CSV, and TBX.
|
| 17 |
+
|
| 18 |
+
NOTE that a custom Logger is different from a custom `ProgressReporter`, which defines,
|
| 19 |
+
how the (frequent) outputs to your console will be formatted. To see an example on how
|
| 20 |
+
to write your own Progress reporter, see:
|
| 21 |
+
https://github.com/ray-project/ray/tree/master/rllib/examples/ray_tune/custom_progress_reporter.py # noqa
|
| 22 |
+
|
| 23 |
+
Below examples include:
|
| 24 |
+
- Disable logging entirely.
|
| 25 |
+
- Using only one of tune's Json, CSV, or TBX loggers.
|
| 26 |
+
- Defining a custom logger (by sub-classing tune.logger.py::Logger).
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
How to run this script
|
| 30 |
+
----------------------
|
| 31 |
+
`python [script file name].py`
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
Results to expect
|
| 35 |
+
-----------------
|
| 36 |
+
You should see log lines similar to the following in your console output. Note that
|
| 37 |
+
these logged lines will mix with the ones produced by Tune's default ProgressReporter.
|
| 38 |
+
See above link on how to setup a custom one.
|
| 39 |
+
|
| 40 |
+
ABC Avg-return: 20.609375; pi-loss: -0.02921550187703246
|
| 41 |
+
ABC Avg-return: 32.28688524590164; pi-loss: -0.023369029412534572
|
| 42 |
+
ABC Avg-return: 51.92; pi-loss: -0.017113141975661456
|
| 43 |
+
ABC Avg-return: 76.16; pi-loss: -0.01305474770361625
|
| 44 |
+
ABC Avg-return: 100.54; pi-loss: -0.007665307738129169
|
| 45 |
+
ABC Avg-return: 132.33; pi-loss: -0.005010405003325517
|
| 46 |
+
ABC Avg-return: 169.65; pi-loss: -0.008397869592997183
|
| 47 |
+
ABC Avg-return: 203.17; pi-loss: -0.005611495616764371
|
| 48 |
+
Flushing
|
| 49 |
+
Closing
|
| 50 |
+
|
| 51 |
+
"""
|
| 52 |
+
|
| 53 |
+
from ray import air, tune
|
| 54 |
+
from ray.rllib.algorithms.ppo import PPOConfig
|
| 55 |
+
from ray.rllib.core import DEFAULT_MODULE_ID
|
| 56 |
+
from ray.rllib.utils.metrics import (
|
| 57 |
+
ENV_RUNNER_RESULTS,
|
| 58 |
+
EPISODE_RETURN_MEAN,
|
| 59 |
+
LEARNER_RESULTS,
|
| 60 |
+
)
|
| 61 |
+
from ray.tune.logger import Logger, LegacyLoggerCallback
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
class MyPrintLogger(Logger):
|
| 65 |
+
"""Logs results by simply printing out everything."""
|
| 66 |
+
|
| 67 |
+
def _init(self):
|
| 68 |
+
# Custom init function.
|
| 69 |
+
print("Initializing ...")
|
| 70 |
+
# Setting up our log-line prefix.
|
| 71 |
+
self.prefix = self.config.get("logger_config").get("prefix")
|
| 72 |
+
|
| 73 |
+
def on_result(self, result: dict):
|
| 74 |
+
# Define, what should happen on receiving a `result` (dict).
|
| 75 |
+
mean_return = result[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN]
|
| 76 |
+
pi_loss = result[LEARNER_RESULTS][DEFAULT_MODULE_ID]["policy_loss"]
|
| 77 |
+
print(f"{self.prefix} " f"Avg-return: {mean_return} " f"pi-loss: {pi_loss}")
|
| 78 |
+
|
| 79 |
+
def close(self):
|
| 80 |
+
# Releases all resources used by this logger.
|
| 81 |
+
print("Closing")
|
| 82 |
+
|
| 83 |
+
def flush(self):
|
| 84 |
+
# Flushing all possible disk writes to permanent storage.
|
| 85 |
+
print("Flushing", flush=True)
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
if __name__ == "__main__":
|
| 89 |
+
config = (
|
| 90 |
+
PPOConfig().environment("CartPole-v1")
|
| 91 |
+
# Setting up a custom logger config.
|
| 92 |
+
# ----------------------------------
|
| 93 |
+
# The following are different examples of custom logging setups:
|
| 94 |
+
# 1) Disable logging entirely.
|
| 95 |
+
# "logger_config": {
|
| 96 |
+
# # Use the tune.logger.NoopLogger class for no logging.
|
| 97 |
+
# "type": "ray.tune.logger.NoopLogger",
|
| 98 |
+
# },
|
| 99 |
+
# 2) Use tune's JsonLogger only.
|
| 100 |
+
# Alternatively, use `CSVLogger` or `TBXLogger` instead of
|
| 101 |
+
# `JsonLogger` in the "type" key below.
|
| 102 |
+
# "logger_config": {
|
| 103 |
+
# "type": "ray.tune.logger.JsonLogger",
|
| 104 |
+
# # Optional: Custom logdir (do not define this here
|
| 105 |
+
# # for using ~/ray_results/...).
|
| 106 |
+
# "logdir": "/tmp",
|
| 107 |
+
# },
|
| 108 |
+
# 3) Custom logger (see `MyPrintLogger` class above).
|
| 109 |
+
.debugging(
|
| 110 |
+
logger_config={
|
| 111 |
+
# Provide the class directly or via fully qualified class
|
| 112 |
+
# path.
|
| 113 |
+
"type": MyPrintLogger,
|
| 114 |
+
# `config` keys:
|
| 115 |
+
"prefix": "ABC",
|
| 116 |
+
# Optional: Custom logdir (do not define this here
|
| 117 |
+
# for using ~/ray_results/...).
|
| 118 |
+
# "logdir": "/somewhere/on/my/file/system/"
|
| 119 |
+
}
|
| 120 |
+
)
|
| 121 |
+
)
|
| 122 |
+
|
| 123 |
+
stop = {f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": 200.0}
|
| 124 |
+
|
| 125 |
+
# Run the actual experiment (using Tune).
|
| 126 |
+
results = tune.Tuner(
|
| 127 |
+
config.algo_class,
|
| 128 |
+
param_space=config,
|
| 129 |
+
run_config=air.RunConfig(
|
| 130 |
+
stop=stop,
|
| 131 |
+
verbose=2,
|
| 132 |
+
# Plugin our own logger.
|
| 133 |
+
callbacks=[
|
| 134 |
+
LegacyLoggerCallback([MyPrintLogger]),
|
| 135 |
+
],
|
| 136 |
+
),
|
| 137 |
+
).fit()
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/ray_tune/custom_progress_reporter.py
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Example showing how to set up a custom progress reporter for an RLlib Algorithm.
|
| 2 |
+
|
| 3 |
+
The script sets the `progress_reporter` arg in the air.RunConfig and passes that to
|
| 4 |
+
Tune's Tuner:
|
| 5 |
+
|
| 6 |
+
```
|
| 7 |
+
tune.Tuner(
|
| 8 |
+
param_space=..., # <- your RLlib config
|
| 9 |
+
run_config=air.RunConfig(
|
| 10 |
+
progress_reporter=[some already instantiated TuneReporterBase object],
|
| 11 |
+
),
|
| 12 |
+
)
|
| 13 |
+
```
|
| 14 |
+
|
| 15 |
+
By default (progress_reporter=None), Tune will construct a default `CLIReporter` object,
|
| 16 |
+
which reports the episode mean return, number of env steps sampled and -trained, and
|
| 17 |
+
the total number of episodes run thus far.
|
| 18 |
+
|
| 19 |
+
NOTE that a custom progress reporter is different from a custom `Logger`, which defines,
|
| 20 |
+
how the (frequent) results are being formatted and written to e.g. a logfile.
|
| 21 |
+
To see an example on how to write your own Logger, see:
|
| 22 |
+
https://github.com/ray-project/ray/tree/master/rllib/examples/ray_tune/custom_logger.py
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
How to run this script
|
| 26 |
+
----------------------
|
| 27 |
+
`python [script file name].py
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
Results to expect
|
| 31 |
+
-----------------
|
| 32 |
+
You should see something similar to the following in your console output:
|
| 33 |
+
|
| 34 |
+
+---------------------+------------+-----------------+--------+------------------+
|
| 35 |
+
| Trial name | status | loc | iter | total time (s) |
|
| 36 |
+
|---------------------+------------+-----------------+--------+------------------+
|
| 37 |
+
| PPO_env_bb503_00000 | TERMINATED | 127.0.0.1:26303 | 5 | 30.3823 |
|
| 38 |
+
+---------------------+------------+-----------------+--------+------------------+
|
| 39 |
+
+-------+-------------------+------------------+------------------+------------------+
|
| 40 |
+
| ts | combined return | return policy1 | return policy2 | return policy3 |
|
| 41 |
+
|-------+-------------------+------------------+------------------+------------------|
|
| 42 |
+
| 20000 | 258.7 | 103.4 | 88.84 | 87.86 |
|
| 43 |
+
+-------+-------------------+------------------+------------------+------------------+
|
| 44 |
+
|
| 45 |
+
"""
|
| 46 |
+
from ray import air, tune
|
| 47 |
+
from ray.air.constants import TRAINING_ITERATION
|
| 48 |
+
from ray.rllib.algorithms.ppo import PPOConfig
|
| 49 |
+
from ray.rllib.examples.envs.classes.multi_agent import MultiAgentCartPole
|
| 50 |
+
from ray.rllib.utils.metrics import (
|
| 51 |
+
ENV_RUNNER_RESULTS,
|
| 52 |
+
EPISODE_RETURN_MEAN,
|
| 53 |
+
NUM_ENV_STEPS_SAMPLED_LIFETIME,
|
| 54 |
+
)
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
my_multi_agent_progress_reporter = tune.CLIReporter(
|
| 58 |
+
# In the following dict, the keys are the (possibly nested) keys that can be found
|
| 59 |
+
# in RLlib's (PPO's) result dict, produced at every training iteration, and the
|
| 60 |
+
# values are the column names you would like to see in your console reports.
|
| 61 |
+
# Note that for nested result dict keys, you need to use slashes "/" to define the
|
| 62 |
+
# exact path.
|
| 63 |
+
metric_columns={
|
| 64 |
+
**{
|
| 65 |
+
TRAINING_ITERATION: "iter",
|
| 66 |
+
"time_total_s": "total time (s)",
|
| 67 |
+
NUM_ENV_STEPS_SAMPLED_LIFETIME: "ts",
|
| 68 |
+
# RLlib always sums up all agents' rewards and reports it under:
|
| 69 |
+
# result_dict[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN].
|
| 70 |
+
f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": "combined return",
|
| 71 |
+
},
|
| 72 |
+
# Because RLlib sums up all returns of all agents, we would like to also
|
| 73 |
+
# see the individual agents' returns. We can find these under the result dict's
|
| 74 |
+
# 'env_runners/module_episode_returns_mean/' key (then the policy ID):
|
| 75 |
+
**{
|
| 76 |
+
f"{ENV_RUNNER_RESULTS}/module_episode_returns_mean/{pid}": f"return {pid}"
|
| 77 |
+
for pid in ["policy1", "policy2", "policy3"]
|
| 78 |
+
},
|
| 79 |
+
},
|
| 80 |
+
)
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
if __name__ == "__main__":
|
| 84 |
+
# Force Tuner to use old progress output as the new one silently ignores our custom
|
| 85 |
+
# `CLIReporter`.
|
| 86 |
+
# TODO (sven): Find out why we require this hack.
|
| 87 |
+
import os
|
| 88 |
+
|
| 89 |
+
os.environ["RAY_AIR_NEW_OUTPUT"] = "0"
|
| 90 |
+
|
| 91 |
+
# Register our multi-agent env with a fixed number of agents.
|
| 92 |
+
# The agents' IDs are 0, 1, and 2.
|
| 93 |
+
tune.register_env("env", lambda _: MultiAgentCartPole({"num_agents": 3}))
|
| 94 |
+
|
| 95 |
+
config = (
|
| 96 |
+
PPOConfig()
|
| 97 |
+
.environment("env")
|
| 98 |
+
.multi_agent(
|
| 99 |
+
# Define 3 policies. Note that in our simple setup, they are all configured
|
| 100 |
+
# the exact same way (with a PPO default RLModule/NN).
|
| 101 |
+
policies={"policy1", "policy2", "policy3"},
|
| 102 |
+
# Map agent 0 to "policy1", etc..
|
| 103 |
+
policy_mapping_fn=lambda agent_id, episode: f"policy{agent_id + 1}",
|
| 104 |
+
)
|
| 105 |
+
)
|
| 106 |
+
|
| 107 |
+
stop = {f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": 200.0}
|
| 108 |
+
|
| 109 |
+
# Run the actual experiment (using Tune).
|
| 110 |
+
results = tune.Tuner(
|
| 111 |
+
config.algo_class,
|
| 112 |
+
param_space=config,
|
| 113 |
+
run_config=air.RunConfig(
|
| 114 |
+
stop=stop,
|
| 115 |
+
verbose=2,
|
| 116 |
+
# Plugin our own progress reporter.
|
| 117 |
+
progress_reporter=my_multi_agent_progress_reporter,
|
| 118 |
+
),
|
| 119 |
+
).fit()
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/__init__.py
ADDED
|
File without changes
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (202 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/__pycache__/action_masking_rl_module.cpython-311.pyc
ADDED
|
Binary file (5.32 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/__pycache__/custom_cnn_rl_module.cpython-311.pyc
ADDED
|
Binary file (5.22 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/__pycache__/custom_lstm_rl_module.cpython-311.pyc
ADDED
|
Binary file (4.49 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/__pycache__/migrate_modelv2_to_new_api_stack_by_config.cpython-311.pyc
ADDED
|
Binary file (2.52 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/__pycache__/migrate_modelv2_to_new_api_stack_by_policy_checkpoint.cpython-311.pyc
ADDED
|
Binary file (4.44 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/__pycache__/pretraining_single_agent_training_multi_agent.cpython-311.pyc
ADDED
|
Binary file (7.87 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/action_masking_rl_module.py
ADDED
|
@@ -0,0 +1,127 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""An example script showing how to define and load an `RLModule` that applies
|
| 2 |
+
action masking
|
| 3 |
+
|
| 4 |
+
This example:
|
| 5 |
+
- Defines an `RLModule` that applies action masking.
|
| 6 |
+
- It does so by using a `gymnasium.spaces.dict.Dict` observation space
|
| 7 |
+
with two keys, namely `"observations"`, holding the original observations
|
| 8 |
+
and `"action_mask"` defining the action mask for the current environment
|
| 9 |
+
state. Note, by this definition you can wrap any `gymnasium` environment
|
| 10 |
+
and use it for this module.
|
| 11 |
+
- Furthermore, it derives its `TorchRLModule` from the `PPOTorchRLModule` and
|
| 12 |
+
can therefore be easily plugged into our `PPO` algorithm.
|
| 13 |
+
- It overrides the `forward` methods of the `PPOTorchRLModule` to apply the
|
| 14 |
+
action masking and it overrides the `_compute_values` method for GAE
|
| 15 |
+
computation to extract the `"observations"` from the batch `Columns.OBS`
|
| 16 |
+
key.
|
| 17 |
+
- It uses the custom `ActionMaskEnv` that defines for each step a new action
|
| 18 |
+
mask that defines actions that are allowed (1.0) and others that are not
|
| 19 |
+
(0.0).
|
| 20 |
+
- It runs 10 iterations with PPO and finishes.
|
| 21 |
+
|
| 22 |
+
How to run this script
|
| 23 |
+
----------------------
|
| 24 |
+
`python [script file name].py --enable-new-api-stack --num-env-runners 2`
|
| 25 |
+
|
| 26 |
+
Control the number of `EnvRunner`s with the `--num-env-runners` flag. This
|
| 27 |
+
will increase the sampling speed.
|
| 28 |
+
|
| 29 |
+
For debugging, use the following additional command line options
|
| 30 |
+
`--no-tune --num-env-runners=0`
|
| 31 |
+
which should allow you to set breakpoints anywhere in the RLlib code and
|
| 32 |
+
have the execution stop there for inspection and debugging.
|
| 33 |
+
|
| 34 |
+
For logging to your WandB account, use:
|
| 35 |
+
`--wandb-key=[your WandB API key] --wandb-project=[some project name]
|
| 36 |
+
--wandb-run-name=[optional: WandB run name (within the defined project)]`
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
Results to expect
|
| 40 |
+
-----------------
|
| 41 |
+
You should expect a mean episode reward of around 0.35. The environment is a random
|
| 42 |
+
environment paying out random rewards - so the agent cannot learn, but it can obey the
|
| 43 |
+
action mask and should do so (no `AssertionError` should happen).
|
| 44 |
+
After 40,000 environment steps and 10 training iterations the run should stop
|
| 45 |
+
successfully:
|
| 46 |
+
|
| 47 |
+
+-------------------------------+------------+----------------------+--------+
|
| 48 |
+
| Trial name | status | loc | iter |
|
| 49 |
+
| | | | |
|
| 50 |
+
|-------------------------------+------------+----------------------+--------+
|
| 51 |
+
| PPO_ActionMaskEnv_dedc8_00000 | TERMINATED | 192.168.1.178:103298 | 10 |
|
| 52 |
+
+-------------------------------+------------+----------------------+--------+
|
| 53 |
+
+------------------+------------------------+------------------------+
|
| 54 |
+
| total time (s) | num_env_steps_sample | num_env_steps_traine |
|
| 55 |
+
| | d_lifetime | d_lifetime |
|
| 56 |
+
+------------------+------------------------+------------------------+
|
| 57 |
+
| 57.9207 | 40000 | 40000 |
|
| 58 |
+
+------------------+------------------------+------------------------+
|
| 59 |
+
*------------------------+
|
| 60 |
+
| num_episodes_lifetim |
|
| 61 |
+
| e |
|
| 62 |
+
+------------------------|
|
| 63 |
+
| 3898 |
|
| 64 |
+
+------------------------+
|
| 65 |
+
"""
|
| 66 |
+
from gymnasium.spaces import Box, Discrete
|
| 67 |
+
|
| 68 |
+
from ray.rllib.algorithms.ppo import PPOConfig
|
| 69 |
+
from ray.rllib.core.rl_module.rl_module import RLModuleSpec
|
| 70 |
+
from ray.rllib.examples.envs.classes.action_mask_env import ActionMaskEnv
|
| 71 |
+
from ray.rllib.examples.rl_modules.classes.action_masking_rlm import (
|
| 72 |
+
ActionMaskingTorchRLModule,
|
| 73 |
+
)
|
| 74 |
+
|
| 75 |
+
from ray.rllib.utils.test_utils import (
|
| 76 |
+
add_rllib_example_script_args,
|
| 77 |
+
run_rllib_example_script_experiment,
|
| 78 |
+
)
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
parser = add_rllib_example_script_args(
|
| 82 |
+
default_iters=10,
|
| 83 |
+
default_timesteps=100000,
|
| 84 |
+
default_reward=150.0,
|
| 85 |
+
)
|
| 86 |
+
|
| 87 |
+
if __name__ == "__main__":
|
| 88 |
+
args = parser.parse_args()
|
| 89 |
+
|
| 90 |
+
if args.algo != "PPO":
|
| 91 |
+
raise ValueError("This example only supports PPO. Please use --algo=PPO.")
|
| 92 |
+
|
| 93 |
+
base_config = (
|
| 94 |
+
PPOConfig()
|
| 95 |
+
.environment(
|
| 96 |
+
env=ActionMaskEnv,
|
| 97 |
+
env_config={
|
| 98 |
+
"action_space": Discrete(100),
|
| 99 |
+
# This defines the 'original' observation space that is used in the
|
| 100 |
+
# `RLModule`. The environment will wrap this space into a
|
| 101 |
+
# `gym.spaces.Dict` together with an 'action_mask' that signals the
|
| 102 |
+
# `RLModule` to adapt the action distribution inputs for the underlying
|
| 103 |
+
# `DefaultPPORLModule`.
|
| 104 |
+
"observation_space": Box(-1.0, 1.0, (5,)),
|
| 105 |
+
},
|
| 106 |
+
)
|
| 107 |
+
.rl_module(
|
| 108 |
+
# We need to explicitly specify here RLModule to use and
|
| 109 |
+
# the catalog needed to build it.
|
| 110 |
+
rl_module_spec=RLModuleSpec(
|
| 111 |
+
module_class=ActionMaskingTorchRLModule,
|
| 112 |
+
model_config={
|
| 113 |
+
"head_fcnet_hiddens": [64, 64],
|
| 114 |
+
"head_fcnet_activation": "relu",
|
| 115 |
+
},
|
| 116 |
+
),
|
| 117 |
+
)
|
| 118 |
+
.evaluation(
|
| 119 |
+
evaluation_num_env_runners=1,
|
| 120 |
+
evaluation_interval=1,
|
| 121 |
+
# Run evaluation parallel to training to speed up the example.
|
| 122 |
+
evaluation_parallel_to_training=True,
|
| 123 |
+
)
|
| 124 |
+
)
|
| 125 |
+
|
| 126 |
+
# Run the example (with Tune).
|
| 127 |
+
run_rllib_example_script_experiment(base_config, args)
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__init__.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from ray.rllib.examples.rl_modules.classes.rock_paper_scissors_heuristic_rlm import (
|
| 2 |
+
AlwaysSameHeuristicRLM,
|
| 3 |
+
BeatLastHeuristicRLM,
|
| 4 |
+
)
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
__all__ = [
|
| 8 |
+
"AlwaysSameHeuristicRLM",
|
| 9 |
+
"BeatLastHeuristicRLM",
|
| 10 |
+
]
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (435 Bytes). View file
|
|
|