diff --git a/.gitattributes b/.gitattributes index ec645e70edaa1c0663dd5b1921f3bf52b115fb6d..db23df24a51b016a716ec6e56057a782e5fe1690 100644 --- a/.gitattributes +++ b/.gitattributes @@ -177,3 +177,4 @@ tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/_ .venv/lib/python3.11/site-packages/ray/_private/__pycache__/test_utils.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text .venv/lib/python3.11/site-packages/ray/_private/thirdparty/pynvml/__pycache__/pynvml.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text .venv/lib/python3.11/site-packages/ray/rllib/algorithms/__pycache__/algorithm.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text +.venv/lib/python3.11/site-packages/ray/rllib/algorithms/__pycache__/algorithm_config.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/__pycache__/algorithm_config.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/__pycache__/algorithm_config.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e74256f183ede0d05050c27d51e7043e30735fa0 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/__pycache__/algorithm_config.cpython-311.pyc @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9c834df6257a1158af124427e411086f3fcc8eb2ea4c080f29143c4a418c67c +size 250369 diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/algorithms/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/algorithms/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/algorithms/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/algorithms/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..861b915146949dbe03a74f7abf04d8cd0a809b9f Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/algorithms/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/algorithms/__pycache__/vpg_custom_algorithm.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/algorithms/__pycache__/vpg_custom_algorithm.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ba41abdb5149b5306e94dd30eb87f92bc78326c0 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/algorithms/__pycache__/vpg_custom_algorithm.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/algorithms/classes/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/algorithms/classes/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/algorithms/classes/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/algorithms/classes/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d81106dd0142bf183a063e9e536248cb6f79d8dc Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/algorithms/classes/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/algorithms/classes/__pycache__/vpg.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/algorithms/classes/__pycache__/vpg.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1e0b09b41b4b9a05b1a5f1f2d824e3a6eb641769 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/algorithms/classes/__pycache__/vpg.cpython-311.pyc differ diff --git 
import tree  # pip install dm_tree

from ray.rllib.algorithms import Algorithm
from ray.rllib.algorithms.algorithm_config import AlgorithmConfig, NotProvided
from ray.rllib.core.rl_module.rl_module import RLModuleSpec
from ray.rllib.utils.annotations import override
from ray.rllib.utils.metrics import (
    ENV_RUNNER_RESULTS,
    ENV_RUNNER_SAMPLING_TIMER,
    LEARNER_RESULTS,
    LEARNER_UPDATE_TIMER,
    NUM_ENV_STEPS_SAMPLED_LIFETIME,
    SYNCH_WORKER_WEIGHTS_TIMER,
    TIMERS,
)


class VPGConfig(AlgorithmConfig):
    """A simple VPG (vanilla policy gradient) algorithm w/o value function support.

    Use for testing purposes only!

    This Algorithm should use the VPGTorchLearner and VPGTorchRLModule.
    """

    # A test setting to activate metrics on mean weights.
    report_mean_weights: bool = True

    def __init__(self, algo_class=None):
        """Initializes a VPGConfig instance.

        Args:
            algo_class: The Algorithm class to build from this config. Falls back
                to `VPG` if not provided.
        """
        super().__init__(algo_class=algo_class or VPG)

        # VPG specific settings.
        self.num_episodes_per_train_batch = 10
        # Note that we don't have to set this here, because we tell the EnvRunners
        # explicitly to sample entire episodes. However, for good measure, we change
        # this setting here either way.
        self.batch_mode = "complete_episodes"

        # VPG specific defaults (from AlgorithmConfig).
        self.num_env_runners = 1

    @override(AlgorithmConfig)
    def training(
        self, *, num_episodes_per_train_batch=NotProvided, **kwargs
    ) -> "VPGConfig":
        """Sets the training related configuration.

        Args:
            num_episodes_per_train_batch: The number of complete episodes per train
                batch. VPG requires entire episodes to be sampled from the
                EnvRunners. For environments with varying episode lengths, this
                leads to varying batch sizes (in timesteps) as well, possibly
                causing slight learning instabilities. However, for simplicity
                reasons, we stick to collecting n episodes per training update.
                Note that the episodes are split evenly across the EnvRunners (see
                `VPG._sample_episodes`), so the actual count per update is a
                multiple of the number of EnvRunners.
            **kwargs: All other settings, forwarded as-is to
                `AlgorithmConfig.training()`.

        Returns:
            This updated AlgorithmConfig object.
        """
        # Pass kwargs onto super's `training()` method.
        super().training(**kwargs)

        if num_episodes_per_train_batch is not NotProvided:
            self.num_episodes_per_train_batch = num_episodes_per_train_batch

        return self

    @override(AlgorithmConfig)
    def get_default_rl_module_spec(self):
        """Returns the default RLModuleSpec (torch only).

        Raises:
            ValueError: If `self.framework_str` is anything other than "torch".
        """
        if self.framework_str == "torch":
            # Imported lazily, only when the torch default is actually requested.
            from ray.rllib.examples.rl_modules.classes.vpg_torch_rlm import (
                VPGTorchRLModule,
            )

            spec = RLModuleSpec(
                module_class=VPGTorchRLModule,
                model_config={"hidden_dim": 64},
            )
        else:
            raise ValueError(f"Unsupported framework: {self.framework_str}")

        return spec

    @override(AlgorithmConfig)
    def get_default_learner_class(self):
        """Returns the default Learner class (torch only).

        Raises:
            ValueError: If `self.framework_str` is anything other than "torch".
        """
        if self.framework_str == "torch":
            # Imported lazily, only when the torch default is actually requested.
            from ray.rllib.examples.learners.classes.vpg_torch_learner import (
                VPGTorchLearner,
            )

            return VPGTorchLearner
        else:
            raise ValueError(f"Unsupported framework: {self.framework_str}")


class VPG(Algorithm):
    """Algorithm class running the sample -> update -> sync loop for VPG."""

    @classmethod
    @override(Algorithm)
    def get_default_config(cls) -> AlgorithmConfig:
        """Returns a fresh `VPGConfig` with the VPG defaults."""
        return VPGConfig()

    @override(Algorithm)
    def training_step(self) -> None:
        """Override of the training_step method of `Algorithm`.

        Runs the following steps per call:
        - Sample complete episodes from the EnvRunners (see `_sample_episodes`).
          Due to the loss computation, VPG always trains on entire episodes, which
          means the train batch size (in timesteps) varies from call to call.
        - Send the collected episodes to the VPG LearnerGroup for model updating.
        - Sync the weights from LearnerGroup to all EnvRunners.
        """
        # Sample.
        with self.metrics.log_time((TIMERS, ENV_RUNNER_SAMPLING_TIMER)):
            episodes, env_runner_results = self._sample_episodes()
            # Merge results from n parallel sample calls into self's metrics logger.
            self.metrics.merge_and_log_n_dicts(
                env_runner_results, key=ENV_RUNNER_RESULTS
            )

        # Just for demonstration purposes, log the number of time steps sampled in
        # this `training_step` round.
        num_timesteps_sampled = sum(map(len, episodes))
        # Mean over a window of 100:
        self.metrics.log_value(
            "episode_timesteps_sampled_mean_win100",
            num_timesteps_sampled,
            reduce="mean",
            window=100,
        )
        # Exponential Moving Average (EMA) with coeff=0.1:
        self.metrics.log_value(
            "episode_timesteps_sampled_ema",
            num_timesteps_sampled,
            ema_coeff=0.1,  # <- weight of new value; weight of old avg=1.0-ema_coeff
        )

        # Update model.
        with self.metrics.log_time((TIMERS, LEARNER_UPDATE_TIMER)):
            learner_results = self.learner_group.update_from_episodes(
                episodes=episodes,
                timesteps={
                    NUM_ENV_STEPS_SAMPLED_LIFETIME: (
                        self.metrics.peek(
                            (ENV_RUNNER_RESULTS, NUM_ENV_STEPS_SAMPLED_LIFETIME)
                        )
                    ),
                },
            )
            # Merge results from m parallel update calls into self's metrics logger.
            self.metrics.merge_and_log_n_dicts(learner_results, key=LEARNER_RESULTS)

        # Sync weights.
        with self.metrics.log_time((TIMERS, SYNCH_WORKER_WEIGHTS_TIMER)):
            self.env_runner_group.sync_weights(
                from_worker_or_learner_group=self.learner_group,
                inference_only=True,
            )

    def _sample_episodes(self):
        """Samples complete episodes from all EnvRunners.

        The configured `num_episodes_per_train_batch` is split evenly (floor
        division) across the EnvRunners, but every EnvRunner is asked for at least
        one episode.

        Returns:
            A tuple of (flat list of all sampled episodes, list of metrics dicts
            with one entry per EnvRunner).
        """
        # How many episodes to sample from each EnvRunner?
        # The floor division yields 0 if there are more EnvRunners than episodes
        # requested per train batch, which would leave this training step without
        # any data. Thus, always request at least one episode per EnvRunner.
        num_episodes_per_env_runner = max(
            1,
            self.config.num_episodes_per_train_batch
            // (self.config.num_env_runners or 1),
        )
        # Send parallel remote requests to sample and get the metrics.
        sampled_data = self.env_runner_group.foreach_env_runner(
            # Return tuple of [episodes], [metrics] from each EnvRunner.
            lambda env_runner: (
                env_runner.sample(num_episodes=num_episodes_per_env_runner),
                env_runner.get_metrics(),
            ),
            # Loop over remote EnvRunners' `sample()` method in parallel or use the
            # local EnvRunner if there aren't any remote ones.
            local_env_runner=self.env_runner_group.num_remote_workers() <= 0,
        )
        # Return one list of episodes and a list of metrics dicts (one per EnvRunner).
        episodes = tree.flatten([s[0] for s in sampled_data])
        stats_dicts = [s[1] for s in sampled_data]

        return episodes, stats_dicts
"""Example of how to write a custom Algorithm.

This is an end-to-end example for how to implement a custom Algorithm, including
a matching AlgorithmConfig class and Learner class. There is no particular RLModule API
needed for this algorithm, which means that any TorchRLModule returning actions
or action distribution parameters suffices.

The RL algorithm implemented here is "vanilla policy gradient" (VPG) in its simplest
form, without a value function baseline.

See the actual VPG algorithm class here:
https://github.com/ray-project/ray/blob/master/rllib/examples/algorithms/classes/vpg.py

The Learner class the algorithm uses by default (if the user doesn't specify a custom
Learner):
https://github.com/ray-project/ray/blob/master/rllib/examples/learners/classes/vpg_torch_learner.py  # noqa

And the RLModule class the algorithm uses by default (if the user doesn't specify a
custom RLModule):
https://github.com/ray-project/ray/blob/master/rllib/examples/rl_modules/classes/vpg_torch_rlm.py  # noqa

This example shows:
    - how to subclass the AlgorithmConfig base class to implement a custom
    algorithm's config class.
    - how to subclass the Algorithm base class to implement a custom Algorithm,
    including its `training_step` method.
    - how to subclass the TorchLearner base class to implement a custom Learner with
    loss function, overriding `compute_loss_for_module` and
    `after_gradient_based_update`.
    - how to define a default RLModule used by the algorithm in case the user
    doesn't bring their own custom RLModule. The VPG algorithm doesn't require any
    specific RLModule APIs, so any RLModule returning actions or action distribution
    inputs suffices.

We compute a plain policy gradient loss without value function baseline.
The experiment shows that even with such a simple setup, our custom algorithm is still
able to successfully learn CartPole-v1.


How to run this script
----------------------
`python [script file name].py --enable-new-api-stack`

For debugging, use the following additional command line options
`--no-tune --num-env-runners=0`
which should allow you to set breakpoints anywhere in the RLlib code and
have the execution stop there for inspection and debugging.

For logging to your WandB account, use:
`--wandb-key=[your WandB API key] --wandb-project=[some project name]
--wandb-run-name=[optional: WandB run name (within the defined project)]`


Results to expect
-----------------
With some fine-tuning of the learning rate, the batch size, and maybe the
number of env runners and number of envs per env runner, you should see decent
learning behavior on the CartPole-v1 environment:

+-----------------------------+------------+--------+------------------+
| Trial name                  | status     |   iter |   total time (s) |
|                             |            |        |                  |
|-----------------------------+------------+--------+------------------+
| VPG_CartPole-v1_2973e_00000 | TERMINATED |    451 |          59.5184 |
+-----------------------------+------------+--------+------------------+
+-----------------------+------------------------+------------------------+
|   episode_return_mean |   num_env_steps_sample |   ...env_steps_sampled |
|                       |             d_lifetime |   _lifetime_throughput |
|-----------------------+------------------------+------------------------|
|                250.52 |                 415787 |                7428.98 |
+-----------------------+------------------------+------------------------+
"""

from ray.rllib.examples.algorithms.classes.vpg import VPGConfig
from ray.rllib.utils.test_utils import (
    add_rllib_example_script_args,
    run_rllib_example_script_experiment,
)


# Command line parser with the defaults (stop criteria) of this example.
parser = add_rllib_example_script_args(
    default_timesteps=750000,
    default_iters=1000,
    default_reward=250.0,
)
parser.set_defaults(enable_new_api_stack=True)


if __name__ == "__main__":
    args = parser.parse_args()

    # Build the config step by step; each setter returns the config object.
    base_config = VPGConfig()
    base_config = base_config.environment("CartPole-v1")

    # Note that you don't have to set any specific Learner class
    # (`learner_class=VPGTorchLearner`), because our custom Algorithm already
    # defines the default Learner class to use through its
    # `get_default_learner_class` method, which returns `VPGTorchLearner`.
    base_config = base_config.training(
        # The only VPG-specific setting. How many episodes per train batch?
        num_episodes_per_train_batch=10,
        # Set other config parameters.
        lr=0.0005,
    )

    # Increase the number of EnvRunners (default is 1 for VPG)
    # or the number of envs per EnvRunner.
    base_config = base_config.env_runners(
        num_env_runners=2,
        num_envs_per_env_runner=1,
    )

    # To plug in your own RLModule class, add a call like this one. VPG doesn't
    # require any specific RLModule APIs, so any RLModule returning `actions` or
    # `action_dist_inputs` from the forward methods works ok:
    # base_config = base_config.rl_module(
    #     rl_module_spec=RLModuleSpec(module_class=...),
    # )

    run_rllib_example_script_experiment(base_config, args)
#!/usr/bin/env python

# @OldAPIStack

import numpy as np
import os
import ray

from ray.rllib.policy.policy import Policy
from ray.rllib.utils.framework import try_import_tf
from ray.tune.registry import get_trainable_cls

tf1, tf, tfv = try_import_tf()

ray.init()


def train_and_export_policy_and_model(algo_name, num_steps, model_dir, ckpt_dir):
    """Trains an algorithm on CartPole-v1 and exports its policy.

    Args:
        algo_name: Registered name of the algorithm (trainable) to train.
        num_steps: How many times to call `train()` before exporting.
        model_dir: Target directory for the exported policy model.
        ckpt_dir: Target directory for the exported policy checkpoint.
    """
    algo_config = get_trainable_cls(algo_name).get_default_config()
    # Run on the old API stack.
    algo_config.api_stack(
        enable_rl_module_and_learner=False,
        enable_env_runner_and_connector_v2=False,
    )
    # This example is tf-only.
    algo_config.framework("tf")
    # Also export the native (DL-framework) model files.
    algo_config.export_native_model_files = True
    algo_config.env = "CartPole-v1"

    algorithm = algo_config.build()
    for _ in range(num_steps):
        algorithm.train()

    # First the Policy checkpoint ...
    algorithm.export_policy_checkpoint(ckpt_dir)
    # ... then the tensorflow keras Model for online serving.
    algorithm.export_policy_model(model_dir)


def restore_saved_model(export_dir):
    """Loads the exported SavedModel from `export_dir` and prints its signature.

    Args:
        export_dir: Directory that holds the exported model.
    """
    signature_key = (
        tf1.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
    )
    graph = tf1.Graph()
    # Load into a fresh graph through a session bound to that graph.
    with graph.as_default(), tf1.Session(graph=graph) as session:
        serving_tags = [tf1.saved_model.tag_constants.SERVING]
        meta_graph_def = tf1.saved_model.load(session, serving_tags, export_dir)
        print("Model restored!")
        print("Signature Def Information:")
        print(meta_graph_def.signature_def[signature_key])
        print("You can inspect the model using TensorFlow SavedModel CLI.")
        print("https://www.tensorflow.org/guide/saved_model")


def restore_policy_from_checkpoint(export_dir):
    """Restores a Policy from `export_dir` and sanity-checks one forward pass.

    Args:
        export_dir: Directory that holds the exported policy checkpoint.
    """
    restored_policy = Policy.from_checkpoint(export_dir)
    # Dummy (CartPole) observation for a single forward pass.
    dummy_obs = np.array([0.1, 0.2, 0.3, 0.4])
    results = restored_policy.compute_single_action(dummy_obs)
    # The result must be a 3-tuple: action, RNN states, extra outputs.
    assert len(results) == 3
    action, rnn_states, extra_outs = results
    assert action.shape == ()  # pure single action (int)
    assert rnn_states == []  # RNN states
    assert extra_outs["action_dist_inputs"].shape == (2,)  # categorical inputs


if __name__ == "__main__":
    algo = "PPO"
    temp_dir = ray._private.utils.get_user_temp_dir()
    model_dir = os.path.join(temp_dir, "model_export_dir")
    ckpt_dir = os.path.join(temp_dir, "ckpt_export_dir")
    num_steps = 1
    train_and_export_policy_and_model(algo, num_steps, model_dir, ckpt_dir)
    restore_saved_model(model_dir)
    restore_policy_from_checkpoint(ckpt_dir)
"""Example showing how to continue training an Algorithm with a changed config.

Use the setup shown in this script if you want to continue a prior experiment, but
would also like to change some of the config values you originally used.

This example:
    - runs a single- or multi-agent CartPole experiment (for multi-agent, we use
    different learning rates) thereby checkpointing the state of the Algorithm every n
    iterations. The config used is hereafter called "1st config".
    - stops the experiment due to some episode return being achieved.
    - just for testing purposes, restores the entire algorithm from the latest
    checkpoint and checks, whether the state of the restored algo exactly matches the
    state of the previously saved one.
    - then changes the original config used (learning rate and other settings) and
    continues training with the restored algorithm and the changed config until a
    final episode return is reached. The new config is hereafter called "2nd config".


How to run this script
----------------------
`python [script file name].py --enable-new-api-stack --num-agents=[0 or 2]
--stop-reward-first-config=[return at which the algo on 1st config should stop training]
--stop-reward=[the final return to achieve after restoration from the checkpoint with
the 2nd config]
`

For debugging, use the following additional command line options
`--no-tune --num-env-runners=0`
which should allow you to set breakpoints anywhere in the RLlib code and
have the execution stop there for inspection and debugging.

For logging to your WandB account, use:
`--wandb-key=[your WandB API key] --wandb-project=[some project name]
--wandb-run-name=[optional: WandB run name (within the defined project)]`


Results to expect
-----------------
First, you should see the initial tune.Tuner do its thing:

Trial status: 1 RUNNING
Current time: 2024-06-03 12:03:39. Total running time: 30s
Logical resource usage: 3.0/12 CPUs, 0/0 GPUs
╭────────────────────────────────────────────────────────────────────────
│ Trial name                    status       iter     total time (s)
├────────────────────────────────────────────────────────────────────────
│ PPO_CartPole-v1_7b1eb_00000   RUNNING         6             16.265
╰────────────────────────────────────────────────────────────────────────
───────────────────────────────────────────────────────────────────────╮
..._sampled_lifetime     ..._trained_lifetime     ...episodes_lifetime │
───────────────────────────────────────────────────────────────────────┤
               24000                    24000                      340 │
───────────────────────────────────────────────────────────────────────╯
...

The experiment stops at an average episode return of `--stop-reward-first-config`.

After the validation of the last checkpoint, a new experiment is started from
scratch, but with the RLlib callback restoring the Algorithm right after
initialization using the previous checkpoint. This new experiment then runs
until `--stop-reward` is reached.

Trial status: 1 RUNNING
Current time: 2024-06-03 12:05:00. Total running time: 1min 0s
Logical resource usage: 3.0/12 CPUs, 0/0 GPUs
╭────────────────────────────────────────────────────────────────────────
│ Trial name                    status       iter     total time (s)
├────────────────────────────────────────────────────────────────────────
│ PPO_CartPole-v1_7b1eb_00000   RUNNING        23            14.8372
╰────────────────────────────────────────────────────────────────────────
───────────────────────────────────────────────────────────────────────╮
..._sampled_lifetime     ..._trained_lifetime     ...episodes_lifetime │
───────────────────────────────────────────────────────────────────────┤
              109078                   109078                      531 │
───────────────────────────────────────────────────────────────────────╯

And if you are using the `--as-test` option, you should see a final message:

```
`env_runners/episode_return_mean` of 450.0 reached! ok
```
"""
from ray.rllib.algorithms.ppo import PPOConfig
from ray.rllib.algorithms.algorithm_config import AlgorithmConfig
from ray.rllib.core import DEFAULT_MODULE_ID
from ray.rllib.examples.envs.classes.multi_agent import MultiAgentCartPole
from ray.rllib.policy.policy import PolicySpec
from ray.rllib.utils.metrics import (
    ENV_RUNNER_RESULTS,
    EPISODE_RETURN_MEAN,
    LEARNER_RESULTS,
)
from ray.rllib.utils.numpy import convert_to_numpy
from ray.rllib.utils.test_utils import (
    add_rllib_example_script_args,
    check,
    run_rllib_example_script_experiment,
)
from ray.tune.registry import register_env


parser = add_rllib_example_script_args(
    default_iters=2000,
    default_reward=450.0,
    default_timesteps=10000000,
)
parser.add_argument(
    "--stop-reward-first-config",
    type=float,
    default=150.0,
    help=(
        "Mean episode return after which the Algorithm on the first config "
        "should stop training."
    ),
)
# Checkpoint after every iteration and once more at the end of the run, unless
# overridden on the command line.
parser.set_defaults(
    checkpoint_at_end=True,
    checkpoint_freq=1,
    enable_new_api_stack=True,
)


if __name__ == "__main__":
    args = parser.parse_args()

    def _make_multi_agent_cartpole(cfg):
        # Env creator for the multi-agent CartPole with `--num-agents` agents.
        return MultiAgentCartPole({"num_agents": args.num_agents})

    register_env("ma_cart", _make_multi_agent_cartpole)

    # Simple generic config (the "1st config").
    env_name = "ma_cart" if args.num_agents != 0 else "CartPole-v1"
    base_config = PPOConfig().environment(env_name).training(lr=0.0001)
    # TODO (sven): Tune throws a weird error inside the "log json" callback
    #  when running with this option. The `perf` key in the result dict contains
    #  binary data (instead of just 2 float values for mem and cpu usage).
    # .experimental(_use_msgpack_checkpoints=True)

    # Setup multi-agent, if required.
    if args.num_agents > 0:

        def _map_agent_to_policy(aid, *a, **kw):
            # Agent `i` is always controlled by policy `p{i}`.
            return f"p{aid}"

        policy_specs = {}
        for agent_idx in range(args.num_agents):
            # Agent 1 has double the learning rate of agent 0 (and so on).
            lr_override = AlgorithmConfig.overrides(lr=5e-5 * (agent_idx + 1))
            policy_specs[f"p{agent_idx}"] = PolicySpec(config=lr_override)

        base_config.multi_agent(
            policies=policy_specs,
            policy_mapping_fn=_map_agent_to_policy,
        )

    # Define some stopping criterion. Note that this criterion is an avg episode
    # return to be reached.
    metric = f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}"
    first_stop_return = args.stop_reward_first_config
    stop = {metric: first_stop_return}

    tuner_results = run_rllib_example_script_experiment(
        base_config,
        args,
        stop=stop,
        keep_ray_up=True,
    )

    # Perform a very quick test to make sure our algo (upon restoration) did not
    # lose its ability to perform well in the env.
    # - Extract the best checkpoint.
    best_result = tuner_results.get_best_result(metric=metric, mode="max")
    best_return = best_result.metrics[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN]
    assert best_return >= first_stop_return
    best_checkpoint_path = best_result.checkpoint.path

    # Rebuild the algorithm (just for testing purposes).
    test_algo = base_config.build()
    # Load algo's state from the best checkpoint.
    test_algo.restore_from_path(best_checkpoint_path)
    # Perform some checks on the restored state.
    assert test_algo.training_iteration > 0
    # Evaluate on the restored algorithm.
    test_eval_results = test_algo.evaluate()
    eval_return = test_eval_results[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN]
    assert eval_return >= first_stop_return, eval_return
    # Train one iteration to make sure, the performance does not collapse (e.g. due
    # to the optimizer weights not having been restored properly).
    test_results = test_algo.train()
    train_return = test_results[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN]
    assert train_return >= first_stop_return, train_return
    # Stop the test algorithm again.
    test_algo.stop()

    # Make sure the algorithm gets restored from a checkpoint right after
    # initialization. Note that this includes all subcomponents of the algorithm,
    # including the optimizer states in the LearnerGroup/Learner actors.
    def on_algorithm_init(algorithm, **kwargs):
        module_p0 = algorithm.get_module("p0")

        def _first_parameter():
            # Numpy version of the module's first parameter.
            return convert_to_numpy(next(iter(module_p0.parameters())))

        weight_before = _first_parameter()

        algorithm.restore_from_path(best_checkpoint_path)

        # Make sure weights were restored (changed).
        weight_after = _first_parameter()
        check(weight_before, weight_after, false=True)

    # Change the config (the "2nd config").
    # Make sure the algorithm gets restored upon initialization.
    base_config.callbacks(on_algorithm_init=on_algorithm_init)
    # Change training parameters considerably.
    base_config.training(
        lr=0.0003,
        train_batch_size=5000,
        grad_clip=100.0,
        gamma=0.996,
        num_epochs=6,
        vf_loss_coeff=0.01,
    )
    # Make multi-CPU/GPU.
    base_config.learners(num_learners=2)
    # Use more env runners and more envs per env runner.
    base_config.env_runners(num_env_runners=3, num_envs_per_env_runner=5)

    # Update the stopping criterion to the final target return per episode.
    stop = {metric: args.stop_reward}

    # Run a new experiment with the (RLlib) callback `on_algorithm_init` restoring
    # from the best checkpoint.
    # Note that the new experiment starts again from iteration=0 (unlike when you
    # use `tune.Tuner.restore()` after a crash or interrupted trial).
    tuner_results = run_rllib_example_script_experiment(base_config, args, stop=stop)

    # Assert that we have continued training with a different learning rate.
    learner_metrics = tuner_results[0].metrics[LEARNER_RESULTS][DEFAULT_MODULE_ID]
    reported_lr = learner_metrics["default_optimizer_learning_rate"]
    assert reported_lr == base_config.lr == 0.0003
"""Example extracting a checkpoint from n trials using one or more custom criteria.

This example:
    - runs a CartPole experiment with three different learning rates (three tune
    "trials"). During the experiment, for each trial, we create a checkpoint at each
    iteration.
    - at the end of the experiment, we compare the trials and pick the one that
    performed best, based on the criterion: Lowest episode count per single iteration
    (for CartPole, a low episode count means the episodes are very long and thus the
    reward is also very high).
    - from that best trial (with the lowest episode count), we then pick those
    checkpoints that a) have the lowest policy loss (good) and b) have the highest
    value function loss (bad).


How to run this script
----------------------
`python [script file name].py --enable-new-api-stack`

For debugging, use the following additional command line options
`--no-tune --num-env-runners=0`
which should allow you to set breakpoints anywhere in the RLlib code and
have the execution stop there for inspection and debugging.

For logging to your WandB account, use:
`--wandb-key=[your WandB API key] --wandb-project=[some project name]
--wandb-run-name=[optional: WandB run name (within the defined project)]`


Results to expect
-----------------
In the console output, you can see the performance of the three different learning
rates used here:

+-----------------------------+------------+-----------------+--------+--------+
| Trial name                  | status     | loc             |     lr |   iter |
|-----------------------------+------------+-----------------+--------+--------+
| PPO_CartPole-v1_d7dbe_00000 | TERMINATED | 127.0.0.1:98487 | 0.01   |     17 |
| PPO_CartPole-v1_d7dbe_00001 | TERMINATED | 127.0.0.1:98488 | 0.001  |      8 |
| PPO_CartPole-v1_d7dbe_00002 | TERMINATED | 127.0.0.1:98489 | 0.0001 |      9 |
+-----------------------------+------------+-----------------+--------+--------+

+------------------+-------+----------+----------------------+----------------------+
|   total time (s) |    ts |   reward |   episode_reward_max |   episode_reward_min |
|------------------+-------+----------+----------------------+----------------------+
|          28.1068 | 39797 |   151.11 |                  500 |                   12 |
|          13.304  | 18728 |   158.91 |                  500 |                   15 |
|          14.8848 | 21069 |   167.36 |                  500 |                   13 |
+------------------+-------+----------+----------------------+----------------------+

+--------------------+
|   episode_len_mean |
|--------------------|
|             151.11 |
|             158.91 |
|             167.36 |
+--------------------+
"""

from ray import tune
from ray.rllib.core import DEFAULT_MODULE_ID
from ray.rllib.utils.metrics import (
    ENV_RUNNER_RESULTS,
    EPISODE_RETURN_MEAN,
    LEARNER_RESULTS,
)
from ray.rllib.utils.test_utils import (
    add_rllib_example_script_args,
    run_rllib_example_script_experiment,
)
from ray.tune.registry import get_trainable_cls

parser = add_rllib_example_script_args(
    default_iters=200,
    default_reward=450.0,
    default_timesteps=100000,
)


if __name__ == "__main__":
    args = parser.parse_args()

    # Force-set `args.checkpoint_freq` to 1.
    args.checkpoint_freq = 1

    # Simple generic config for the algorithm selected through `--algo`.
    base_config = get_trainable_cls(args.algo).get_default_config()
    base_config = base_config.environment("CartPole-v1")
    # Run 3 trials, each w/ a different learning rate.
    base_config = base_config.training(
        lr=tune.grid_search([0.01, 0.001, 0.0001]),
        train_batch_size=2341,
    )
    # Run tune for some iterations and generate checkpoints.
    results = run_rllib_example_script_experiment(base_config, args)

    # Get the best of the 3 trials by using some metric.
    # NOTE: Choosing the min episode count per iteration automatically picks the
    # trial with the best performance (over the entire run (scope="all")):
    # The fewer episodes, the longer each episode lasted, the more reward we
    # got each episode.
    # Setting scope to "last", "last-5-avg", or "last-10-avg" will only compare
    # (using `mode=min|max`) the average values of the last 1, 5, or 10
    # iterations with each other, respectively.
    # Setting scope to "avg" will compare (using `mode`=min|max) the average
    # values over the entire run.
    metric = "env_runners/num_episodes"
    return_key = f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}"
    # Notice here `scope` is `all`, meaning for each trial,
    # all results (not just the last one) will be examined.
    best_result = results.get_best_result(metric=metric, mode="min", scope="all")
    best_trial_df = best_result.metrics_dataframe
    value_best_metric = best_trial_df[metric].min()
    best_return_best = best_trial_df[return_key].max()
    best_lr = best_result.metrics["config"]["lr"]
    print(
        f"Best trial was the one with lr={best_lr}. "
        f"Reached lowest episode count ({value_best_metric}) in a single iteration and "
        f"an average return of {best_return_best}."
    )

    # Confirm, we picked the right trial.
    all_trials_df = results.get_dataframe(filter_metric=metric, filter_mode="min")
    assert value_best_metric == all_trials_df[metric].min()

    # Get the best checkpoints from the trial, based on different metrics.
    # Checkpoint with the lowest policy loss value:
    policy_loss_key = (
        f"{LEARNER_RESULTS}/{DEFAULT_MODULE_ID}/policy_loss"
        if args.enable_new_api_stack
        else "info/learner/default_policy/learner_stats/policy_loss"
    )
    best_result = results.get_best_result(metric=policy_loss_key, mode="min")
    ckpt = best_result.checkpoint
    lowest_policy_loss = best_result.metrics_dataframe[policy_loss_key].min()
    print(f"Checkpoint w/ lowest policy loss ({lowest_policy_loss}): {ckpt}")

    # Checkpoint with the highest value-function loss:
    vf_loss_key = (
        f"{LEARNER_RESULTS}/{DEFAULT_MODULE_ID}/vf_loss"
        if args.enable_new_api_stack
        else "info/learner/default_policy/learner_stats/vf_loss"
    )
    best_result = results.get_best_result(metric=vf_loss_key, mode="max")
    ckpt = best_result.checkpoint
    highest_value_fn_loss = best_result.metrics_dataframe[vf_loss_key].max()
    print(f"Checkpoint w/ highest value function loss: {ckpt}")
    print(f"Highest value function loss: {highest_value_fn_loss}")
+ +Use the setup shown in this script if your experiments tend to crash after some time, +and you would therefore like to make your setup more robust and fault-tolerant. + +This example: + - runs a single- or multi-agent CartPole experiment (for multi-agent, we use + different learning rates) thereby checkpointing the state of the Algorithm every n + iterations. + - stops the experiment due to an expected crash in the algorithm's main process + after a certain number of iterations. + - just for testing purposes, restores the entire algorithm from the latest + checkpoint and checks whether the state of the restored algo exactly matches the + state of the crashed one. + - then continues training with the restored algorithm until the desired final + episode return is reached. + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack --num-agents=[0 or 2] +--stop-reward-crash=[the episode return after which the algo should crash] +--stop-reward=[the final episode return to achieve after(!) restoration from the +checkpoint] +` + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- +First, you should see the initial tune.Tuner do its thing: + +Trial status: 1 RUNNING +Current time: 2024-06-03 12:03:39.
Total running time: 30s +Logical resource usage: 3.0/12 CPUs, 0/0 GPUs +╭──────────────────────────────────────────────────────────────────────── +│ Trial name status iter total time (s) +├──────────────────────────────────────────────────────────────────────── +│ PPO_CartPole-v1_7b1eb_00000 RUNNING 6 15.362 +╰──────────────────────────────────────────────────────────────────────── +───────────────────────────────────────────────────────────────────────╮ +..._sampled_lifetime ..._trained_lifetime ...episodes_lifetime │ +───────────────────────────────────────────────────────────────────────┤ + 24000 24000 340 │ +───────────────────────────────────────────────────────────────────────╯ +... + +then, you should see the experiment crashing as soon as the `--stop-reward-crash` +has been reached: + +```RuntimeError: Intended crash after reaching trigger return.``` + +At some point, the experiment should resume exactly where it left off (using +the checkpoint and restored Tuner): + +Trial status: 1 RUNNING +Current time: 2024-06-03 12:05:00. Total running time: 1min 0s +Logical resource usage: 3.0/12 CPUs, 0/0 GPUs +╭──────────────────────────────────────────────────────────────────────── +│ Trial name status iter total time (s) +├──────────────────────────────────────────────────────────────────────── +│ PPO_CartPole-v1_7b1eb_00000 RUNNING 27 66.1451 +╰──────────────────────────────────────────────────────────────────────── +───────────────────────────────────────────────────────────────────────╮ +..._sampled_lifetime ..._trained_lifetime ...episodes_lifetime │ +───────────────────────────────────────────────────────────────────────┤ + 108000 108000 531 │ +───────────────────────────────────────────────────────────────────────╯ + +And if you are using the `--as-test` option, you should see a final message: + +``` +`env_runners/episode_return_mean` of 500.0 reached!
ok +``` +""" +import re +import time + +from ray import train, tune +from ray.rllib.algorithms.algorithm_config import AlgorithmConfig +from ray.rllib.callbacks.callbacks import RLlibCallback +from ray.rllib.examples.envs.classes.multi_agent import MultiAgentCartPole +from ray.rllib.policy.policy import PolicySpec +from ray.rllib.utils.metrics import ( + ENV_RUNNER_RESULTS, + EPISODE_RETURN_MEAN, +) +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + check_learning_achieved, +) +from ray.tune.registry import get_trainable_cls, register_env +from ray.air.integrations.wandb import WandbLoggerCallback + + +parser = add_rllib_example_script_args( + default_reward=500.0, default_timesteps=10000000, default_iters=2000 +) +parser.add_argument( + "--stop-reward-crash", + type=float, + default=200.0, + help="Mean episode return after which the Algorithm should crash.", +) +# By default, set `args.checkpoint_freq` to 1 and `args.checkpoint_at_end` to True. +parser.set_defaults(checkpoint_freq=1, checkpoint_at_end=True) + + +class CrashAfterNIters(RLlibCallback): + """Callback that makes the algo crash after a certain avg. return is reached.""" + + def __init__(self): + super().__init__() + # We have to delay crashing by one iteration just so the checkpoint still + # gets created by Tune after(!) we have reached the trigger avg. return. + self._should_crash = False + + def on_train_result(self, *, algorithm, metrics_logger, result, **kwargs): + # We had already reached the mean-return to crash, the last checkpoint written + # (the one from the previous iteration) should yield that exact avg. return. + if self._should_crash: + raise RuntimeError("Intended crash after reaching trigger return.") + # Reached crashing criterion, crash on next iteration. 
+ elif result[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN] >= args.stop_reward_crash: + print( + "Reached trigger return of " + f"{result[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN]}" + ) + self._should_crash = True + + +if __name__ == "__main__": + args = parser.parse_args() + + register_env( + "ma_cart", lambda cfg: MultiAgentCartPole({"num_agents": args.num_agents}) + ) + + # Simple generic config. + config = ( + get_trainable_cls(args.algo) + .get_default_config() + .api_stack( + enable_rl_module_and_learner=args.enable_new_api_stack, + enable_env_runner_and_connector_v2=args.enable_new_api_stack, + ) + .environment("CartPole-v1" if args.num_agents == 0 else "ma_cart") + .env_runners(create_env_on_local_worker=True) + .training(lr=0.0001) + .callbacks(CrashAfterNIters) + ) + + # Tune config. + # Need a WandB callback? + tune_callbacks = [] + if args.wandb_key: + project = args.wandb_project or ( + args.algo.lower() + "-" + re.sub("\\W+", "-", str(config.env).lower()) + ) + tune_callbacks.append( + WandbLoggerCallback( + api_key=args.wandb_key, + project=args.wandb_project, + upload_checkpoints=False, + **({"name": args.wandb_run_name} if args.wandb_run_name else {}), + ) + ) + + # Setup multi-agent, if required. + if args.num_agents > 0: + config.multi_agent( + policies={ + f"p{aid}": PolicySpec( + config=AlgorithmConfig.overrides( + lr=5e-5 + * (aid + 1), # agent 1 has double the learning rate as 0. + ) + ) + for aid in range(args.num_agents) + }, + policy_mapping_fn=lambda aid, *a, **kw: f"p{aid}", + ) + + # Define some stopping criterion. Note that this criterion is an avg episode return + # to be reached. The stop criterion does not consider the built-in crash we are + # triggering through our callback. + stop = { + f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": args.stop_reward, + } + + # Run tune for some iterations and generate checkpoints. 
+ tuner = tune.Tuner( + trainable=config.algo_class, + param_space=config, + run_config=train.RunConfig( + callbacks=tune_callbacks, + checkpoint_config=train.CheckpointConfig( + checkpoint_frequency=args.checkpoint_freq, + checkpoint_at_end=args.checkpoint_at_end, + ), + stop=stop, + ), + ) + tuner_results = tuner.fit() + + # Perform a very quick test to make sure our algo (upon restoration) did not lose + # its ability to perform well in the env. + # - Extract the best checkpoint. + metric = f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}" + best_result = tuner_results.get_best_result(metric=metric, mode="max") + assert ( + best_result.metrics[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN] + >= args.stop_reward_crash + ) + # - Change our config, such that the restored algo will have an env on the local + # EnvRunner (to perform evaluation) and won't crash anymore (remove the crashing + # callback). + config.callbacks(None) + # Rebuild the algorithm (just for testing purposes). + test_algo = config.build() + # Load algo's state from best checkpoint. + test_algo.restore(best_result.checkpoint) + # Perform some checks on the restored state. + assert test_algo.training_iteration > 0 + # Evaluate on the restored algorithm. + test_eval_results = test_algo.evaluate() + assert ( + test_eval_results[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN] + >= args.stop_reward_crash + ), test_eval_results[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN] + # Train one iteration to make sure, the performance does not collapse (e.g. due + # to the optimizer weights not having been restored properly). + test_results = test_algo.train() + assert ( + test_results[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN] >= args.stop_reward_crash + ), test_results[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN] + # Stop the test algorithm again. + test_algo.stop() + + # Create a new Tuner from the existing experiment path (which contains the tuner's + # own checkpoint file). 
Note that even the WandB logging will be continued without + # creating a new WandB run name. + restored_tuner = tune.Tuner.restore( + path=tuner_results.experiment_path, + trainable=config.algo_class, + param_space=config, + # Important to set this to True b/c the previous trial had failed (due to our + # `CrashAfterNIters` callback). + resume_errored=True, + ) + # Continue the experiment exactly where we left off. + tuner_results = restored_tuner.fit() + + # Not sure, whether this is really necessary, but we have observed the WandB + # logger sometimes not logging some of the last iterations. This sleep here might + # give it enough time to do so. + time.sleep(20) + + if args.as_test: + check_learning_achieved(tuner_results, args.stop_reward, metric=metric) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/onnx_tf.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/onnx_tf.py new file mode 100644 index 0000000000000000000000000000000000000000..19fb7f3760328985de89edba7aee837cd569895f --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/onnx_tf.py @@ -0,0 +1,91 @@ +# @OldAPIStack +import argparse +import numpy as np +import onnxruntime +import os +import shutil + +import ray +import ray.rllib.algorithms.ppo as ppo + +parser = argparse.ArgumentParser() + +parser.add_argument( + "--framework", + choices=["tf", "tf2"], + default="tf2", + help="The TF framework specifier (either 'tf' or 'tf2').", +) + + +if __name__ == "__main__": + + args = parser.parse_args() + + # Configure our PPO Algorithm. 
+ config = ( + ppo.PPOConfig() + .api_stack( + enable_env_runner_and_connector_v2=False, + enable_rl_module_and_learner=False, + ) + .env_runners(num_env_runners=1) + .framework(args.framework) + ) + + outdir = "export_tf" + if os.path.exists(outdir): + shutil.rmtree(outdir) + + np.random.seed(1234) + + # We will run inference with this test batch + test_data = { + "obs": np.random.uniform(0, 1.0, size=(10, 4)).astype(np.float32), + } + + # Start Ray and initialize a PPO Algorithm + ray.init() + algo = config.build(env="CartPole-v1") + + # You could train the model here via: + # algo.train() + + # Let's run inference on the tensorflow model + policy = algo.get_policy() + result_tf, _ = policy.model(test_data) + + # Evaluate tensor to fetch numpy array. + if args.framework == "tf": + with policy.get_session().as_default(): + result_tf = result_tf.eval() + + # This line will export the model to ONNX. + policy.export_model(outdir, onnx=11) + # Equivalent to: + # algo.export_policy_model(outdir, onnx=11) + + # Import ONNX model. + exported_model_file = os.path.join(outdir, "model.onnx") + + # Start an inference session for the ONNX model + session = onnxruntime.InferenceSession(exported_model_file, None) + + # Pass the same test batch to the ONNX model (rename to match tensor names) + onnx_test_data = {f"default_policy/{k}:0": v for k, v in test_data.items()} + + # Tf2 model stored differently from tf (static graph) model. + if args.framework == "tf2": + result_onnx = session.run(["fc_out"], {"observations": test_data["obs"]}) + else: + result_onnx = session.run( + ["default_policy/model/fc_out/BiasAdd:0"], + onnx_test_data, + ) + + # These results should be equal! + print("TENSORFLOW", result_tf) + print("ONNX", result_onnx) + + assert np.allclose(result_tf, result_onnx), "Model outputs are NOT equal. FAILED" + print("Model outputs are equal. 
PASSED") diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/onnx_torch.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/onnx_torch.py new file mode 100644 index 0000000000000000000000000000000000000000..b7d39cc9225a79f73e9077c9207ed2f39237068d --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/onnx_torch.py @@ -0,0 +1,79 @@ +# @OldAPIStack + +from packaging.version import Version +import numpy as np +import ray +import ray.rllib.algorithms.ppo as ppo +import onnxruntime +import os +import shutil +import torch + +if __name__ == "__main__": + # Configure our PPO Algorithm. + config = ( + ppo.PPOConfig() + .api_stack( + enable_env_runner_and_connector_v2=False, + enable_rl_module_and_learner=False, + ) + .env_runners(num_env_runners=1) + .framework("torch") + ) + + outdir = "export_torch" + if os.path.exists(outdir): + shutil.rmtree(outdir) + + np.random.seed(1234) + + # We will run inference with this test batch + test_data = { + "obs": np.random.uniform(0, 1.0, size=(10, 4)).astype(np.float32), + "state_ins": np.array([0.0], dtype=np.float32), + } + + # Start Ray and initialize a PPO Algorithm. + ray.init() + algo = config.build(env="CartPole-v1") + + # You could train the model here + # algo.train() + + # Let's run inference on the torch model + policy = algo.get_policy() + result_pytorch, _ = policy.model( + { + "obs": torch.tensor(test_data["obs"]), + } + ) + + # Evaluate tensor to fetch numpy array + result_pytorch = result_pytorch.detach().numpy() + + # This line will export the model to ONNX. + policy.export_model(outdir, onnx=11) + # Equivalent to: + # algo.export_policy_model(outdir, onnx=11) + + # Import ONNX model. 
+ exported_model_file = os.path.join(outdir, "model.onnx") + + # Start an inference session for the ONNX model + session = onnxruntime.InferenceSession(exported_model_file, None) + + # Pass the same test batch to the ONNX model + if Version(torch.__version__) < Version("1.9.0"): + # In torch < 1.9.0 the second input/output name gets mixed up + test_data["state_outs"] = test_data.pop("state_ins") + + result_onnx = session.run(["output"], test_data) + + # These results should be equal! + print("PYTORCH", result_pytorch) + print("ONNX", result_onnx) + + assert np.allclose( + result_pytorch, result_onnx + ), "Model outputs are NOT equal. FAILED" + print("Model outputs are equal. PASSED") diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/onnx_torch_lstm.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/onnx_torch_lstm.py new file mode 100644 index 0000000000000000000000000000000000000000..d95a282a3a30d036afe39573850661c860df2740 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/onnx_torch_lstm.py @@ -0,0 +1,136 @@ +# @OldAPIStack + +import numpy as np +import onnxruntime + +import ray +import ray.rllib.algorithms.ppo as ppo +from ray.rllib.utils.framework import try_import_torch +from ray.rllib.utils.test_utils import add_rllib_example_script_args, check +from ray.rllib.utils.torch_utils import convert_to_torch_tensor + +torch, _ = try_import_torch() + +parser = add_rllib_example_script_args() +parser.set_defaults(num_env_runners=1) + + +class ONNXCompatibleWrapper(torch.nn.Module): + def __init__(self, original_model): + super(ONNXCompatibleWrapper, self).__init__() + self.original_model = original_model + + def forward(self, a, b0, b1, c): + # Convert the separate tensor inputs back into the list format + # expected by the original model's forward method. 
+ b = [b0, b1] + ret = self.original_model({"obs": a}, b, c) + # results, state_out_0, state_out_1 + return ret[0], ret[1][0], ret[1][1] + + +if __name__ == "__main__": + args = parser.parse_args() + + assert ( + not args.enable_new_api_stack + ), "Must NOT set --enable-new-api-stack when running this script!" + + ray.init(local_mode=args.local_mode) + + # Configure our PPO Algorithm. + config = ( + ppo.PPOConfig() + # ONNX is not supported by RLModule API yet. + .api_stack( + enable_rl_module_and_learner=args.enable_new_api_stack, + enable_env_runner_and_connector_v2=args.enable_new_api_stack, + ) + .environment("CartPole-v1") + .env_runners(num_env_runners=args.num_env_runners) + .training(model={"use_lstm": True}) + ) + + B = 3 + T = 5 + LSTM_CELL = 256 + + # Input data for a python inference forward call. + test_data_python = { + "obs": np.random.uniform(0, 1.0, size=(B * T, 4)).astype(np.float32), + "state_ins": [ + np.random.uniform(0, 1.0, size=(B, LSTM_CELL)).astype(np.float32), + np.random.uniform(0, 1.0, size=(B, LSTM_CELL)).astype(np.float32), + ], + "seq_lens": np.array([T] * B, np.float32), + } + # Input data for the ONNX session. + test_data_onnx = { + "obs": test_data_python["obs"], + "state_in_0": test_data_python["state_ins"][0], + "state_in_1": test_data_python["state_ins"][1], + "seq_lens": test_data_python["seq_lens"], + } + + # Input data for compiling the ONNX model. + test_data_onnx_input = convert_to_torch_tensor(test_data_onnx) + + # Initialize a PPO Algorithm. 
+ algo = config.build() + + # You could train the model here + # algo.train() + + # Let's run inference on the torch model + policy = algo.get_policy() + result_pytorch, _ = policy.model( + { + "obs": torch.tensor(test_data_python["obs"]), + }, + [ + torch.tensor(test_data_python["state_ins"][0]), + torch.tensor(test_data_python["state_ins"][1]), + ], + torch.tensor(test_data_python["seq_lens"]), + ) + + # Evaluate tensor to fetch numpy array + result_pytorch = result_pytorch.detach().numpy() + + # Wrap the actual ModelV2 with the torch wrapper above to make this all work with + # LSTMs (extra `state` in- and outputs and `seq_lens` inputs). + onnx_compatible = ONNXCompatibleWrapper(policy.model) + exported_model_file = "model.onnx" + input_names = [ + "obs", + "state_in_0", + "state_in_1", + "seq_lens", + ] + + # This line will export the model to ONNX. + torch.onnx.export( + onnx_compatible, + tuple(test_data_onnx_input[n] for n in input_names), + exported_model_file, + export_params=True, + opset_version=11, + do_constant_folding=True, + input_names=input_names, + output_names=[ + "output", + "state_out_0", + "state_out_1", + ], + dynamic_axes={k: {0: "batch_size"} for k in input_names}, + ) + # Start an inference session for the ONNX model. + session = onnxruntime.InferenceSession(exported_model_file, None) + result_onnx = session.run(["output"], test_data_onnx) + + # These results should be equal! + print("PYTORCH", result_pytorch) + print("ONNX", result_onnx[0]) + + check(result_pytorch, result_onnx[0]) + print("Model outputs are equal. 
PASSED") diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/restore_1_of_n_agents_from_checkpoint.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/restore_1_of_n_agents_from_checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..acbbb83118943eecd1e897a572fbe5b8d9267c68 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/restore_1_of_n_agents_from_checkpoint.py @@ -0,0 +1,171 @@ +"""Example demonstrating how to load module weights for 1 of n agents from a checkpoint. + +This example: + - Runs a multi-agent `Pendulum-v1` experiment with >= 2 policies, p0, p1, etc.. + - Saves a checkpoint of the `MultiRLModule` every `--checkpoint-freq` + iterations. + - Stops the experiments after the agents reach a combined return of -800. + - Picks the best checkpoint by combined return and restores p0 from it. + - Runs a second experiment with the restored `RLModule` for p0 and + a fresh `RLModule` for the other policies. + - Stops the second experiment after the agents reach a combined return of -800. + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack --num-agents=2 +--checkpoint-freq=20 --checkpoint-at-end` + +Control the number of agents and policies (RLModules) via --num-agents and +--num-policies. + +Control the number of checkpoints by setting `--checkpoint-freq` to a value > 0. +Note that the checkpoint frequency is per iteration and this example needs at +least a single checkpoint to load the RLModule weights for policy 0. +If `--checkpoint-at-end` is set, a checkpoint will be saved at the end of the +experiment. + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. 
+ +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- +You should expect a reward of -400.0 eventually being achieved by a simple +single PPO policy. In the second run of the experiment, the MultiRLModule weights +for policy 0 are restored from the checkpoint of the first run. The reward for a +single agent should be -400.0 again, but the training time should be shorter +(around 30 iterations instead of 190) due to the fact that one policy is already +an expert from the get go. +""" + +from pathlib import Path + +from ray.air.constants import TRAINING_ITERATION +from ray.rllib.algorithms.callbacks import DefaultCallbacks +from ray.rllib.core import ( + COMPONENT_LEARNER, + COMPONENT_LEARNER_GROUP, + COMPONENT_RL_MODULE, +) +from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig +from ray.rllib.examples.envs.classes.multi_agent import MultiAgentPendulum +from ray.rllib.utils.metrics import ( + ENV_RUNNER_RESULTS, + EPISODE_RETURN_MEAN, + NUM_ENV_STEPS_SAMPLED_LIFETIME, +) +from ray.rllib.utils.numpy import convert_to_numpy +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + check, + run_rllib_example_script_experiment, +) +from ray.tune.registry import get_trainable_cls, register_env + +parser = add_rllib_example_script_args( + # Pendulum-v1 sum of 2 agents (each agent reaches -250). + default_reward=-500.0, +) +parser.set_defaults( + enable_new_api_stack=True, + checkpoint_freq=1, + num_agents=2, +) +# TODO (sven): This arg is currently ignored (hard-set to 2). +parser.add_argument("--num-policies", type=int, default=2) + + +if __name__ == "__main__": + args = parser.parse_args() + + # Register our environment with tune. 
+ if args.num_agents > 1: + register_env( + "env", + lambda _: MultiAgentPendulum(config={"num_agents": args.num_agents}), + ) + else: + raise ValueError( + f"`num_agents` must be > 1, but is {args.num_agents}." + "Read the script docstring for more information." + ) + + assert args.checkpoint_freq > 0, ( + "This example requires at least one checkpoint to load the RLModule " + "weights for policy 0." + ) + + base_config = ( + get_trainable_cls(args.algo) + .get_default_config() + .environment("env") + .training( + train_batch_size_per_learner=512, + minibatch_size=64, + lambda_=0.1, + gamma=0.95, + lr=0.0003, + vf_clip_param=10.0, + ) + .rl_module( + model_config=DefaultModelConfig(fcnet_activation="relu"), + ) + ) + + # Add a simple multi-agent setup. + if args.num_agents > 0: + base_config.multi_agent( + policies={f"p{i}" for i in range(args.num_agents)}, + policy_mapping_fn=lambda aid, *a, **kw: f"p{aid}", + ) + + # Augment the base config with further settings and train the agents. + results = run_rllib_example_script_experiment(base_config, args, keep_ray_up=True) + + # Now swap in the RLModule weights for policy 0. + chkpt_path = results.get_best_result().checkpoint.path + p_0_module_state_path = ( + Path(chkpt_path) # <- algorithm's checkpoint dir + / COMPONENT_LEARNER_GROUP # <- learner group + / COMPONENT_LEARNER # <- learner + / COMPONENT_RL_MODULE # <- MultiRLModule + / "p0" # <- (single) RLModule + ) + + class LoadP0OnAlgoInitCallback(DefaultCallbacks): + def on_algorithm_init(self, *, algorithm, **kwargs): + module_p0 = algorithm.get_module("p0") + weight_before = convert_to_numpy(next(iter(module_p0.parameters()))) + algorithm.restore_from_path( + p_0_module_state_path, + component=( + COMPONENT_LEARNER_GROUP + + "/" + + COMPONENT_LEARNER + + "/" + + COMPONENT_RL_MODULE + + "/p0" + ), + ) + # Make sure weights were updated. 
+ weight_after = convert_to_numpy(next(iter(module_p0.parameters()))) + check(weight_before, weight_after, false=True) + + base_config.callbacks(LoadP0OnAlgoInitCallback) + + # Define stopping criteria. + stop = { + f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": -800.0, + f"{ENV_RUNNER_RESULTS}/{NUM_ENV_STEPS_SAMPLED_LIFETIME}": 100000, + TRAINING_ITERATION: 100, + } + + # Run the experiment again with the restored MultiRLModule. + run_rllib_example_script_experiment(base_config, args, stop=stop) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/curiosity/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/curiosity/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/curiosity/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/curiosity/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7f741fec764ac0a65189274a67110b97e3c7f0eb Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/curiosity/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/curiosity/__pycache__/count_based_curiosity.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/curiosity/__pycache__/count_based_curiosity.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1ff2c5d036f2d23219a61e2689ef3ff72bf7e642 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/curiosity/__pycache__/count_based_curiosity.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/curiosity/__pycache__/euclidian_distance_based_curiosity.cpython-311.pyc 
b/.venv/lib/python3.11/site-packages/ray/rllib/examples/curiosity/__pycache__/euclidian_distance_based_curiosity.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..08b5f8e5066e1b0bdbe5646796137a8af101128c Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/curiosity/__pycache__/euclidian_distance_based_curiosity.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/curiosity/__pycache__/intrinsic_curiosity_model_based_curiosity.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/curiosity/__pycache__/intrinsic_curiosity_model_based_curiosity.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f0307f631429891c01a1a357689fdb190fafb5e2 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/curiosity/__pycache__/intrinsic_curiosity_model_based_curiosity.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/curiosity/count_based_curiosity.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/curiosity/count_based_curiosity.py new file mode 100644 index 0000000000000000000000000000000000000000..f7d1aa6e15530b221dfe8b53448ed611178faf90 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/curiosity/count_based_curiosity.py @@ -0,0 +1,137 @@ +"""Example of using a count-based curiosity mechanism to learn in sparse-rewards envs. + +This example: + - demonstrates how to define your own count-based curiosity ConnectorV2 piece + that computes intrinsic rewards based on simple observation counts and adds these + intrinsic rewards to the "main" (extrinsic) rewards. + - shows how this connector piece overrides the main (extrinsic) rewards in the + episode and thus demonstrates how to do reward shaping in general with RLlib. + - shows how to plug this connector piece into your algorithm's config. 
+ - uses Tune and RLlib to learn the env described above and compares 2 + algorithms, one that does use curiosity vs one that does not. + +We use a FrozenLake (sparse reward) environment with a map size of 8x8 and a time step +limit of 14 to make it almost impossible for a non-curiosity based policy to learn. + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack` + +Use the `--no-curiosity` flag to disable curiosity learning and force your policy +to be trained on the task w/o the use of intrinsic rewards. With this option, the +algorithm should NOT succeed. + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- +In the console output, you can see that only a PPO policy that uses curiosity can +actually learn. 
# Command line interface of this example script. On top of RLlib's default example
# args, we add two curiosity-specific flags.
parser = add_rllib_example_script_args(
    default_reward=0.99, default_iters=200, default_timesteps=1000000
)
parser.set_defaults(enable_new_api_stack=True)
parser.add_argument(
    "--intrinsic-reward-coeff",
    type=float,
    default=1.0,
    help="The weight with which to multiply intrinsic rewards before adding them to "
    "the extrinsic ones (default is 1.0).",
)
parser.add_argument(
    "--no-curiosity",
    action="store_true",
    help="Whether to NOT use count-based curiosity.",
)

# Settings passed to the FrozenLake-v1 env as its `env_config`.
ENV_OPTIONS = {
    "is_slippery": False,
    # Use this hard-to-solve 8x8 map with lots of holes (H) to fall into and only very
    # few valid paths from the starting state (S) to the goal state (G).
    "desc": [
        "SFFHFFFH",
        "FFFHFFFF",
        "FFFHHFFF",
        "FFFFFFFH",
        "HFFHFFFF",
        "HHFHFFHF",
        "FFFHFHHF",
        "FHFFFFFG",
    ],
    # Limit the number of steps the agent is allowed to make in the env to
    # make it almost impossible to learn without (count-based) curiosity.
    "max_episode_steps": 14,
}


if __name__ == "__main__":
    args = parser.parse_args()

    # Bind the coefficient to a plain float so the connector-builder lambda below
    # does not have to capture the whole argparse namespace.
    intrinsic_reward_coeff = args.intrinsic_reward_coeff

    base_config = (
        get_trainable_cls(args.algo)
        .get_default_config()
        .environment(
            "FrozenLake-v1",
            env_config=ENV_OPTIONS,
        )
        .env_runners(
            num_envs_per_env_runner=5,
            # Flatten discrete observations (into one-hot vectors).
            env_to_module_connector=lambda env: FlattenObservations(),
        )
        .training(
            # The main code in this example: We add the `CountBasedCuriosity` connector
            # piece to our Learner connector pipeline.
            # This pipeline is fed with collected episodes (either directly from the
            # EnvRunners in on-policy fashion or from a replay buffer) and converts
            # these episodes into the final train batch. The added piece computes
            # intrinsic rewards based on simple observation counts and adds them to
            # the "main" (extrinsic) rewards.
            # The `--intrinsic-reward-coeff` CLI value is forwarded to the connector
            # (previously, the flag was parsed, but silently ignored).
            # NOTE(review): assumes `CountBasedCuriosity` accepts an
            # `intrinsic_reward_coeff` keyword argument - confirm against its class.
            learner_connector=(
                None
                if args.no_curiosity
                else lambda *ags, **kw: CountBasedCuriosity(
                    intrinsic_reward_coeff=intrinsic_reward_coeff
                )
            ),
            num_epochs=10,
            vf_loss_coeff=0.01,
        )
        .rl_module(model_config=DefaultModelConfig(vf_share_layers=True))
    )

    run_rllib_example_script_experiment(base_config, args)
+ +This example: + - demonstrates how to define your own euclidian-distance-based curiosity ConnectorV2 + piece that computes intrinsic rewards based on the delta between incoming + observations and some set of already stored (prior) observations. Thereby, the + further away the incoming observation is from the already stored ones, the higher + its corresponding intrinsic reward. + - shows how this connector piece adds the intrinsic reward to the corresponding + "main" (extrinsic) reward and overrides the value in the "rewards" key in the + episode. It thus demonstrates how to do reward shaping in general with RLlib. + - shows how to plug this connector piece into your algorithm's config. + - uses Tune and RLlib to learn the env described above and compares 2 + algorithms, one that does use curiosity vs one that does not. + +We use the MountainCar-v0 environment, a sparse-reward env that is very hard to learn +for a regular PPO algorithm. + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack` + +Use the `--no-curiosity` flag to disable curiosity learning and force your policy +to be trained on the task w/o the use of intrinsic rewards. With this option, the +algorithm should NOT succeed. + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- +In the console output, you can see that only a PPO policy that uses curiosity can +actually learn. 
+ +Policy using count-based curiosity: ++-------------------------------+------------+--------+------------------+ +| Trial name | status | iter | total time (s) | +| | | | | +|-------------------------------+------------+--------+------------------+ +| PPO_FrozenLake-v1_109de_00000 | TERMINATED | 48 | 44.46 | ++-------------------------------+------------+--------+------------------+ ++------------------------+-------------------------+------------------------+ +| episode_return_mean | num_episodes_lifetime | num_env_steps_traine | +| | | d_lifetime | +|------------------------+-------------------------+------------------------| +| 0.99 | 12960 | 194000 | ++------------------------+-------------------------+------------------------+ + +Policy NOT using curiosity: +[DOES NOT LEARN AT ALL] +""" +from ray.rllib.connectors.env_to_module import MeanStdFilter +from ray.rllib.examples.connectors.classes.euclidian_distance_based_curiosity import ( + EuclidianDistanceBasedCuriosity, +) +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.tune.registry import get_trainable_cls + +# TODO (sven): SB3's PPO learns MountainCar-v0 until a reward of ~-110. +# We might have to play around some more with different initializations, etc.. +# to get to these results as well. 
# Command line interface of this example script. On top of RLlib's default example
# args, we add two curiosity-specific flags.
parser = add_rllib_example_script_args(
    default_reward=-140.0, default_iters=2000, default_timesteps=1000000
)
parser.set_defaults(
    enable_new_api_stack=True,
    num_env_runners=4,
)
parser.add_argument(
    "--intrinsic-reward-coeff",
    type=float,
    default=0.0001,
    help="The weight with which to multiply intrinsic rewards before adding them to "
    "the extrinsic ones (default is 0.0001).",
)
parser.add_argument(
    "--no-curiosity",
    action="store_true",
    help="Whether to NOT use euclidian-distance-based curiosity.",
)


if __name__ == "__main__":
    args = parser.parse_args()

    base_config = (
        get_trainable_cls(args.algo)
        .get_default_config()
        .environment("MountainCar-v0")
        .env_runners(
            env_to_module_connector=lambda env: MeanStdFilter(),
            num_envs_per_env_runner=5,
        )
        .training(
            # The main code in this example: We add the
            # `EuclidianDistanceBasedCuriosity` connector piece to our Learner connector
            # pipeline. This pipeline is fed with collected episodes (either directly
            # from the EnvRunners in on-policy fashion or from a replay buffer) and
            # converts these episodes into the final train batch. The added piece
            # computes intrinsic rewards based on the distance between incoming
            # observations and already stored (prior) observations and adds them
            # to the "main" (extrinsic) rewards.
            # NOTE(review): `--intrinsic-reward-coeff` is parsed above, but NOT
            # forwarded to the connector here, so the connector is built with its own
            # default. Confirm the intended value before wiring the flag through, as
            # doing so may change the learning results of this example.
            learner_connector=(
                None
                if args.no_curiosity
                else lambda *ags, **kw: EuclidianDistanceBasedCuriosity()
            ),
            # train_batch_size_per_learner=512,
            grad_clip=20.0,
            entropy_coeff=0.003,
            gamma=0.99,
            lr=0.0002,
            lambda_=0.98,
        )
    )

    run_rllib_example_script_experiment(base_config, args)
Note that this custom RLModule does not belong to any individual agent. + - demonstrates how to write a custom (PPO) TorchLearner that a) adds the ICM to its + MultiRLModule, b) trains the regular PPO Policy plus the ICM module, using the + PPO parent loss and the ICM's RLModule's own loss function. + +We use a FrozenLake (sparse reward) environment with a custom map size of 12x12 and a +hard time step limit of 22 to make it almost impossible for non-curiosity based +learners to learn a good policy. + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack` + +Use the `--no-curiosity` flag to disable curiosity learning and force your policy +to be trained on the task w/o the use of intrinsic rewards. With this option, the +algorithm should NOT succeed. + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- +In the console output, you can see that only a PPO policy that uses curiosity can +actually learn.
parser = add_rllib_example_script_args(
    default_iters=2000,
    default_timesteps=10000000,
    default_reward=0.9,
)
parser.set_defaults(enable_new_api_stack=True)


class MeasureMaxDistanceToStart(RLlibCallback):
    """Callback measuring the dist of the agent to its start position in FrozenLake-v1.

    Makes the naive assumption that the start position ("S") is in the upper left
    corner of the used map.
    Uses the MetricsLogger to record the (euclidian) distance value.

    Two values are logged:
    - "max_dist_travelled_across_running_episodes": On each episode end, the largest
      distance over all currently tracked episodes (window=10).
    - "max_dist_travelled_lifetime": On each sample end, the largest distance seen so
      far by this callback instance (window=1).
    """

    def __init__(self):
        super().__init__()
        # Maps episode IDs to the max distance reached so far within that episode.
        self.max_dists = defaultdict(float)
        # Largest distance over all episodes that have ended so far.
        self.max_dists_lifetime = 0.0

    def on_episode_step(
        self,
        *,
        episode,
        env_runner,
        metrics_logger,
        env,
        env_index,
        rl_module,
        **kwargs,
    ):
        """Updates the max distance from the upper left corner for `episode`."""
        num_cols = env.envs[0].unwrapped.ncol
        # The index of the largest entry in the most recent observation is taken as
        # the flat cell index (presumably, observations are one-hot here; verify
        # against the env-to-module connector in use).
        obs = np.argmax(episode.get_observations(-1))
        # Convert the flat (row-major) cell index into (row, col). Both, the division
        # and the modulo must use the number of columns; using the number of rows for
        # the modulo is only correct on square maps.
        row, col = divmod(obs, num_cols)
        curr_dist = (row**2 + col**2) ** 0.5
        if curr_dist > self.max_dists[episode.id_]:
            self.max_dists[episode.id_] = curr_dist

    def on_episode_end(
        self,
        *,
        episode,
        env_runner,
        metrics_logger,
        env,
        env_index,
        rl_module,
        **kwargs,
    ):
        """Logs the current max distance and stops tracking the ended `episode`."""
        # Compute current maximum distance across all running episodes
        # (including the just ended one). Fall back to 0.0, if no episode has
        # recorded a step yet.
        max_dist = max(self.max_dists.values(), default=0.0)
        metrics_logger.log_value(
            key="max_dist_travelled_across_running_episodes",
            value=max_dist,
            window=10,
        )
        if max_dist > self.max_dists_lifetime:
            self.max_dists_lifetime = max_dist
        # Tolerate episodes, for which no step was ever recorded.
        self.max_dists.pop(episode.id_, None)

    def on_sample_end(
        self,
        *,
        env_runner,
        metrics_logger,
        samples,
        **kwargs,
    ):
        """Logs the lifetime max distance after each sampling round."""
        metrics_logger.log_value(
            key="max_dist_travelled_lifetime",
            value=self.max_dists_lifetime,
            window=1,
        )


if __name__ == "__main__":
    args = parser.parse_args()

    if args.algo not in ["DQN", "PPO"]:
        raise ValueError(
            "Curiosity example only implemented for either DQN or PPO! "
            f"Got `--algo={args.algo}`."
        )

    base_config = (
        tune.registry.get_trainable_cls(args.algo)
        .get_default_config()
        .environment(
            "FrozenLake-v1",
            env_config={
                # Use a 12x12 map.
                "desc": [
                    "SFFFFFFFFFFF",
                    "FFFFFFFFFFFF",
                    "FFFFFFFFFFFF",
                    "FFFFFFFFFFFF",
                    "FFFFFFFFFFFF",
                    "FFFFFFFFFFFF",
                    "FFFFFFFFFFFF",
                    "FFFFFFFFFFFF",
                    "FFFFFFFFFFFF",
                    "FFFFFFFFFFFF",
                    "FFFFFFFFFFFF",
                    "FFFFFFFFFFFG",
                ],
                "is_slippery": False,
                # Limit the number of steps the agent is allowed to make in the env to
                # make it almost impossible to learn without curiosity.
                "max_episode_steps": 22,
            },
        )
        .callbacks(MeasureMaxDistanceToStart)
        .env_runners(
            num_envs_per_env_runner=5 if args.algo == "PPO" else 1,
            env_to_module_connector=lambda env: FlattenObservations(),
        )
        .training(
            learner_config_dict={
                # Intrinsic reward coefficient.
                "intrinsic_reward_coeff": 0.05,
                # Forward loss weight (vs inverse dynamics loss). Total ICM loss is:
                # L(total ICM) = (
                #     `forward_loss_weight` * L(forward)
                #     + (1.0 - `forward_loss_weight`) * L(inverse_dyn)
                # )
                "forward_loss_weight": 0.2,
            }
        )
        .rl_module(
            rl_module_spec=MultiRLModuleSpec(
                rl_module_specs={
                    # The "main" RLModule (policy) to be trained by our algo.
                    DEFAULT_MODULE_ID: RLModuleSpec(
                        **(
                            {"model_config": {"vf_share_layers": True}}
                            if args.algo == "PPO"
                            else {}
                        ),
                    ),
                    # The intrinsic curiosity model.
                    ICM_MODULE_ID: RLModuleSpec(
                        module_class=IntrinsicCuriosityModel,
                        # Only create the ICM on the Learner workers, NOT on the
                        # EnvRunners.
                        learner_only=True,
                        # Configure the architecture of the ICM here.
                        model_config={
                            "feature_dim": 288,
                            "feature_net_hiddens": (256, 256),
                            "feature_net_activation": "relu",
                            "inverse_net_hiddens": (256, 256),
                            "inverse_net_activation": "relu",
                            "forward_net_hiddens": (256, 256),
                            "forward_net_activation": "relu",
                        },
                    ),
                }
            ),
            # Use a different learning rate for training the ICM.
            algorithm_config_overrides_per_module={
                ICM_MODULE_ID: AlgorithmConfig.overrides(lr=0.0005)
            },
        )
    )

    # Set PPO-specific hyper-parameters.
    if args.algo == "PPO":
        base_config.training(
            num_epochs=6,
            # Plug in the correct Learner class.
            learner_class=PPOTorchLearnerWithCuriosity,
            train_batch_size_per_learner=2000,
            lr=0.0003,
        )
    # Set DQN-specific hyper-parameters.
    elif args.algo == "DQN":
        base_config.training(
            # Plug in the correct Learner class.
            learner_class=DQNTorchLearnerWithCuriosity,
            train_batch_size_per_learner=128,
            lr=0.00075,
            replay_buffer_config={
                "type": "PrioritizedEpisodeReplayBuffer",
                "capacity": 500000,
                "alpha": 0.6,
                "beta": 0.4,
            },
            # Epsilon exploration schedule for DQN.
            epsilon=[[0, 1.0], [500000, 0.05]],
            n_step=(3, 5),
            double_q=True,
            dueling=True,
        )

    # Stop as soon as any of the following criteria is met. The custom distance
    # metric (logged by the callback above) also serves as the success metric.
    success_key = f"{ENV_RUNNER_RESULTS}/max_dist_travelled_across_running_episodes"
    stop = {
        success_key: 12.0,
        f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": args.stop_reward,
        NUM_ENV_STEPS_SAMPLED_LIFETIME: args.stop_timesteps,
    }

    run_rllib_example_script_experiment(
        base_config,
        args,
        stop=stop,
        success_metric={success_key: stop[success_key]},
    )
file mode 100644 index 0000000000000000000000000000000000000000..f022146932a898ded253196a1236cbb1ce2b1f85 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/ray_tune/__pycache__/custom_experiment.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/ray_tune/__pycache__/custom_logger.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/ray_tune/__pycache__/custom_logger.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c8696bd7ea00b7c0932bb35d0d69fbac1b7dbdfc Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/ray_tune/__pycache__/custom_logger.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/ray_tune/__pycache__/custom_progress_reporter.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/ray_tune/__pycache__/custom_progress_reporter.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..34f6d53379b109aa70c84432466aafcc389c9799 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/ray_tune/__pycache__/custom_progress_reporter.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/ray_tune/custom_experiment.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/ray_tune/custom_experiment.py new file mode 100644 index 0000000000000000000000000000000000000000..66ce75c11eb62f2d79813b73bb80fdd8954cbc53 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/ray_tune/custom_experiment.py @@ -0,0 +1,183 @@ +"""Example of a custom Ray Tune experiment wrapping an RLlib Algorithm. + +You should only use such a customized workflow if the following conditions apply: +- You know exactly what you are doing :) +- Configuring an existing RLlib Algorithm (e.g. 
PPO) via its AlgorithmConfig +is not sufficient and doesn't allow you to shape the Algorithm into behaving the way +you'd like. Note that for complex, custom evaluation procedures there are many +AlgorithmConfig options one can use (for more details, see: +https://github.com/ray-project/ray/blob/master/rllib/examples/evaluation/custom_evaluation.py). # noqa +- Subclassing an RLlib Algorithm class and overriding the new class' `training_step` +method is not sufficient and doesn't allow you to define the algorithm's execution +logic the way you'd like. See an example here on how to customize the algorithm's +`training_step()` method: +https://github.com/ray-project/ray/blob/master/rllib/examples/algorithm/custom_training_step_on_and_off_policy_combined.py # noqa + + +How to run this script +---------------------- +`python [script file name].py` + + +Results to expect +----------------- +You should see the following output (at the end of the experiment) in your console: + +╭─────────────────────────────────────────────────────────────────────────────────────── +│ Trial name status iter total time (s) ts +├─────────────────────────────────────────────────────────────────────────────────────── +│ my_experiment_CartPole-v1_77083_00000 TERMINATED 10 36.7799 60000 +╰─────────────────────────────────────────────────────────────────────────────────────── +╭───────────────────────────────────────────────────────╮ +│ reward episode_len_mean episodes_this_iter │ +├───────────────────────────────────────────────────────┤ +│ 254.821 254.821 12 │ +╰───────────────────────────────────────────────────────╯ +evaluation episode returns=[500.0, 500.0, 500.0] + +Note that evaluation results (on the CartPole-v1 env) should be close to perfect +(episode return of ~500.0) as we are acting greedily inside the evaluation procedure. 
def _train_phase(algo, num_iterations, phase, step_offset=None):
    """Trains `algo` for `num_iterations`, reporting each result to Tune.

    Args:
        algo: The (already built) Algorithm to call `train()` on.
        num_iterations: How many times to call `algo.train()`.
        phase: Marker written into each result dict under the "phase" key.
        step_offset: If not None, gets added to each result's
            `NUM_ENV_STEPS_SAMPLED_LIFETIME` value before reporting.

    Returns:
        The result dict of the last training iteration.
    """
    results = None
    for _ in range(num_iterations):
        results = algo.train()
        # Add the phase to the result dict.
        results["phase"] = phase
        # Keep time moving forward across phases.
        if step_offset is not None:
            results[NUM_ENV_STEPS_SAMPLED_LIFETIME] += step_offset
        train.report(results)
    return results


def _evaluate_greedily(env, rl_module, num_episodes):
    """Runs a simple env loop for `num_episodes`, always picking the argmax action.

    Args:
        env: The env to step through (used via `reset()` and `step()`).
        rl_module: The RLModule whose `forward_inference()` output provides the
            "action_dist_inputs" to take the argmax over.
        num_episodes: The number of episodes to complete.

    Returns:
        A tuple of (list of episode returns, list of episode lengths).
    """
    episode_returns = []
    episode_lengths = []
    sum_rewards = length = 0
    episodes_done = 0
    obs, infos = env.reset()
    while episodes_done < num_episodes:
        # Call the RLModule's `forward_inference()` method to compute an
        # action.
        rl_module_out = rl_module.forward_inference(
            {
                "obs": torch.from_numpy(np.expand_dims(obs, 0)),  # <- add B=1
            }
        )
        action_logits = rl_module_out["action_dist_inputs"][0]  # <- remove B=1
        action = np.argmax(action_logits.detach().cpu().numpy())  # act greedily

        # Step the env.
        obs, reward, terminated, truncated, info = env.step(action)

        # Accumulate stats and reset the env, if necessary.
        sum_rewards += reward
        length += 1
        if terminated or truncated:
            episodes_done += 1
            episode_returns.append(sum_rewards)
            episode_lengths.append(length)
            sum_rewards = length = 0
            obs, infos = env.reset()

    return episode_returns, episode_lengths


def my_experiment(config: Dict):
    """Custom Tune experiment: train PPO with a high, then a low LR, then evaluate.

    The function pops its two custom keys ("train-iterations" and "eval-episodes")
    from `config`, builds a PPOConfig from the remaining keys, and then:
    1) trains for n iterations with lr=0.001 and saves a checkpoint,
    2) restores that checkpoint into a new algo and trains for another n iterations
       with lr=0.00001,
    3) restores the second checkpoint into an algo without remote EnvRunners and runs
       a manual, greedy evaluation loop on the local EnvRunner's first sub-env.

    All training results and the final (training + evaluation) results are reported
    through `train.report()`.

    Args:
        config: The plain config dict passed in by Tune.

    Raises:
        ValueError: If "train-iterations" is smaller than 1 (there would be no
            training results to report or to combine with the evaluation results).
    """
    # Extract the number of iterations to run from the config.
    train_iterations = config.pop("train-iterations", 2)
    eval_episodes_to_do = config.pop("eval-episodes", 1)
    if train_iterations < 1:
        raise ValueError(
            f"`train-iterations` must be >= 1, but is {train_iterations}!"
        )

    config = (
        PPOConfig()
        .update_from_dict(config)
        .api_stack(enable_rl_module_and_learner=True)
        .environment("CartPole-v1")
    )

    # Train for n iterations with high LR.
    config.training(lr=0.001)
    algo_high_lr = config.build()
    train_results = _train_phase(algo_high_lr, train_iterations, phase=1)
    phase_high_lr_time = train_results[NUM_ENV_STEPS_SAMPLED_LIFETIME]
    checkpoint_training_high_lr = algo_high_lr.save()
    algo_high_lr.stop()

    # Train for n iterations with low LR.
    config.training(lr=0.00001)
    algo_low_lr = config.build()
    # Load state from the high-lr algo into this one.
    algo_low_lr.restore(checkpoint_training_high_lr)
    train_results = _train_phase(
        algo_low_lr,
        train_iterations,
        phase=2,
        step_offset=phase_high_lr_time,
    )
    checkpoint_training_low_lr = algo_low_lr.save()
    algo_low_lr.stop()

    # After training, run a manual evaluation procedure.

    # Set the number of EnvRunners for collecting training data to 0 (local
    # worker only).
    config.env_runners(num_env_runners=0)

    eval_algo = config.build()
    # Load state from the low-lr algo into this one.
    eval_algo.restore(checkpoint_training_low_lr)
    # The algo's local worker (SingleAgentEnvRunner) that holds a
    # gym.vector.Env object and an RLModule for computing actions.
    local_env_runner = eval_algo.env_runner
    # Extract the gymnasium env object from the created algo (its local
    # SingleAgentEnvRunner worker). Note that the env in this single-agent
    # case is a gymnasium vector env and that we get its first sub-env here.
    env = local_env_runner.env.unwrapped.envs[0]

    # The RLModule held by the local worker (SingleAgentEnvRunner).
    rl_module = local_env_runner.module

    # Run a very simple env loop and add up rewards per episode.
    episode_returns, episode_lengths = _evaluate_greedily(
        env, rl_module, eval_episodes_to_do
    )

    # Compile evaluation results.
    eval_results = {
        "eval_returns": episode_returns,
        "eval_episode_lengths": episode_lengths,
    }
    # Combine the most recent training results with the just collected
    # evaluation results.
    results = {**train_results, **eval_results}
    # Report everything.
    train.report(results)


if __name__ == "__main__":
    base_config = PPOConfig().environment("CartPole-v1").env_runners(num_env_runners=0)
    # Convert to a plain dict for Tune. Note that this is usually not needed, you can
    # pass into the below Tune Tuner any instantiated RLlib AlgorithmConfig object.
    # However, for demonstration purposes, we show here how you can add other, arbitrary
    # keys to the plain config dict and then pass these keys to your custom experiment
    # function.
    config_dict = base_config.to_dict()

    # Set a special flag signalling `my_experiment` how many training steps to
    # perform on each: the high learning rate and low learning rate.
    config_dict["train-iterations"] = 5
    # Set a special flag signalling `my_experiment` how many episodes to evaluate for.
    config_dict["eval-episodes"] = 3

    training_function = tune.with_resources(
        my_experiment,
        resources=base_config.algo_class.default_resource_request(base_config),
    )

    tuner = tune.Tuner(
        training_function,
        # Pass in your config dict.
        param_space=config_dict,
    )
    results = tuner.fit()
    best_results = results.get_best_result()

    print(f"evaluation episode returns={best_results.metrics['eval_returns']}")
class MyPrintLogger(Logger):
    """Logs results by simply printing out everything.

    Each printed result line starts with a prefix, which is read from the "prefix"
    key inside the "logger_config" sub-dict of this logger's config.
    """

    def _init(self):
        """Custom init function: Sets up the log-line prefix."""
        print("Initializing ...")
        # Setting up our log-line prefix. `logger_config` may be missing or None
        # (None is the default), in which case we fall back to an empty prefix
        # instead of failing on `None.get(...)`.
        logger_config = self.config.get("logger_config") or {}
        self.prefix = logger_config.get("prefix", "")

    def on_result(self, result: dict):
        """Prints the mean episode return and the policy loss found in `result`."""
        # Define, what should happen on receiving a `result` (dict).
        mean_return = result[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN]
        pi_loss = result[LEARNER_RESULTS][DEFAULT_MODULE_ID]["policy_loss"]
        print(f"{self.prefix} " f"Avg-return: {mean_return} " f"pi-loss: {pi_loss}")

    def close(self):
        """Announces that this logger is being closed."""
        # Releases all resources used by this logger.
        print("Closing")

    def flush(self):
        """Announces a flush (and flushes stdout)."""
        # Flushing all possible disk writes to permanent storage.
        print("Flushing", flush=True)


if __name__ == "__main__":
    config = (
        PPOConfig().environment("CartPole-v1")
        # Setting up a custom logger config.
        # ----------------------------------
        # The following are different examples of custom logging setups:
        # 1) Disable logging entirely.
        # "logger_config": {
        #     # Use the tune.logger.NoopLogger class for no logging.
        #     "type": "ray.tune.logger.NoopLogger",
        # },
        # 2) Use tune's JsonLogger only.
        # Alternatively, use `CSVLogger` or `TBXLogger` instead of
        # `JsonLogger` in the "type" key below.
        # "logger_config": {
        #     "type": "ray.tune.logger.JsonLogger",
        #     # Optional: Custom logdir (do not define this here
        #     # for using ~/ray_results/...).
        #     "logdir": "/tmp",
        # },
        # 3) Custom logger (see `MyPrintLogger` class above).
        .debugging(
            logger_config={
                # Provide the class directly or via fully qualified class
                # path.
                "type": MyPrintLogger,
                # `config` keys:
                "prefix": "ABC",
                # Optional: Custom logdir (do not define this here
                # for using ~/ray_results/...).
                # "logdir": "/somewhere/on/my/file/system/"
            }
        )
    )

    stop = {f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": 200.0}

    # Run the actual experiment (using Tune).
    results = tune.Tuner(
        config.algo_class,
        param_space=config,
        run_config=air.RunConfig(
            stop=stop,
            verbose=2,
            # Plugin our own logger.
            callbacks=[
                LegacyLoggerCallback([MyPrintLogger]),
            ],
        ),
    ).fit()
+To see an example on how to write your own Logger, see: +https://github.com/ray-project/ray/tree/master/rllib/examples/ray_tune/custom_logger.py + + +How to run this script +---------------------- +`python [script file name].py + + +Results to expect +----------------- +You should see something similar to the following in your console output: + ++---------------------+------------+-----------------+--------+------------------+ +| Trial name | status | loc | iter | total time (s) | +|---------------------+------------+-----------------+--------+------------------+ +| PPO_env_bb503_00000 | TERMINATED | 127.0.0.1:26303 | 5 | 30.3823 | ++---------------------+------------+-----------------+--------+------------------+ ++-------+-------------------+------------------+------------------+------------------+ +| ts | combined return | return policy1 | return policy2 | return policy3 | +|-------+-------------------+------------------+------------------+------------------| +| 20000 | 258.7 | 103.4 | 88.84 | 87.86 | ++-------+-------------------+------------------+------------------+------------------+ + +""" +from ray import air, tune +from ray.air.constants import TRAINING_ITERATION +from ray.rllib.algorithms.ppo import PPOConfig +from ray.rllib.examples.envs.classes.multi_agent import MultiAgentCartPole +from ray.rllib.utils.metrics import ( + ENV_RUNNER_RESULTS, + EPISODE_RETURN_MEAN, + NUM_ENV_STEPS_SAMPLED_LIFETIME, +) + + +my_multi_agent_progress_reporter = tune.CLIReporter( + # In the following dict, the keys are the (possibly nested) keys that can be found + # in RLlib's (PPO's) result dict, produced at every training iteration, and the + # values are the column names you would like to see in your console reports. + # Note that for nested result dict keys, you need to use slashes "/" to define the + # exact path. 
+ metric_columns={ + **{ + TRAINING_ITERATION: "iter", + "time_total_s": "total time (s)", + NUM_ENV_STEPS_SAMPLED_LIFETIME: "ts", + # RLlib always sums up all agents' rewards and reports it under: + # result_dict[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN]. + f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": "combined return", + }, + # Because RLlib sums up all returns of all agents, we would like to also + # see the individual agents' returns. We can find these under the result dict's + # 'env_runners/module_episode_returns_mean/' key (then the policy ID): + **{ + f"{ENV_RUNNER_RESULTS}/module_episode_returns_mean/{pid}": f"return {pid}" + for pid in ["policy1", "policy2", "policy3"] + }, + }, +) + + +if __name__ == "__main__": + # Force Tuner to use old progress output as the new one silently ignores our custom + # `CLIReporter`. + # TODO (sven): Find out why we require this hack. + import os + + os.environ["RAY_AIR_NEW_OUTPUT"] = "0" + + # Register our multi-agent env with a fixed number of agents. + # The agents' IDs are 0, 1, and 2. + tune.register_env("env", lambda _: MultiAgentCartPole({"num_agents": 3})) + + config = ( + PPOConfig() + .environment("env") + .multi_agent( + # Define 3 policies. Note that in our simple setup, they are all configured + # the exact same way (with a PPO default RLModule/NN). + policies={"policy1", "policy2", "policy3"}, + # Map agent 0 to "policy1", etc.. + policy_mapping_fn=lambda agent_id, episode: f"policy{agent_id + 1}", + ) + ) + + stop = {f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": 200.0} + + # Run the actual experiment (using Tune). + results = tune.Tuner( + config.algo_class, + param_space=config, + run_config=air.RunConfig( + stop=stop, + verbose=2, + # Plugin our own progress reporter. 
+ progress_reporter=my_multi_agent_progress_reporter, + ), + ).fit() diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3f4c86bd3ff501c4175cbf3181af989acf82b57d Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/__pycache__/action_masking_rl_module.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/__pycache__/action_masking_rl_module.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..262c9dc89928211e5258534dd5359b96e66e9a21 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/__pycache__/action_masking_rl_module.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/__pycache__/custom_cnn_rl_module.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/__pycache__/custom_cnn_rl_module.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c75e64039c0d1c542ace1d404869f3ecc87bcfff Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/__pycache__/custom_cnn_rl_module.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/__pycache__/custom_lstm_rl_module.cpython-311.pyc 
b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/__pycache__/custom_lstm_rl_module.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..eca8a43ed68297af92126da58564fdc23e539ea0 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/__pycache__/custom_lstm_rl_module.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/__pycache__/migrate_modelv2_to_new_api_stack_by_config.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/__pycache__/migrate_modelv2_to_new_api_stack_by_config.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cb426f898206288dae05fca65ece00a9e6f566ac Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/__pycache__/migrate_modelv2_to_new_api_stack_by_config.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/__pycache__/migrate_modelv2_to_new_api_stack_by_policy_checkpoint.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/__pycache__/migrate_modelv2_to_new_api_stack_by_policy_checkpoint.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3a78b325863feab2de3b97a6959bd1c9e314aae2 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/__pycache__/migrate_modelv2_to_new_api_stack_by_policy_checkpoint.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/__pycache__/pretraining_single_agent_training_multi_agent.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/__pycache__/pretraining_single_agent_training_multi_agent.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..616459ffdf8f000876aea495a2339f77b586d415 Binary files /dev/null and 
b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/__pycache__/pretraining_single_agent_training_multi_agent.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/action_masking_rl_module.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/action_masking_rl_module.py new file mode 100644 index 0000000000000000000000000000000000000000..fd9984b9aceba10be7660f35cf1107af81a261d8 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/action_masking_rl_module.py @@ -0,0 +1,127 @@ +"""An example script showing how to define and load an `RLModule` that applies +action masking + +This example: + - Defines an `RLModule` that applies action masking. + - It does so by using a `gymnasium.spaces.dict.Dict` observation space + with two keys, namely `"observations"`, holding the original observations + and `"action_mask"` defining the action mask for the current environment + state. Note, by this definition you can wrap any `gymnasium` environment + and use it for this module. + - Furthermore, it derives its `TorchRLModule` from the `PPOTorchRLModule` and + can therefore be easily plugged into our `PPO` algorithm. + - It overrides the `forward` methods of the `PPOTorchRLModule` to apply the + action masking and it overrides the `_compute_values` method for GAE + computation to extract the `"observations"` from the batch `Columns.OBS` + key. + - It uses the custom `ActionMaskEnv` that defines for each step a new action + mask that defines actions that are allowed (1.0) and others that are not + (0.0). + - It runs 10 iterations with PPO and finishes. + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack --num-env-runners 2` + +Control the number of `EnvRunner`s with the `--num-env-runners` flag. This +will increase the sampling speed. 
+ +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- +You should expect a mean episode reward of around 0.35. The environment is a random +environment paying out random rewards - so the agent cannot learn, but it can obey the +action mask and should do so (no `AssertionError` should happen). +After 40,000 environment steps and 10 training iterations the run should stop +successfully: + ++-------------------------------+------------+----------------------+--------+ +| Trial name | status | loc | iter | +| | | | | +|-------------------------------+------------+----------------------+--------+ +| PPO_ActionMaskEnv_dedc8_00000 | TERMINATED | 192.168.1.178:103298 | 10 | ++-------------------------------+------------+----------------------+--------+ ++------------------+------------------------+------------------------+ +| total time (s) | num_env_steps_sample | num_env_steps_traine | +| | d_lifetime | d_lifetime | ++------------------+------------------------+------------------------+ +| 57.9207 | 40000 | 40000 | ++------------------+------------------------+------------------------+ +*------------------------+ +| num_episodes_lifetim | +| e | ++------------------------| +| 3898 | ++------------------------+ +""" +from gymnasium.spaces import Box, Discrete + +from ray.rllib.algorithms.ppo import PPOConfig +from ray.rllib.core.rl_module.rl_module import RLModuleSpec +from ray.rllib.examples.envs.classes.action_mask_env import ActionMaskEnv +from ray.rllib.examples.rl_modules.classes.action_masking_rlm import ( + ActionMaskingTorchRLModule, +) + +from 
ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) + + +parser = add_rllib_example_script_args( + default_iters=10, + default_timesteps=100000, + default_reward=150.0, +) + +if __name__ == "__main__": + args = parser.parse_args() + + if args.algo != "PPO": + raise ValueError("This example only supports PPO. Please use --algo=PPO.") + + base_config = ( + PPOConfig() + .environment( + env=ActionMaskEnv, + env_config={ + "action_space": Discrete(100), + # This defines the 'original' observation space that is used in the + # `RLModule`. The environment will wrap this space into a + # `gym.spaces.Dict` together with an 'action_mask' that signals the + # `RLModule` to adapt the action distribution inputs for the underlying + # `DefaultPPORLModule`. + "observation_space": Box(-1.0, 1.0, (5,)), + }, + ) + .rl_module( + # We need to explicitly specify here RLModule to use and + # the catalog needed to build it. + rl_module_spec=RLModuleSpec( + module_class=ActionMaskingTorchRLModule, + model_config={ + "head_fcnet_hiddens": [64, 64], + "head_fcnet_activation": "relu", + }, + ), + ) + .evaluation( + evaluation_num_env_runners=1, + evaluation_interval=1, + # Run evaluation parallel to training to speed up the example. + evaluation_parallel_to_training=True, + ) + ) + + # Run the example (with Tune). 
+ run_rllib_example_script_experiment(base_config, args) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b6525851c1ac8cecc0a4491a2bf04f21969ac921 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__init__.py @@ -0,0 +1,10 @@ +from ray.rllib.examples.rl_modules.classes.rock_paper_scissors_heuristic_rlm import ( + AlwaysSameHeuristicRLM, + BeatLastHeuristicRLM, +) + + +__all__ = [ + "AlwaysSameHeuristicRLM", + "BeatLastHeuristicRLM", +] diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..24e992114ca3c95ffed5418c7d40534ac88aebc0 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__pycache__/action_masking_rlm.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__pycache__/action_masking_rlm.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7aaf961ca0ed313a9807bcc4a8f9a814e5c8e1d4 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__pycache__/action_masking_rlm.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__pycache__/autoregressive_actions_rlm.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__pycache__/autoregressive_actions_rlm.cpython-311.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..bc0115d45319cbbf4b49479146c42f4455fae9d8 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__pycache__/autoregressive_actions_rlm.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__pycache__/intrinsic_curiosity_model_rlm.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__pycache__/intrinsic_curiosity_model_rlm.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2e095c8d0ffe1d40e967ffeb7dad1c5885f81181 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__pycache__/intrinsic_curiosity_model_rlm.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__pycache__/lstm_containing_rlm.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__pycache__/lstm_containing_rlm.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..94bd74a1bff1d3ddeb4b549dec3ba49861a4af6b Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__pycache__/lstm_containing_rlm.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__pycache__/mobilenet_rlm.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__pycache__/mobilenet_rlm.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c9442c5bff6193d6689decb0231acd2695429e19 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__pycache__/mobilenet_rlm.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__pycache__/modelv2_to_rlm.cpython-311.pyc 
b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__pycache__/modelv2_to_rlm.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cf4d4913065792bc4b37cf20161a96a72f768a12 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__pycache__/modelv2_to_rlm.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__pycache__/random_rlm.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__pycache__/random_rlm.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..573676b1d7f312d5c1f7ac0d4ce35665bcb18f8f Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__pycache__/random_rlm.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__pycache__/rock_paper_scissors_heuristic_rlm.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__pycache__/rock_paper_scissors_heuristic_rlm.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..917c43b3ccc4278de0a66dc30eea53d4659143f7 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__pycache__/rock_paper_scissors_heuristic_rlm.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__pycache__/tiny_atari_cnn_rlm.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__pycache__/tiny_atari_cnn_rlm.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5ac175eea839c3b9f3de0e840702b2f88e15210e Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__pycache__/tiny_atari_cnn_rlm.cpython-311.pyc differ diff --git 
a/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__pycache__/vpg_torch_rlm.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__pycache__/vpg_torch_rlm.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3edcfe7ad1134c998bb59d845a8276d3b5c9dced Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__pycache__/vpg_torch_rlm.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__pycache__/vpg_using_shared_encoder_rlm.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__pycache__/vpg_using_shared_encoder_rlm.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e6bc98889ec1e95adbf42fa479eb836086366749 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/__pycache__/vpg_using_shared_encoder_rlm.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/action_masking_rlm.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/action_masking_rlm.py new file mode 100644 index 0000000000000000000000000000000000000000..aee91f203c8790f7688094f79bec5eeeb4fadf33 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/action_masking_rlm.py @@ -0,0 +1,210 @@ +import gymnasium as gym +from typing import Dict, Optional, Tuple, Union + +from ray.rllib.algorithms.ppo.torch.ppo_torch_rl_module import PPOTorchRLModule +from ray.rllib.core.columns import Columns +from ray.rllib.core.rl_module.apis.value_function_api import ValueFunctionAPI +from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig +from ray.rllib.core.rl_module.rl_module import RLModule +from ray.rllib.utils.annotations import override +from ray.rllib.utils.framework import 
try_import_torch +from ray.rllib.utils.torch_utils import FLOAT_MIN +from ray.rllib.utils.typing import TensorType + +torch, nn = try_import_torch() + + +class ActionMaskingRLModule(RLModule): + """An RLModule that implements an action masking for safe RL. + + This RLModule implements action masking to avoid unsafe/unwanted actions + dependent on the current state (observations). It does so by using an + environment generated action mask defining which actions are allowed and + which should be avoided. The action mask is extracted from the + environment's `gymnasium.spaces.dict.Dict` observation and applied after + the module's `forward`-pass to the action logits. The resulting action + logits prevent unsafe/unwanted actions to be sampled from the corresponding + action distribution. + + Note, this RLModule is implemented for the `PPO` algorithm only. It is not + guaranteed to work with other algorithms. Furthermore, not that for this + module to work it requires an environment with a `gymnasium.spaces.dict.Dict` + observation space containing tow key, `"action_mask"` and `"observations"`. + """ + + @override(RLModule) + def __init__( + self, + *, + observation_space: Optional[gym.Space] = None, + action_space: Optional[gym.Space] = None, + inference_only: Optional[bool] = None, + learner_only: bool = False, + model_config: Optional[Union[dict, DefaultModelConfig]] = None, + catalog_class=None, + **kwargs, + ): + # If observation space is not of type `Dict` raise an error. 
+ if not isinstance(observation_space, gym.spaces.dict.Dict): + raise ValueError( + "This RLModule requires the environment to provide a " + "`gym.spaces.Dict` observation space of the form: \n" + " {'action_mask': Box(0.0, 1.0, shape=(self.action_space.n,))," + " 'observation_space': self.observation_space}" + ) + + # While the environment holds an observation space that contains, both, + # the action mask and the original observation space, the 'RLModule' + # receives only the `"observation"` element of the space, but not the + # action mask. + self.observation_space_with_mask = observation_space + self.observation_space = observation_space["observations"] + + # Keeps track if observation specs have been checked already. + self._checked_observations = False + + # The DefaultPPORLModule, in its constructor will build networks for the + # original observation space (i.e. without the action mask). + super().__init__( + observation_space=self.observation_space, + action_space=action_space, + inference_only=inference_only, + learner_only=learner_only, + model_config=model_config, + catalog_class=catalog_class, + **kwargs, + ) + + +class ActionMaskingTorchRLModule(ActionMaskingRLModule, PPOTorchRLModule): + @override(PPOTorchRLModule) + def setup(self): + super().setup() + # We need to reset here the observation space such that the + # super`s (`PPOTorchRLModule`) observation space is the + # original space (i.e. without the action mask) and `self`'s + # observation space contains the action mask. + self.observation_space = self.observation_space_with_mask + + @override(PPOTorchRLModule) + def _forward_inference( + self, batch: Dict[str, TensorType], **kwargs + ) -> Dict[str, TensorType]: + # Preprocess the original batch to extract the action mask. + action_mask, batch = self._preprocess_batch(batch) + # Run the forward pass. + outs = super()._forward_inference(batch, **kwargs) + # Mask the action logits and return. 
+ return self._mask_action_logits(outs, action_mask) + + @override(PPOTorchRLModule) + def _forward_exploration( + self, batch: Dict[str, TensorType], **kwargs + ) -> Dict[str, TensorType]: + # Preprocess the original batch to extract the action mask. + action_mask, batch = self._preprocess_batch(batch) + # Run the forward pass. + outs = super()._forward_exploration(batch, **kwargs) + # Mask the action logits and return. + return self._mask_action_logits(outs, action_mask) + + @override(PPOTorchRLModule) + def _forward_train( + self, batch: Dict[str, TensorType], **kwargs + ) -> Dict[str, TensorType]: + # Run the forward pass. + outs = super()._forward_train(batch, **kwargs) + # Mask the action logits and return. + return self._mask_action_logits(outs, batch["action_mask"]) + + @override(ValueFunctionAPI) + def compute_values(self, batch: Dict[str, TensorType], embeddings=None): + # Check, if the observations are still in `dict` form. + if isinstance(batch[Columns.OBS], dict): + # Preprocess the batch to extract the `observations` to `Columns.OBS`. + action_mask, batch = self._preprocess_batch(batch) + # NOTE: Because we manipulate the batch we need to add the `action_mask` + # to the batch to access them in `_forward_train`. + batch["action_mask"] = action_mask + # Call the super's method to compute values for GAE. + return super().compute_values(batch, embeddings) + + def _preprocess_batch( + self, batch: Dict[str, TensorType], **kwargs + ) -> Tuple[TensorType, Dict[str, TensorType]]: + """Extracts observations and action mask from the batch + + Args: + batch: A dictionary containing tensors (at least `Columns.OBS`) + + Returns: + A tuple with the action mask tensor and the modified batch containing + the original observations. + """ + # Check observation specs for action mask and observation keys. + self._check_batch(batch) + + # Extract the available actions tensor from the observation. 
+ action_mask = batch[Columns.OBS].pop("action_mask") + + # Modify the batch for the `DefaultPPORLModule`'s `forward` method, i.e. + # pass only `"obs"` into the `forward` method. + batch[Columns.OBS] = batch[Columns.OBS].pop("observations") + + # Return the extracted action mask and the modified batch. + return action_mask, batch + + def _mask_action_logits( + self, batch: Dict[str, TensorType], action_mask: TensorType + ) -> Dict[str, TensorType]: + """Masks the action logits for the output of `forward` methods + + Args: + batch: A dictionary containing tensors (at least action logits). + action_mask: A tensor containing the action mask for the current + observations. + + Returns: + A modified batch with masked action logits for the action distribution + inputs. + """ + # Convert action mask into an `[0.0][-inf]`-type mask. + inf_mask = torch.clamp(torch.log(action_mask), min=FLOAT_MIN) + + # Mask the logits. + batch[Columns.ACTION_DIST_INPUTS] += inf_mask + + # Return the batch with the masked action logits. + return batch + + def _check_batch(self, batch: Dict[str, TensorType]) -> Optional[ValueError]: + """Assert that the batch includes action mask and observations. + + Args: + batch: A dicitonary containing tensors (at least `Columns.OBS`) to be + checked. + + Raises: + `ValueError` if the column `Columns.OBS` does not contain observations + and action mask. + """ + if not self._checked_observations: + if "action_mask" not in batch[Columns.OBS]: + raise ValueError( + "No action mask found in observation. This `RLModule` requires " + "the environment to provide observations that include an " + "action mask (i.e. an observation space of the Dict space " + "type that looks as follows: \n" + "{'action_mask': Box(0.0, 1.0, shape=(self.action_space.n,))," + "'observations': self.observation_space}" + ) + if "observations" not in batch[Columns.OBS]: + raise ValueError( + "No observations found in observation. 
This 'RLModule` requires " + "the environment to provide observations that include the original " + "observations under a key `'observations'` in a dict (i.e. an " + "observation space of the Dict space type that looks as follows: \n" + "{'action_mask': Box(0.0, 1.0, shape=(self.action_space.n,))," + "'observations': }" + ) + self._checked_observations = True diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/autoregressive_actions_rlm.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/autoregressive_actions_rlm.py new file mode 100644 index 0000000000000000000000000000000000000000..e65783ae4a86255e03e693ada33efbb01ea70488 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/autoregressive_actions_rlm.py @@ -0,0 +1,135 @@ +from typing import Dict + +import gymnasium as gym + +from ray.rllib.core import Columns +from ray.rllib.core.rl_module.apis.value_function_api import ValueFunctionAPI +from ray.rllib.core.rl_module.rl_module import RLModule +from ray.rllib.core.rl_module.torch.torch_rl_module import TorchRLModule +from ray.rllib.models.torch.torch_distributions import ( + TorchCategorical, + TorchDiagGaussian, + TorchMultiDistribution, +) +from ray.rllib.utils.annotations import override +from ray.rllib.utils.framework import try_import_torch +from ray.rllib.utils.torch_utils import one_hot +from ray.rllib.utils.typing import TensorType + +torch, nn = try_import_torch() + + +class AutoregressiveActionsRLM(TorchRLModule, ValueFunctionAPI): + """An RLModule that uses an autoregressive action distribution. + + Actions are sampled in two steps. The first (prior) action component is sampled from + a categorical distribution. Then, the second (posterior) action component is sampled + from a posterior distribution that depends on the first action component and the + other input data (observations). 
+ + Note, this RLModule works in combination with any algorithm, whose Learners require + the `ValueFunctionAPI`. + """ + + @override(RLModule) + def setup(self): + super().setup() + + # Assert the action space is correct. + assert isinstance(self.action_space, gym.spaces.Tuple) + assert isinstance(self.action_space[0], gym.spaces.Discrete) + assert self.action_space[0].n == 3 + assert isinstance(self.action_space[1], gym.spaces.Box) + + self._prior_net = nn.Sequential( + nn.Linear( + in_features=self.observation_space.shape[0], + out_features=256, + ), + nn.Tanh(), + nn.Linear(in_features=256, out_features=self.action_space[0].n), + ) + + self._posterior_net = nn.Sequential( + nn.Linear( + in_features=self.observation_space.shape[0] + self.action_space[0].n, + out_features=256, + ), + nn.Tanh(), + nn.Linear(in_features=256, out_features=self.action_space[1].shape[0] * 2), + ) + + # Build the value function head. + self._value_net = nn.Sequential( + nn.Linear( + in_features=self.observation_space.shape[0], + out_features=256, + ), + nn.Tanh(), + nn.Linear(in_features=256, out_features=1), + ) + + @override(TorchRLModule) + def _forward_inference(self, batch: Dict[str, TensorType]) -> Dict[str, TensorType]: + return self._pi(batch[Columns.OBS], inference=True) + + @override(TorchRLModule) + def _forward_exploration( + self, batch: Dict[str, TensorType], **kwargs + ) -> Dict[str, TensorType]: + return self._pi(batch[Columns.OBS], inference=False) + + @override(TorchRLModule) + def _forward_train(self, batch: Dict[str, TensorType]) -> Dict[str, TensorType]: + return self._forward_exploration(batch) + + @override(ValueFunctionAPI) + def compute_values(self, batch: Dict[str, TensorType], embeddings=None): + # Value function forward pass. + vf_out = self._value_net(batch[Columns.OBS]) + # Squeeze out last dimension (single node value head). + return vf_out.squeeze(-1) + + def _pi(self, obs, inference: bool): + # Prior forward pass. 
+ prior_out = self._prior_net(obs) + dist_a1 = TorchCategorical.from_logits(prior_out) + + # If in inference mode, we need to set the distribution to be deterministic. + if inference: + dist_a1 = dist_a1.to_deterministic() + # Sample a1. + a1 = dist_a1.sample() + + # Posterior forward pass. + posterior_batch = torch.cat( + [obs, one_hot(a1, self.action_space[0])], + dim=-1, + ) + posterior_out = self._posterior_net(posterior_batch) + dist_a2 = TorchDiagGaussian.from_logits(posterior_out) + if inference: + dist_a2 = dist_a2.to_deterministic() + + a2 = dist_a2.sample() + + actions = (a1, a2) + + # We need the log-probabilities for the loss. + outputs = { + Columns.ACTION_LOGP: ( + TorchMultiDistribution((dist_a1, dist_a2)).logp(actions) + ), + Columns.ACTION_DIST_INPUTS: torch.cat([prior_out, posterior_out], dim=-1), + # Concatenate the prior and posterior actions and log probabilities. + Columns.ACTIONS: actions, + } + + return outputs + + @override(TorchRLModule) + def get_inference_action_dist_cls(self): + return TorchMultiDistribution.get_partial_dist_cls( + child_distribution_cls_struct=(TorchCategorical, TorchDiagGaussian), + input_lens=(3, 2), + ) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/intrinsic_curiosity_model_rlm.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/intrinsic_curiosity_model_rlm.py new file mode 100644 index 0000000000000000000000000000000000000000..efa3fcdb1d6ba36c07039c466489acce5c731cea --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/intrinsic_curiosity_model_rlm.py @@ -0,0 +1,240 @@ +from typing import Any, Dict, TYPE_CHECKING + +import tree # pip install dm_tree + +from ray.rllib.core.columns import Columns +from ray.rllib.core.rl_module.apis import SelfSupervisedLossAPI +from ray.rllib.core.rl_module.torch import TorchRLModule +from ray.rllib.models.utils import get_activation_fn +from ray.rllib.utils.annotations import 
override +from ray.rllib.utils.framework import try_import_torch +from ray.rllib.utils.torch_utils import one_hot +from ray.rllib.utils.typing import ModuleID + +if TYPE_CHECKING: + from ray.rllib.algorithms.algorithm_config import AlgorithmConfig + from ray.rllib.core.learner.torch.torch_learner import TorchLearner + +torch, nn = try_import_torch() + + +class IntrinsicCuriosityModel(TorchRLModule, SelfSupervisedLossAPI): + """An intrinsic curiosity model (ICM) as TorchRLModule for better exploration. + + For more details, see: + [1] Curiosity-driven Exploration by Self-supervised Prediction + Pathak, Agrawal, Efros, and Darrell - UC Berkeley - ICML 2017. + https://arxiv.org/pdf/1705.05363.pdf + + Learns a simplified model of the environment based on three networks: + 1) Embedding observations into latent space ("feature" network). + 2) Predicting the action, given two consecutive embedded observations + ("inverse" network). + 3) Predicting the next embedded obs, given an obs and action + ("forward" network). + + The less the agent is able to predict the actually observed next feature + vector, given obs and action (through the forwards network), the larger the + "intrinsic reward", which will be added to the extrinsic reward. + Therefore, if a state transition was unexpected, the agent becomes + "curious" and will further explore this transition leading to better + exploration in sparse rewards environments. + + .. testcode:: + + import numpy as np + import gymnasium as gym + import torch + + from ray.rllib.core import Columns + from ray.rllib.examples.rl_modules.classes.intrinsic_curiosity_model_rlm import ( # noqa + IntrinsicCuriosityModel + ) + + B = 10 # batch size + O = 4 # obs (1D) dim + A = 2 # num actions + f = 25 # feature dim + + # Construct the RLModule. + icm_net = IntrinsicCuriosityModel( + observation_space=gym.spaces.Box(-1.0, 1.0, (O,), np.float32), + action_space=gym.spaces.Discrete(A), + ) + + # Create some dummy input. 
+ obs = torch.from_numpy( + np.random.random_sample(size=(B, O)).astype(np.float32) + ) + next_obs = torch.from_numpy( + np.random.random_sample(size=(B, O)).astype(np.float32) + ) + actions = torch.from_numpy( + np.random.random_integers(0, A - 1, size=(B,)) + ) + input_dict = { + Columns.OBS: obs, + Columns.NEXT_OBS: next_obs, + Columns.ACTIONS: actions, + } + + # Call `forward_train()` to get phi (feature vector from obs), next-phi + # (feature vector from next obs), and the intrinsic rewards (individual, per + # batch-item forward loss values). + print(icm_net.forward_train(input_dict)) + + # Print out the number of parameters. + num_all_params = sum(int(np.prod(p.size())) for p in icm_net.parameters()) + print(f"num params = {num_all_params}") + """ + + @override(TorchRLModule) + def setup(self): + # Get the ICM achitecture settings from the `model_config` attribute: + cfg = self.model_config + + feature_dim = cfg.get("feature_dim", 288) + + # Build the feature model (encoder of observations to feature space). + layers = [] + dense_layers = cfg.get("feature_net_hiddens", (256, 256)) + # `in_size` is the observation space (assume a simple Box(1D)). + in_size = self.observation_space.shape[0] + for out_size in dense_layers: + layers.append(nn.Linear(in_size, out_size)) + if cfg.get("feature_net_activation") not in [None, "linear"]: + layers.append( + get_activation_fn(cfg["feature_net_activation"], "torch")() + ) + in_size = out_size + # Last feature layer of n nodes (feature dimension). + layers.append(nn.Linear(in_size, feature_dim)) + self._feature_net = nn.Sequential(*layers) + + # Build the inverse model (predicting the action between two observations). + layers = [] + dense_layers = cfg.get("inverse_net_hiddens", (256,)) + # `in_size` is 2x the feature dim. 
+ in_size = feature_dim * 2 + for out_size in dense_layers: + layers.append(nn.Linear(in_size, out_size)) + if cfg.get("inverse_net_activation") not in [None, "linear"]: + layers.append( + get_activation_fn(cfg["inverse_net_activation"], "torch")() + ) + in_size = out_size + # Last feature layer of n nodes (action space). + layers.append(nn.Linear(in_size, self.action_space.n)) + self._inverse_net = nn.Sequential(*layers) + + # Build the forward model (predicting the next observation from current one and + # action). + layers = [] + dense_layers = cfg.get("forward_net_hiddens", (256,)) + # `in_size` is the feature dim + action space (one-hot). + in_size = feature_dim + self.action_space.n + for out_size in dense_layers: + layers.append(nn.Linear(in_size, out_size)) + if cfg.get("forward_net_activation") not in [None, "linear"]: + layers.append( + get_activation_fn(cfg["forward_net_activation"], "torch")() + ) + in_size = out_size + # Last feature layer of n nodes (feature dimension). + layers.append(nn.Linear(in_size, feature_dim)) + self._forward_net = nn.Sequential(*layers) + + @override(TorchRLModule) + def _forward_train(self, batch, **kwargs): + # Push both observations through feature net to get feature vectors (phis). + # We cat/batch them here for efficiency reasons (save one forward pass). + obs = tree.map_structure( + lambda obs, next_obs: torch.cat([obs, next_obs], dim=0), + batch[Columns.OBS], + batch[Columns.NEXT_OBS], + ) + phis = self._feature_net(obs) + # Split again to yield 2 individual phi tensors. + phi, next_phi = torch.chunk(phis, 2) + + # Predict next feature vector (next_phi) with forward model (using obs and + # actions). + predicted_next_phi = self._forward_net( + torch.cat( + [ + phi, + one_hot(batch[Columns.ACTIONS].long(), self.action_space).float(), + ], + dim=-1, + ) + ) + + # Forward loss term: Predicted phi - given phi and action - vs actually observed + # phi (square-root of L2 norm). 
Note that this is the intrinsic reward that + # will be used and the mean of this is the forward net loss. + forward_l2_norm_sqrt = 0.5 * torch.sum( + torch.pow(predicted_next_phi - next_phi, 2.0), dim=-1 + ) + + output = { + Columns.INTRINSIC_REWARDS: forward_l2_norm_sqrt, + # Computed feature vectors (used to compute the losses later). + "phi": phi, + "next_phi": next_phi, + } + + return output + + @override(SelfSupervisedLossAPI) + def compute_self_supervised_loss( + self, + *, + learner: "TorchLearner", + module_id: ModuleID, + config: "AlgorithmConfig", + batch: Dict[str, Any], + fwd_out: Dict[str, Any], + ) -> Dict[str, Any]: + module = learner.module[module_id].unwrapped() + + # Forward net loss. + forward_loss = torch.mean(fwd_out[Columns.INTRINSIC_REWARDS]) + + # Inverse loss term (predicted action that led from phi to phi' vs + # actual action taken). + dist_inputs = module._inverse_net( + torch.cat([fwd_out["phi"], fwd_out["next_phi"]], dim=-1) + ) + action_dist = module.get_train_action_dist_cls().from_logits(dist_inputs) + + # Neg log(p); p=probability of observed action given the inverse-NN + # predicted action distribution. + inverse_loss = -action_dist.logp(batch[Columns.ACTIONS]) + inverse_loss = torch.mean(inverse_loss) + + # Calculate the ICM loss. + total_loss = ( + config.learner_config_dict["forward_loss_weight"] * forward_loss + + (1.0 - config.learner_config_dict["forward_loss_weight"]) * inverse_loss + ) + + learner.metrics.log_dict( + { + "mean_intrinsic_rewards": forward_loss, + "forward_loss": forward_loss, + "inverse_loss": inverse_loss, + }, + key=module_id, + window=1, + ) + + return total_loss + + # Inference and exploration not supported (this is a world-model that should only + # be used for training). + @override(TorchRLModule) + def _forward(self, batch, **kwargs): + raise NotImplementedError( + "`IntrinsicCuriosityModel` should only be used for training! " + "Only calls to `forward_train()` supported." 
+ ) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/lstm_containing_rlm.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/lstm_containing_rlm.py new file mode 100644 index 0000000000000000000000000000000000000000..0e05bd2c4c7f2aaf2e02eb4df834a85f2ec661f1 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/lstm_containing_rlm.py @@ -0,0 +1,152 @@ +from typing import Any, Dict, Optional + +import numpy as np + +from ray.rllib.core.columns import Columns +from ray.rllib.core.rl_module.apis.value_function_api import ValueFunctionAPI +from ray.rllib.core.rl_module.torch import TorchRLModule +from ray.rllib.utils.annotations import override +from ray.rllib.utils.framework import try_import_torch +from ray.rllib.utils.typing import TensorType + +torch, nn = try_import_torch() + + +class LSTMContainingRLModule(TorchRLModule, ValueFunctionAPI): + """An example TorchRLModule that contains an LSTM layer. + + .. testcode:: + + import numpy as np + import gymnasium as gym + + B = 10 # batch size + T = 5 # seq len + e = 25 # embedding dim + CELL = 32 # LSTM cell size + + # Construct the RLModule. + my_net = LSTMContainingRLModule( + observation_space=gym.spaces.Box(-1.0, 1.0, (e,), np.float32), + action_space=gym.spaces.Discrete(4), + model_config={"lstm_cell_size": CELL} + ) + + # Create some dummy input. + obs = torch.from_numpy( + np.random.random_sample(size=(B, T, e) + ).astype(np.float32)) + state_in = my_net.get_initial_state() + # Repeat state_in across batch. + state_in = tree.map_structure( + lambda s: torch.from_numpy(s).unsqueeze(0).repeat(B, 1), state_in + ) + input_dict = { + Columns.OBS: obs, + Columns.STATE_IN: state_in, + } + + # Run through all 3 forward passes. + print(my_net.forward_inference(input_dict)) + print(my_net.forward_exploration(input_dict)) + print(my_net.forward_train(input_dict)) + + # Print out the number of parameters. 
+ num_all_params = sum(int(np.prod(p.size())) for p in my_net.parameters()) + print(f"num params = {num_all_params}") + """ + + @override(TorchRLModule) + def setup(self): + """Use this method to create all the model components that you require. + + Feel free to access the following useful properties in this class: + - `self.model_config`: The config dict for this RLModule class, + which should contain flxeible settings, for example: {"hiddens": [256, 256]}. + - `self.observation|action_space`: The observation and action space that + this RLModule is subject to. Note that the observation space might not be the + exact space from your env, but that it might have already gone through + preprocessing through a connector pipeline (for example, flattening, + frame-stacking, mean/std-filtering, etc..). + """ + # Assume a simple Box(1D) tensor as input shape. + in_size = self.observation_space.shape[0] + + # Get the LSTM cell size from the `model_config` attribute: + self._lstm_cell_size = self.model_config.get("lstm_cell_size", 256) + self._lstm = nn.LSTM(in_size, self._lstm_cell_size, batch_first=True) + in_size = self._lstm_cell_size + + # Build a sequential stack. + layers = [] + # Get the dense layer pre-stack configuration from the same config dict. + dense_layers = self.model_config.get("dense_layers", [128, 128]) + for out_size in dense_layers: + # Dense layer. + layers.append(nn.Linear(in_size, out_size)) + # ReLU activation. + layers.append(nn.ReLU()) + in_size = out_size + + self._fc_net = nn.Sequential(*layers) + + # Logits layer (no bias, no activation). + self._pi_head = nn.Linear(in_size, self.action_space.n) + # Single-node value layer. 
+ self._values = nn.Linear(in_size, 1) + + @override(TorchRLModule) + def get_initial_state(self) -> Any: + return { + "h": np.zeros(shape=(self._lstm_cell_size,), dtype=np.float32), + "c": np.zeros(shape=(self._lstm_cell_size,), dtype=np.float32), + } + + @override(TorchRLModule) + def _forward(self, batch, **kwargs): + # Compute the basic 1D embedding tensor (inputs to policy- and value-heads). + embeddings, state_outs = self._compute_embeddings_and_state_outs(batch) + logits = self._pi_head(embeddings) + + # Return logits as ACTION_DIST_INPUTS (categorical distribution). + # Note that the default `GetActions` connector piece (in the EnvRunner) will + # take care of argmax-"sampling" from the logits to yield the inference (greedy) + # action. + return { + Columns.ACTION_DIST_INPUTS: logits, + Columns.STATE_OUT: state_outs, + } + + @override(TorchRLModule) + def _forward_train(self, batch, **kwargs): + # Same logic as _forward, but also return embeddings to be used by value + # function branch during training. + embeddings, state_outs = self._compute_embeddings_and_state_outs(batch) + logits = self._pi_head(embeddings) + return { + Columns.ACTION_DIST_INPUTS: logits, + Columns.STATE_OUT: state_outs, + Columns.EMBEDDINGS: embeddings, + } + + # We implement this RLModule as a ValueFunctionAPI RLModule, so it can be used + # by value-based methods like PPO or IMPALA. + @override(ValueFunctionAPI) + def compute_values( + self, batch: Dict[str, Any], embeddings: Optional[Any] = None + ) -> TensorType: + if embeddings is None: + embeddings, _ = self._compute_embeddings_and_state_outs(batch) + values = self._values(embeddings).squeeze(-1) + return values + + def _compute_embeddings_and_state_outs(self, batch): + obs = batch[Columns.OBS] + state_in = batch[Columns.STATE_IN] + h, c = state_in["h"], state_in["c"] + # Unsqueeze the layer dim (we only have 1 LSTM layer). + embeddings, (h, c) = self._lstm(obs, (h.unsqueeze(0), c.unsqueeze(0))) + # Push through our FC net. 
+ embeddings = self._fc_net(embeddings) + # Squeeze the layer dim (we only have 1 LSTM layer). + return embeddings, {"h": h.squeeze(0), "c": c.squeeze(0)} diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/mobilenet_rlm.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/mobilenet_rlm.py new file mode 100644 index 0000000000000000000000000000000000000000..104792fda3fc76b89d99752b86a9d877814fa5d9 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/mobilenet_rlm.py @@ -0,0 +1,80 @@ +""" +This example shows how to take full control over what models and action distribution +are being built inside an RL Module. With this pattern, we can bypass a Catalog and +explicitly define our own models within a given RL Module. +""" +# __sphinx_doc_begin__ +import gymnasium as gym +import numpy as np + +from ray.rllib.algorithms.ppo.ppo import PPOConfig +from ray.rllib.algorithms.ppo.torch.default_ppo_torch_rl_module import ( + DefaultPPOTorchRLModule, +) +from ray.rllib.core.models.configs import MLPHeadConfig +from ray.rllib.core.rl_module.rl_module import RLModuleSpec +from ray.rllib.examples.envs.classes.random_env import RandomEnv +from ray.rllib.examples._old_api_stack.models.mobilenet_v2_encoder import ( + MobileNetV2EncoderConfig, + MOBILENET_INPUT_SHAPE, +) +from ray.rllib.core.models.configs import ActorCriticEncoderConfig + + +class MobileNetTorchPPORLModule(DefaultPPOTorchRLModule): + """A DefaultPPORLModule with mobilenet v2 as an encoder. + + The idea behind this model is to demonstrate how we can bypass catalog to + take full control over what models and action distribution are being built. + In this example, we do this to modify an existing RLModule with a custom encoder. 
+ """ + + def setup(self): + mobilenet_v2_config = MobileNetV2EncoderConfig() + # Since we want to use PPO, which is an actor-critic algorithm, we need to + # use an ActorCriticEncoderConfig to wrap the base encoder config. + actor_critic_encoder_config = ActorCriticEncoderConfig( + base_encoder_config=mobilenet_v2_config + ) + + self.encoder = actor_critic_encoder_config.build(framework="torch") + mobilenet_v2_output_dims = mobilenet_v2_config.output_dims + + pi_config = MLPHeadConfig( + input_dims=mobilenet_v2_output_dims, + output_layer_dim=2, + ) + + vf_config = MLPHeadConfig( + input_dims=mobilenet_v2_output_dims, output_layer_dim=1 + ) + + self.pi = pi_config.build(framework="torch") + self.vf = vf_config.build(framework="torch") + + +config = ( + PPOConfig() + .rl_module(rl_module_spec=RLModuleSpec(module_class=MobileNetTorchPPORLModule)) + .environment( + RandomEnv, + env_config={ + "action_space": gym.spaces.Discrete(2), + # Test a simple Image observation space. + "observation_space": gym.spaces.Box( + 0.0, + 1.0, + shape=MOBILENET_INPUT_SHAPE, + dtype=np.float32, + ), + }, + ) + .env_runners(num_env_runners=0) + # The following training settings make it so that a training iteration is very + # quick. This is just for the sake of this example. PPO will not learn properly + # with these settings! 
+ .training(train_batch_size_per_learner=32, minibatch_size=16, num_epochs=1) +) + +config.build().train() +# __sphinx_doc_end__ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/modelv2_to_rlm.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/modelv2_to_rlm.py new file mode 100644 index 0000000000000000000000000000000000000000..5efbead7e66f01fa2dc5edfa4fc7afa85991e5c6 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/modelv2_to_rlm.py @@ -0,0 +1,206 @@ +import pathlib +from typing import Any, Dict, Optional + +import tree +from ray.rllib.core import Columns, DEFAULT_POLICY_ID +from ray.rllib.core.rl_module.apis import ValueFunctionAPI +from ray.rllib.core.rl_module.torch import TorchRLModule +from ray.rllib.models.torch.torch_distributions import ( + TorchCategorical, + TorchDiagGaussian, + TorchMultiCategorical, + TorchMultiDistribution, + TorchSquashedGaussian, +) +from ray.rllib.models.torch.torch_action_dist import ( + TorchCategorical as OldTorchCategorical, + TorchDiagGaussian as OldTorchDiagGaussian, + TorchMultiActionDistribution as OldTorchMultiActionDistribution, + TorchMultiCategorical as OldTorchMultiCategorical, + TorchSquashedGaussian as OldTorchSquashedGaussian, +) +from ray.rllib.policy.torch_policy_v2 import TorchPolicyV2 +from ray.rllib.utils.annotations import override +from ray.rllib.utils.framework import try_import_torch + +torch, _ = try_import_torch() + + +class ModelV2ToRLModule(TorchRLModule, ValueFunctionAPI): + """An RLModule containing a (old stack) ModelV2. + + The `ModelV2` may be define either through + - an existing Policy checkpoint + - an existing Algorithm checkpoint (and a policy ID or "default_policy") + - or through an AlgorithmConfig object + + The ModelV2 is created in the `setup` and contines to live through the lifetime + of the RLModule. 
+ """ + + @override(TorchRLModule) + def setup(self): + # Try extracting the policy ID from this RLModule's config dict. + policy_id = self.model_config.get("policy_id", DEFAULT_POLICY_ID) + + # Try getting the algorithm checkpoint from the `model_config`. + algo_checkpoint_dir = self.model_config.get("algo_checkpoint_dir") + if algo_checkpoint_dir: + algo_checkpoint_dir = pathlib.Path(algo_checkpoint_dir) + if not algo_checkpoint_dir.is_dir(): + raise ValueError( + "The `model_config` of your RLModule must contain a " + "`algo_checkpoint_dir` key pointing to the algo checkpoint " + "directory! You can find this dir inside the results dir of your " + "experiment. You can then add this path " + "through `config.rl_module(model_config={" + "'algo_checkpoint_dir': [your algo checkpoint dir]})`." + ) + policy_checkpoint_dir = algo_checkpoint_dir / "policies" / policy_id + # Try getting the policy checkpoint from the `model_config`. + else: + policy_checkpoint_dir = self.model_config.get("policy_checkpoint_dir") + + # Create the ModelV2 from the Policy. + if policy_checkpoint_dir: + policy_checkpoint_dir = pathlib.Path(policy_checkpoint_dir) + if not policy_checkpoint_dir.is_dir(): + raise ValueError( + "The `model_config` of your RLModule must contain a " + "`policy_checkpoint_dir` key pointing to the policy checkpoint " + "directory! You can find this dir under the Algorithm's checkpoint " + "dir in subdirectory: [algo checkpoint dir]/policies/[policy ID " + "ex. `default_policy`]. You can then add this path through `config" + ".rl_module(model_config={'policy_checkpoint_dir': " + "[your policy checkpoint dir]})`." + ) + # Create a temporary policy object. + policy = TorchPolicyV2.from_checkpoint(policy_checkpoint_dir) + # Create the ModelV2 from scratch using the config. 
+ else: + config = self.model_config.get("old_api_stack_algo_config") + if not config: + raise ValueError( + "The `model_config` of your RLModule must contain a " + "`algo_config` key with a AlgorithmConfig object in it that " + "contains all the settings that would be necessary to construct a " + "old API stack Algorithm/Policy/ModelV2! You can add this setting " + "through `config.rl_module(model_config={'algo_config': " + "[your old config]})`." + ) + # Get the multi-agent policies dict. + policy_dict, _ = config.get_multi_agent_setup( + spaces={ + policy_id: (self.observation_space, self.action_space), + }, + default_policy_class=config.algo_class.get_default_policy_class(config), + ) + config = config.to_dict() + config["__policy_id"] = policy_id + policy = policy_dict[policy_id].policy_class( + self.observation_space, + self.action_space, + config, + ) + + self._model_v2 = policy.model + + # Translate the action dist classes from the old API stack to the new. + self.action_dist_class = self._translate_dist_class(policy.dist_class) + + # Erase the torch policy from memory, so it can be garbage collected. 
+ del policy + + @override(TorchRLModule) + def _forward_inference(self, batch: Dict[str, Any], **kwargs) -> Dict[str, Any]: + return self._forward_pass(batch, inference=True) + + @override(TorchRLModule) + def _forward_exploration(self, batch: Dict[str, Any], **kwargs) -> Dict[str, Any]: + return self._forward_inference(batch, **kwargs) + + @override(TorchRLModule) + def _forward_train(self, batch: Dict[str, Any], **kwargs) -> Dict[str, Any]: + out = self._forward_pass(batch, inference=False) + out[Columns.ACTION_LOGP] = self.get_train_action_dist_cls()( + out[Columns.ACTION_DIST_INPUTS] + ).logp(batch[Columns.ACTIONS]) + out[Columns.VF_PREDS] = self._model_v2.value_function() + if Columns.STATE_IN in batch and Columns.SEQ_LENS in batch: + out[Columns.VF_PREDS] = torch.reshape( + out[Columns.VF_PREDS], [len(batch[Columns.SEQ_LENS]), -1] + ) + return out + + def _forward_pass(self, batch, inference=True): + # Translate states and seq_lens into old API stack formats. + batch = batch.copy() + state_in = batch.pop(Columns.STATE_IN, {}) + state_in = [s for i, s in sorted(state_in.items())] + seq_lens = batch.pop(Columns.SEQ_LENS, None) + + if state_in: + if inference and seq_lens is None: + seq_lens = torch.tensor( + [1.0] * state_in[0].shape[0], device=state_in[0].device + ) + elif not inference: + assert seq_lens is not None + # Perform the actual ModelV2 forward pass. + # A recurrent ModelV2 adds and removes the time-rank itself (whereas in the + # new API stack, the connector pipelines are responsible for doing this) -> + # We have to remove, then re-add the time rank here to make ModelV2 work. + batch = tree.map_structure( + lambda s: torch.reshape(s, [-1] + list(s.shape[2:])), batch + ) + nn_output, state_out = self._model_v2(batch, state_in, seq_lens) + # Put back 1ts time rank into nn-output (inference). 
+ if state_in: + if inference: + nn_output = tree.map_structure( + lambda s: torch.unsqueeze(s, axis=1), nn_output + ) + else: + nn_output = tree.map_structure( + lambda s: torch.reshape(s, [len(seq_lens), -1] + list(s.shape[1:])), + nn_output, + ) + # Interpret the NN output as action logits. + output = {Columns.ACTION_DIST_INPUTS: nn_output} + # Add the `state_out` to the `output`, new API stack style. + if state_out: + output[Columns.STATE_OUT] = {} + for i, o in enumerate(state_out): + output[Columns.STATE_OUT][i] = o + + return output + + @override(ValueFunctionAPI) + def compute_values(self, batch: Dict[str, Any], embeddings: Optional[Any] = None): + self._forward_pass(batch, inference=False) + v_preds = self._model_v2.value_function() + if Columns.STATE_IN in batch and Columns.SEQ_LENS in batch: + v_preds = torch.reshape(v_preds, [len(batch[Columns.SEQ_LENS]), -1]) + return v_preds + + @override(TorchRLModule) + def get_initial_state(self): + """Converts the initial state list of ModelV2 into a dict (new API stack).""" + init_state_list = self._model_v2.get_initial_state() + return {i: s for i, s in enumerate(init_state_list)} + + def _translate_dist_class(self, old_dist_class): + map_ = { + OldTorchCategorical: TorchCategorical, + OldTorchDiagGaussian: TorchDiagGaussian, + OldTorchMultiActionDistribution: TorchMultiDistribution, + OldTorchMultiCategorical: TorchMultiCategorical, + OldTorchSquashedGaussian: TorchSquashedGaussian, + } + if old_dist_class not in map_: + raise ValueError( + f"ModelV2ToRLModule does NOT support {old_dist_class} action " + f"distributions yet!" 
+ ) + + return map_[old_dist_class] diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/random_rlm.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/random_rlm.py new file mode 100644 index 0000000000000000000000000000000000000000..e35292e212cfb90731eb188770222bf388f7d45a --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/random_rlm.py @@ -0,0 +1,71 @@ +import gymnasium as gym +import numpy as np +import tree # pip install dm_tree + +from ray.rllib.core.columns import Columns +from ray.rllib.core.rl_module import RLModule +from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.utils.annotations import override +from ray.rllib.utils.spaces.space_utils import batch as batch_func + + +class RandomRLModule(RLModule): + @override(RLModule) + def _forward(self, batch, **kwargs): + obs_batch_size = len(tree.flatten(batch[SampleBatch.OBS])[0]) + actions = batch_func( + [self.action_space.sample() for _ in range(obs_batch_size)] + ) + return {SampleBatch.ACTIONS: actions} + + @override(RLModule) + def _forward_train(self, *args, **kwargs): + # RandomRLModule should always be configured as non-trainable. + # To do so, set in your config: + # `config.multi_agent(policies_to_train=[list of ModuleIDs to be trained, + # NOT including the ModuleID of this RLModule])` + raise NotImplementedError("Random RLModule: Should not be trained!") + + @override(RLModule) + def output_specs_inference(self): + return [SampleBatch.ACTIONS] + + @override(RLModule) + def output_specs_exploration(self): + return [SampleBatch.ACTIONS] + + def compile(self, *args, **kwargs): + """Dummy method for compatibility with TorchRLModule. + + This is hit when RolloutWorker tries to compile TorchRLModule.""" + pass + + +class StatefulRandomRLModule(RandomRLModule): + """A stateful RLModule that returns STATE_OUT from its forward methods. 
+ + - Implements the `get_initial_state` method (returning a all-zeros dummy state). + - Returns a dummy state under the `Columns.STATE_OUT` from its forward methods. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._internal_state_space = gym.spaces.Box(-1.0, 1.0, (1,)) + + @override(RLModule) + def get_initial_state(self): + return { + "state": np.zeros_like([self._internal_state_space.sample()]), + } + + def _random_forward(self, batch, **kwargs): + batch = super()._random_forward(batch, **kwargs) + batch[Columns.STATE_OUT] = { + "state": batch_func( + [ + self._internal_state_space.sample() + for _ in range(len(batch[Columns.ACTIONS])) + ] + ), + } + return batch diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/rock_paper_scissors_heuristic_rlm.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/rock_paper_scissors_heuristic_rlm.py new file mode 100644 index 0000000000000000000000000000000000000000..f4b3d661f4de3e970979d4c2c42b7b7464932ecd --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/rock_paper_scissors_heuristic_rlm.py @@ -0,0 +1,108 @@ +from collections import defaultdict + +import numpy as np + +from ray.rllib.core.columns import Columns +from ray.rllib.core.rl_module.rl_module import RLModule +from ray.rllib.utils.annotations import override + + +class AlwaysSameHeuristicRLM(RLModule): + """In rock-paper-scissors, always chooses the same action within an episode. + + The first move is random, all the following moves are the same as the first one. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._actions_per_vector_idx = defaultdict(int) + + @override(RLModule) + def _forward_inference(self, batch, **kwargs): + ret = [] + # Note that the obs is the previous move of the opponens (0-2). 
If it's 3, it + # means that there was no previous move and thus, the episode just started. + for i, obs in enumerate(batch[Columns.OBS]): + if obs == 3: + self._actions_per_vector_idx[i] = np.random.choice([0, 1, 2]) + ret.append(self._actions_per_vector_idx[i]) + return {Columns.ACTIONS: np.array(ret)} + + @override(RLModule) + def _forward_exploration(self, batch, **kwargs): + return self._forward_inference(batch, **kwargs) + + @override(RLModule) + def _forward_train(self, batch, **kwargs): + raise NotImplementedError( + "AlwaysSameHeuristicRLM is not trainable! Make sure you do NOT include it " + "in your `config.multi_agent(policies_to_train={...})` set." + ) + + @override(RLModule) + def output_specs_inference(self): + return [Columns.ACTIONS] + + @override(RLModule) + def output_specs_exploration(self): + return [Columns.ACTIONS] + + +class BeatLastHeuristicRLM(RLModule): + """In rock-paper-scissors, always acts such that it beats prev. move of opponent. + + The first move is random. + + For example, after opponent played `rock` (and this policy made a random + move), the next move would be `paper`(to beat `rock`). + """ + + @override(RLModule) + def _forward_inference(self, batch, **kwargs): + """Returns the exact action that would beat the previous action of the opponent. + + The opponent's previous action is the current observation for this agent. + + Both action- and observation spaces are discrete. There are 3 actions available. + (0-2) and 4 observations (0-2 plus 3, where 3 is the observation after the env + reset, when no action has been taken yet). 
Thereby: + 0=Rock + 1=Paper + 2=Scissors + 3=[after reset] (observation space only) + """ + return { + Columns.ACTIONS: np.array( + [self._pick_single_action(obs) for obs in batch[Columns.OBS]] + ), + } + + @override(RLModule) + def _forward_exploration(self, batch, **kwargs): + return self._forward_inference(batch, **kwargs) + + @override(RLModule) + def _forward_train(self, batch, **kwargs): + raise NotImplementedError( + "BeatLastHeuristicRLM is not trainable! Make sure you do NOT include it in " + "your `config.multi_agent(policies_to_train={...})` set." + ) + + @override(RLModule) + def output_specs_inference(self): + return [Columns.ACTIONS] + + @override(RLModule) + def output_specs_exploration(self): + return [Columns.ACTIONS] + + @staticmethod + def _pick_single_action(prev_opponent_obs): + if prev_opponent_obs == 0: + return 1 + elif prev_opponent_obs == 1: + return 2 + elif prev_opponent_obs == 2: + return 0 + else: + return np.random.choice([0, 1, 2]) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/tiny_atari_cnn_rlm.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/tiny_atari_cnn_rlm.py new file mode 100644 index 0000000000000000000000000000000000000000..eb2e4e39b56a6f3963a247cf1969f92309970269 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/tiny_atari_cnn_rlm.py @@ -0,0 +1,194 @@ +from typing import Any, Dict, Optional + +from ray.rllib.core.columns import Columns +from ray.rllib.core.learner.utils import make_target_network +from ray.rllib.core.rl_module.apis import ( + TargetNetworkAPI, + ValueFunctionAPI, + TARGET_NETWORK_ACTION_DIST_INPUTS, +) +from ray.rllib.core.rl_module.torch import TorchRLModule +from ray.rllib.models.torch.misc import ( + normc_initializer, + same_padding, + valid_padding, +) +from ray.rllib.utils.annotations import override +from ray.rllib.utils.framework import try_import_torch +from ray.rllib.utils.typing import 
TensorType + +torch, nn = try_import_torch() + + +class TinyAtariCNN(TorchRLModule, ValueFunctionAPI, TargetNetworkAPI): + """A tiny CNN stack for fast-learning of Atari envs. + + The architecture here is the exact same as the one used by the old API stack as + CNN default ModelV2. + + We stack 3 CNN layers based on the config, then a 4th one with linear activation + and n 1x1 filters, where n is the number of actions in the (discrete) action space. + Simple reshaping (no flattening or extra linear layers necessary) lead to the + action logits, which can directly be used inside a distribution or loss. + + .. testcode:: + + import numpy as np + import gymnasium as gym + + my_net = TinyAtariCNN( + observation_space=gym.spaces.Box(-1.0, 1.0, (42, 42, 4), np.float32), + action_space=gym.spaces.Discrete(4), + ) + + B = 10 + w = 42 + h = 42 + c = 4 + data = torch.from_numpy( + np.random.random_sample(size=(B, w, h, c)).astype(np.float32) + ) + print(my_net.forward_inference({"obs": data})) + print(my_net.forward_exploration({"obs": data})) + print(my_net.forward_train({"obs": data})) + + num_all_params = sum(int(np.prod(p.size())) for p in my_net.parameters()) + print(f"num params = {num_all_params}") + """ + + @override(TorchRLModule) + def setup(self): + """Use this method to create all the model components that you require. + + Feel free to access the following useful properties in this class: + - `self.model_config`: The config dict for this RLModule class, + which should contain flxeible settings, for example: {"hiddens": [256, 256]}. + - `self.observation|action_space`: The observation and action space that + this RLModule is subject to. Note that the observation space might not be the + exact space from your env, but that it might have already gone through + preprocessing through a connector pipeline (for example, flattening, + frame-stacking, mean/std-filtering, etc..). 
+ """ + # Get the CNN stack config from our RLModuleConfig's (self.config) + # `model_config` property: + conv_filters = self.model_config.get("conv_filters") + # Default CNN stack with 3 layers: + if conv_filters is None: + conv_filters = [ + [16, 4, 2, "same"], # num filters, kernel wxh, stride wxh, padding type + [32, 4, 2, "same"], + [256, 11, 1, "valid"], + ] + + # Build the CNN layers. + layers = [] + + # Add user-specified hidden convolutional layers first + width, height, in_depth = self.observation_space.shape + in_size = [width, height] + for filter_specs in conv_filters: + if len(filter_specs) == 4: + out_depth, kernel_size, strides, padding = filter_specs + else: + out_depth, kernel_size, strides = filter_specs + padding = "same" + + # Pad like in tensorflow's SAME mode. + if padding == "same": + padding_size, out_size = same_padding(in_size, kernel_size, strides) + layers.append(nn.ZeroPad2d(padding_size)) + # No actual padding is performed for "valid" mode, but we will still + # compute the output size (input for the next layer). + else: + out_size = valid_padding(in_size, kernel_size, strides) + + layer = nn.Conv2d(in_depth, out_depth, kernel_size, strides, bias=True) + # Initialize CNN layer kernel and bias. + nn.init.xavier_uniform_(layer.weight) + nn.init.zeros_(layer.bias) + layers.append(layer) + # Activation. + layers.append(nn.ReLU()) + + in_size = out_size + in_depth = out_depth + + self._base_cnn_stack = nn.Sequential(*layers) + + # Add the final CNN 1x1 layer with num_filters == num_actions to be reshaped to + # yield the logits (no flattening, no additional linear layers required). 
+ _final_conv = nn.Conv2d(in_depth, self.action_space.n, 1, 1, bias=True) + nn.init.xavier_uniform_(_final_conv.weight) + nn.init.zeros_(_final_conv.bias) + self._logits = nn.Sequential( + nn.ZeroPad2d(same_padding(in_size, 1, 1)[0]), _final_conv + ) + + self._values = nn.Linear(in_depth, 1) + # Mimick old API stack behavior of initializing the value function with `normc` + # std=0.01. + normc_initializer(0.01)(self._values.weight) + + @override(TorchRLModule) + def _forward(self, batch, **kwargs): + # Compute the basic 1D feature tensor (inputs to policy- and value-heads). + _, logits = self._compute_embeddings_and_logits(batch) + # Return features and logits as ACTION_DIST_INPUTS (categorical distribution). + return { + Columns.ACTION_DIST_INPUTS: logits, + } + + @override(TorchRLModule) + def _forward_train(self, batch, **kwargs): + # Compute the basic 1D feature tensor (inputs to policy- and value-heads). + embeddings, logits = self._compute_embeddings_and_logits(batch) + # Return features and logits as ACTION_DIST_INPUTS (categorical distribution). + return { + Columns.ACTION_DIST_INPUTS: logits, + Columns.EMBEDDINGS: embeddings, + } + + # We implement this RLModule as a TargetNetworkAPI RLModule, so it can be used + # by the APPO algorithm. 
+ @override(TargetNetworkAPI) + def make_target_networks(self) -> None: + self._target_base_cnn_stack = make_target_network(self._base_cnn_stack) + self._target_logits = make_target_network(self._logits) + + @override(TargetNetworkAPI) + def get_target_network_pairs(self): + return [ + (self._base_cnn_stack, self._target_base_cnn_stack), + (self._logits, self._target_logits), + ] + + @override(TargetNetworkAPI) + def forward_target(self, batch, **kw): + obs = batch[Columns.OBS].permute(0, 3, 1, 2) + embeddings = self._target_base_cnn_stack(obs) + logits = self._target_logits(embeddings) + return {TARGET_NETWORK_ACTION_DIST_INPUTS: torch.squeeze(logits, dim=[-1, -2])} + + # We implement this RLModule as a ValueFunctionAPI RLModule, so it can be used + # by value-based methods like PPO or IMPALA. + @override(ValueFunctionAPI) + def compute_values( + self, + batch: Dict[str, Any], + embeddings: Optional[Any] = None, + ) -> TensorType: + # Features not provided -> We need to compute them first. 
+ if embeddings is None: + obs = batch[Columns.OBS] + embeddings = self._base_cnn_stack(obs.permute(0, 3, 1, 2)) + embeddings = torch.squeeze(embeddings, dim=[-1, -2]) + return self._values(embeddings).squeeze(-1) + + def _compute_embeddings_and_logits(self, batch): + obs = batch[Columns.OBS].permute(0, 3, 1, 2) + embeddings = self._base_cnn_stack(obs) + logits = self._logits(embeddings) + return ( + torch.squeeze(embeddings, dim=[-1, -2]), + torch.squeeze(logits, dim=[-1, -2]), + ) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/vpg_torch_rlm.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/vpg_torch_rlm.py new file mode 100644 index 0000000000000000000000000000000000000000..676598d090dc01f3cc4c99089a2345fc15eebbc0 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/vpg_torch_rlm.py @@ -0,0 +1,50 @@ +import torch + +from ray.rllib.core.columns import Columns +from ray.rllib.core.rl_module.torch import TorchRLModule + + +class VPGTorchRLModule(TorchRLModule): + """A simple VPG (vanilla policy gradient)-style RLModule for testing purposes. + + Use this as a minimum, bare-bones example implementation of a custom TorchRLModule. + """ + + def setup(self): + # You have access here to the following already set attributes: + # self.observation_space + # self.action_space + # self.inference_only + # self.model_config # <- a dict with custom settings + input_dim = self.observation_space.shape[0] + hidden_dim = self.model_config["hidden_dim"] + output_dim = self.action_space.n + + self._policy_net = torch.nn.Sequential( + torch.nn.Linear(input_dim, hidden_dim), + torch.nn.ReLU(), + torch.nn.Linear(hidden_dim, output_dim), + ) + + def _forward(self, batch, **kwargs): + # Push the observations from the batch through our pi-head. 
+ action_logits = self._policy_net(batch[Columns.OBS]) + # Return parameters for the (default) action distribution, which is + # `TorchCategorical` (due to our action space being `gym.spaces.Discrete`). + return {Columns.ACTION_DIST_INPUTS: action_logits} + + # If you need more granularity between the different forward behaviors during + # the different phases of the module's lifecycle, implement three different + # forward methods. Thereby, it is recommended to put the inference and + # exploration versions inside a `with torch.no_grad()` context for better + # performance. + # def _forward_train(self, batch): + # ... + # + # def _forward_inference(self, batch): + # with torch.no_grad(): + # return self._forward_train(batch) + # + # def _forward_exploration(self, batch): + # with torch.no_grad(): + # return self._forward_train(batch) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/vpg_using_shared_encoder_rlm.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/vpg_using_shared_encoder_rlm.py new file mode 100644 index 0000000000000000000000000000000000000000..471df1045ea3893065008c7e5772fe583d28ae4f --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/classes/vpg_using_shared_encoder_rlm.py @@ -0,0 +1,170 @@ +import torch + +from ray.rllib.core import Columns +from ray.rllib.core.rl_module.multi_rl_module import MultiRLModule +from ray.rllib.core.rl_module.torch.torch_rl_module import TorchRLModule + + +SHARED_ENCODER_ID = "shared_encoder" + + +# __sphinx_doc_policy_begin__ +class VPGPolicyAfterSharedEncoder(TorchRLModule): + """A VPG (vanilla pol. gradient)-style RLModule using a shared encoder. + # __sphinx_doc_policy_end__ + + The shared encoder RLModule must be held by the same MultiRLModule, under which + this RLModule resides. The shared encoder's forward is called before this + RLModule's forward and returns the embeddings under the "encoder_embeddings" + key. 
+ # __sphinx_doc_policy_2_begin__ + """ + + def setup(self): + super().setup() + + # Incoming feature dim from the shared encoder. + embedding_dim = self.model_config["embedding_dim"] + hidden_dim = self.model_config["hidden_dim"] + + self._pi_head = torch.nn.Sequential( + torch.nn.Linear(embedding_dim, hidden_dim), + torch.nn.ReLU(), + torch.nn.Linear(hidden_dim, self.action_space.n), + ) + + def _forward(self, batch, **kwargs): + # Embeddings can be found in the batch under the "encoder_embeddings" key. + embeddings = batch["encoder_embeddings"] + logits = self._pi_head(embeddings) + return {Columns.ACTION_DIST_INPUTS: logits} + + +# __sphinx_doc_policy_2_end__ + + +# __sphinx_doc_mrlm_begin__ +class VPGMultiRLModuleWithSharedEncoder(MultiRLModule): + """VPG (vanilla pol. gradient)-style MultiRLModule handling a shared encoder. + # __sphinx_doc_mrlm_end__ + + This MultiRLModule needs to be configured appropriately as follows: + + .. testcode:: + + # __sphinx_doc_how_to_run_begin__ + import gymnasium as gym + from ray.rllib.algorithms.ppo import PPOConfig + from ray.rllib.core import MultiRLModuleSpec, RLModuleSpec + from ray.rllib.examples.envs.classes.multi_agent import MultiAgentCartPole + + single_agent_env = gym.make("CartPole-v1") + + EMBEDDING_DIM = 64 # encoder output dim + + config = ( + PPOConfig() + .environment(MultiAgentCartPole, env_config={"num_agents": 2}) + .multi_agent( + # Declare the two policies trained. + policies={"p0", "p1"}, + # Agent IDs of `MultiAgentCartPole` are 0 and 1. They are mapped to + # the two policies with ModuleIDs "p0" and "p1", respectively. + policy_mapping_fn=lambda agent_id, episode, **kw: f"p{agent_id}" + ) + .rl_module( + rl_module_spec=MultiRLModuleSpec( + rl_module_specs={ + # Shared encoder. + SHARED_ENCODER_ID: RLModuleSpec( + module_class=SharedEncoder, + model_config={"embedding_dim": EMBEDDING_DIM}, + observation_space=single_agent_env.observation_space, + ), + # Large policy net. 
+ "p0": RLModuleSpec( + module_class=VPGPolicyAfterSharedEncoder, + model_config={ + "embedding_dim": EMBEDDING_DIM, + "hidden_dim": 1024, + }, + ), + # Small policy net. + "p1": RLModuleSpec( + module_class=VPGPolicyAfterSharedEncoder, + model_config={ + "embedding_dim": EMBEDDING_DIM, + "hidden_dim": 64, + }, + ), + }, + ), + ) + ) + algo = config.build() + print(algo.get_module()) + # __sphinx_doc_how_to_run_end__ + + Also note that in order to learn properly, a special, multi-agent Learner + accounting for the shared encoder must be setup. This Learner should have only + one optimizer (used to train all submodules: encoder and the n policy nets) in + order to not destabilize learning. The latter would happen if more than one + optimizer would try to alternatingly optimize the same shared encoder submodule. + # __sphinx_doc_mrlm_2_begin__ + """ + + def setup(self): + # Call the super's setup(). + super().setup() + + # Assert, we have the shared encoder submodule. + assert ( + SHARED_ENCODER_ID in self._rl_modules + and isinstance(self._rl_modules[SHARED_ENCODER_ID], SharedEncoder) + and len(self._rl_modules) > 1 + ) + # Assign the encoder to a convenience attribute. + self.encoder = self._rl_modules[SHARED_ENCODER_ID] + + def _forward(self, batch, **kwargs): + # Collect our policies' outputs in this dict. + outputs = {} + + # Loop through the policy nets (through the given batch's keys). + for policy_id, policy_batch in batch.items(): + rl_module = self._rl_modules[policy_id] + + # Pass policy's observations through shared encoder to get the features for + # this policy. + policy_batch["encoder_embeddings"] = self.encoder._forward(batch[policy_id]) + + # Pass the policy's embeddings through the policy net. 
+ outputs[policy_id] = rl_module._forward(batch[policy_id], **kwargs) + + return outputs + + +# __sphinx_doc_mrlm_2_end__ + + +# __sphinx_doc_encoder_begin__ +class SharedEncoder(TorchRLModule): + """A shared encoder that can be used with `VPGMultiRLModuleWithSharedEncoder`.""" + + def setup(self): + super().setup() + + input_dim = self.observation_space.shape[0] + embedding_dim = self.model_config["embedding_dim"] + + # A very simple encoder network. + self._net = torch.nn.Sequential( + torch.nn.Linear(input_dim, embedding_dim), + ) + + def _forward(self, batch, **kwargs): + # Pass observations through the net and return outputs. + return {"encoder_embeddings": self._net(batch[Columns.OBS])} + + +# __sphinx_doc_encoder_end__ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/custom_cnn_rl_module.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/custom_cnn_rl_module.py new file mode 100644 index 0000000000000000000000000000000000000000..4001f3e21d6b8b09793f19355f0e7ae177634112 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/custom_cnn_rl_module.py @@ -0,0 +1,120 @@ +"""Example of implementing and configuring a custom (torch) CNN containing RLModule. + +This example: + - demonstrates how you can subclass the TorchRLModule base class and set up your + own CNN-stack architecture by overriding the `setup()` method. + - shows how to override the 3 forward methods: `_forward_inference()`, + `_forward_exploration()`, and `forward_train()` to implement your own custom forward + logic(s). You will also learn, when each of these 3 methods is called by RLlib or + the users of your RLModule. + - shows how you then configure an RLlib Algorithm such that it uses your custom + RLModule (instead of a default RLModule). + +We implement a tiny CNN stack here, the exact same one that is used by the old API +stack as default CNN net. 
It comprises 4 convolutional layers, the last of which +ends in a 1x1 filter size and the number of filters exactly matches the number of +discrete actions (logits). This way, the (non-activated) output of the last layer only +needs to be reshaped in order to receive the policy's logit outputs. No flattening +or additional dense layer required. + +The network is then used in a fast ALE/Pong-v5 experiment. + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack` + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- +You should see the following output (during the experiment) in your console: + +Number of trials: 1/1 (1 RUNNING) ++---------------------+----------+----------------+--------+------------------+ +| Trial name | status | loc | iter | total time (s) | +| | | | | | +|---------------------+----------+----------------+--------+------------------+ +| PPO_env_82b44_00000 | RUNNING | 127.0.0.1:9718 | 1 | 98.3585 | ++---------------------+----------+----------------+--------+------------------+ ++------------------------+------------------------+------------------------+ +| num_env_steps_sample | num_env_steps_traine | num_episodes_lifetim | +| d_lifetime | d_lifetime | e | +|------------------------+------------------------+------------------------| +| 4000 | 4000 | 4 | ++------------------------+------------------------+------------------------+ +""" +import gymnasium as gym + +from ray.rllib.core.rl_module.rl_module import RLModuleSpec +from ray.rllib.env.wrappers.atari_wrappers import 
wrap_atari_for_new_api_stack +from ray.rllib.examples.rl_modules.classes.tiny_atari_cnn_rlm import TinyAtariCNN +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.tune.registry import get_trainable_cls, register_env + +parser = add_rllib_example_script_args(default_iters=100, default_timesteps=600000) +parser.set_defaults( + enable_new_api_stack=True, + env="ale_py:ALE/Pong-v5", +) + + +if __name__ == "__main__": + args = parser.parse_args() + + assert ( + args.enable_new_api_stack + ), "Must set --enable-new-api-stack when running this script!" + + register_env( + "env", + lambda cfg: wrap_atari_for_new_api_stack( + gym.make(args.env, **cfg), + dim=42, # <- need images to be "tiny" for our custom model + framestack=4, + ), + ) + + base_config = ( + get_trainable_cls(args.algo) + .get_default_config() + .environment( + env="env", + env_config=dict( + frameskip=1, + full_action_space=False, + repeat_action_probability=0.0, + ), + ) + .rl_module( + # Plug-in our custom RLModule class. + rl_module_spec=RLModuleSpec( + module_class=TinyAtariCNN, + # Feel free to specify your own `model_config` settings below. + # The `model_config` defined here will be available inside your + # custom RLModule class through the `self.model_config` + # property. 
+ model_config={ + "conv_filters": [ + # num filters, kernel wxh, stride wxh, padding type + [16, 4, 2, "same"], + [32, 4, 2, "same"], + [256, 11, 1, "valid"], + ], + }, + ), + ) + ) + + run_rllib_example_script_experiment(base_config, args) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/custom_lstm_rl_module.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/custom_lstm_rl_module.py new file mode 100644 index 0000000000000000000000000000000000000000..85b160808bd7e6405469a75389377bff60c9b7cb --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/custom_lstm_rl_module.py @@ -0,0 +1,102 @@ +"""Example of implementing and configuring a custom (torch) LSTM containing RLModule. + +This example: + - demonstrates how you can subclass the TorchRLModule base class and set up your + own LSTM-containing NN architecture by overriding the `setup()` method. + - shows how to override the 3 forward methods: `_forward_inference()`, + `_forward_exploration()`, and `forward_train()` to implement your own custom forward + logic(s), including how to handle STATE in- and outputs to and from these calls. + - explains when each of these 3 methods is called by RLlib or the users of your + RLModule. + - shows how you then configure an RLlib Algorithm such that it uses your custom + RLModule (instead of a default RLModule). + +We implement a simple LSTM layer here, followed by a series of Linear layers. +After the last Linear layer, we add fork of 2 Linear (non-activated) layers, one for the +action logits and one for the value function output. + +We test the LSTM containing RLModule on the StatelessCartPole environment, a variant +of CartPole that is non-Markovian (partially observable). Only an RNN-network can learn +a decent policy in this environment due to the lack of any velocity information. 
By +looking at one observation, one cannot know whether the cart is currently moving left or +right and whether the pole is currently moving up or down). + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack` + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- +You should see the following output (during the experiment) in your console: + +""" +from ray.rllib.core.rl_module.rl_module import RLModuleSpec +from ray.rllib.examples.envs.classes.stateless_cartpole import StatelessCartPole +from ray.rllib.examples.envs.classes.multi_agent import MultiAgentStatelessCartPole +from ray.rllib.examples.rl_modules.classes.lstm_containing_rlm import ( + LSTMContainingRLModule, +) +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.tune.registry import get_trainable_cls, register_env + +parser = add_rllib_example_script_args( + default_reward=300.0, + default_timesteps=2000000, +) +parser.set_defaults(enable_new_api_stack=True) + + +if __name__ == "__main__": + args = parser.parse_args() + + if args.num_agents == 0: + register_env("env", lambda cfg: StatelessCartPole()) + else: + register_env("env", lambda cfg: MultiAgentStatelessCartPole(cfg)) + + base_config = ( + get_trainable_cls(args.algo) + .get_default_config() + .environment( + env="env", + env_config={"num_agents": args.num_agents}, + ) + .training( + train_batch_size_per_learner=1024, + num_epochs=6, + lr=0.0009, + vf_loss_coeff=0.001, + entropy_coeff=0.0, + ) + .rl_module( + # Plug-in 
our custom RLModule class. + rl_module_spec=RLModuleSpec( + module_class=LSTMContainingRLModule, + # Feel free to specify your own `model_config` settings below. + # The `model_config` defined here will be available inside your + # custom RLModule class through the `self.model_config` + # property. + model_config={ + "lstm_cell_size": 256, + "dense_layers": [256, 256], + "max_seq_len": 20, + }, + ), + ) + ) + + run_rllib_example_script_experiment(base_config, args) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/migrate_modelv2_to_new_api_stack_by_config.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/migrate_modelv2_to_new_api_stack_by_config.py new file mode 100644 index 0000000000000000000000000000000000000000..21b68184051f7be95d5ea06ca80ec1032cbd998c --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/migrate_modelv2_to_new_api_stack_by_config.py @@ -0,0 +1,77 @@ +from ray.rllib.algorithms.ppo import PPOConfig +from ray.rllib.core import DEFAULT_POLICY_ID +from ray.rllib.core.rl_module.rl_module import RLModuleSpec +from ray.rllib.examples.rl_modules.classes.modelv2_to_rlm import ModelV2ToRLModule +from ray.rllib.utils.metrics import ( + ENV_RUNNER_RESULTS, + EPISODE_RETURN_MEAN, +) + + +if __name__ == "__main__": + # Configure an old stack default ModelV2. + config_old_stack = ( + PPOConfig() + .api_stack( + enable_env_runner_and_connector_v2=False, + enable_rl_module_and_learner=False, + ) + .environment("CartPole-v1") + .training( + lr=0.0003, + num_sgd_iter=6, + vf_loss_coeff=0.01, + # Change the ModelV2 settings a bit. + model={ + "fcnet_hiddens": [32], + "fcnet_activation": "linear", + "use_lstm": True, + "vf_share_layers": True, + }, + ) + ) + + # Training with the (configured and wrapped) ModelV2. 
+ + # We change the original (old API stack) `config` into a new API stack one: + config_new_stack = ( + config_old_stack.copy(copy_frozen=False) + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) + .rl_module( + rl_module_spec=RLModuleSpec( + module_class=ModelV2ToRLModule, + model_config={ + "policy_id": DEFAULT_POLICY_ID, + "old_api_stack_algo_config": config_old_stack, + "max_seq_len": 20, + }, + ), + ) + ) + + # Build the new stack algo. + algo_new_stack = config_new_stack.build() + + # Train until a higher return. + min_return_new_stack = 350.0 + results = None + passed = False + for i in range(100): + results = algo_new_stack.train() + print(results) + if results[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN] >= min_return_new_stack: + print( + f"Reached episode return of {min_return_new_stack} -> stopping " + "new API stack training." + ) + passed = True + break + + if not passed: + raise ValueError( + "Continuing training on the new stack did not succeed! 
Last return: " + f"{results[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN]}" + ) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/migrate_modelv2_to_new_api_stack_by_policy_checkpoint.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/migrate_modelv2_to_new_api_stack_by_policy_checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..ac6ef471cb9552e6e1307c60c73db65060f759af --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/migrate_modelv2_to_new_api_stack_by_policy_checkpoint.py @@ -0,0 +1,118 @@ +import pathlib + +import gymnasium as gym +import numpy as np +import torch + +from ray.rllib.algorithms.ppo import PPOConfig +from ray.rllib.core.rl_module.rl_module import RLModuleSpec +from ray.rllib.examples.rl_modules.classes.modelv2_to_rlm import ModelV2ToRLModule +from ray.rllib.utils.metrics import ( + ENV_RUNNER_RESULTS, + EPISODE_RETURN_MEAN, +) +from ray.rllib.utils.spaces.space_utils import batch + + +if __name__ == "__main__": + # Configure and train an old stack default ModelV2. + config = ( + PPOConfig() + # Old API stack. + .api_stack( + enable_env_runner_and_connector_v2=False, + enable_rl_module_and_learner=False, + ) + .environment("CartPole-v1") + .training( + lr=0.0003, + num_epochs=6, + vf_loss_coeff=0.01, + ) + ) + algo_old_stack = config.build() + + min_return_old_stack = 100.0 + while True: + results = algo_old_stack.train() + print(results) + if results[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN] >= min_return_old_stack: + print( + f"Reached episode return of {min_return_old_stack} -> stopping " + "old API stack training." + ) + break + + checkpoint = algo_old_stack.save() + policy_path = ( + pathlib.Path(checkpoint.checkpoint.path) / "policies" / "default_policy" + ) + assert policy_path.is_dir() + algo_old_stack.stop() + + print("done") + + # Move the old API stack (trained) ModelV2 into the new API stack's RLModule. 
+ # Run a simple CartPole inference experiment. + env = gym.make("CartPole-v1", render_mode="human") + rl_module = ModelV2ToRLModule( + observation_space=env.observation_space, + action_space=env.action_space, + model_config={"policy_checkpoint_dir": policy_path}, + ) + + obs, _ = env.reset() + env.render() + done = False + episode_return = 0.0 + while not done: + output = rl_module.forward_inference({"obs": torch.from_numpy(batch([obs]))}) + action_logits = output["action_dist_inputs"].detach().numpy()[0] + action = np.argmax(action_logits) + obs, reward, terminated, truncated, _ = env.step(action) + done = terminated or truncated + episode_return += reward + env.render() + + print(f"Ran episode with trained ModelV2: return={episode_return}") + + # Continue training with the (checkpointed) ModelV2. + + # We change the original (old API stack) `config` into a new API stack one: + config = config.api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ).rl_module( + rl_module_spec=RLModuleSpec( + module_class=ModelV2ToRLModule, + model_config={"policy_checkpoint_dir": policy_path}, + ), + ) + + # Build the new stack algo. + algo_new_stack = config.build() + + # Train until a higher return. + min_return_new_stack = 450.0 + passed = False + for i in range(50): + results = algo_new_stack.train() + print(results) + # Make sure that the model's weights from the old API stack training + # were properly transferred to the new API RLModule wrapper. Thus, even + # after only one iteration of new stack training, we already expect the + # return to be higher than it was at the end of the old stack training. + assert results[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN] >= min_return_old_stack + if results[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN] >= min_return_new_stack: + print( + f"Reached episode return of {min_return_new_stack} -> stopping " + "new API stack training." 
+ ) + passed = True + break + + if not passed: + raise ValueError( + "Continuing training on the new stack did not succeed! Last return: " + f"{results[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN]}" + ) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/pretraining_single_agent_training_multi_agent.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/pretraining_single_agent_training_multi_agent.py new file mode 100644 index 0000000000000000000000000000000000000000..bbb35184d7ddc781be5fe56d231b6b00614ce5c4 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/rl_modules/pretraining_single_agent_training_multi_agent.py @@ -0,0 +1,194 @@ +"""Example of running a single-agent pre-training followed by a multi-agent training. + +This example uses `num_agents` agents, each of them with its own `RLModule` that defines its +policy. The first agent is pre-trained using a single-agent PPO algorithm. All agents +are trained together in the main training run using a multi-agent PPO algorithm where +the pre-trained module is used for the first agent. + +The environment is MultiAgentCartPole, in which there are n agents, each with its own policy. + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack --num-agents=2` + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- +In the console output, you can see that the single-agent policy is first trained until +the specified `--stop-reward-pretraining` value.
For example, with the command line: +`--enable-new-api-stack --num-agents=2 --stop-reward-pretraining=250.0 +--stop-reward=250.0 --stop-iters=3 --as-test`, you should get something like: ++-----------------------+------------+------+----------------+---------------------+ +| Trial name | status | iter | total time (s) | episode_return_mean | +| | | | | | +|-----------------------+------------+------+----------------+---------------------+ +| PPO_CartPole-v1_00000 | TERMINATED | 16 | 25.6009 | 256.2 | ++-----------------------+------------+------+----------------+---------------------+ + +Then, in the second experiment, where we run in a multi-agent setup with two policies +("p0" from the single-agent checkpoint and "p1" randomly initialized), you can see that +only "p0" immediately (after 1 iteration) reaches the same episode return as at the end +of pretraining: ++----------------------------+------------+--------+------------------+------+ +| Trial name | status | iter | total time (s) | ts | +|----------------------------+------------+--------+------------------+------+ +| PPO_multi-cart_6274d_00000 | TERMINATED | 1 | 2.71681 | 4000 | ++----------------------------+------------+--------+------------------+------+ ++-------------------+-------------+-------------+ +| combined return | return p0 | return p1 | +|-------------------+-------------|-------------+ +| 451.625 | 433.125 | 18.5 | ++-------------------+-------------+-------------+ +""" +from pathlib import Path + +import gymnasium as gym + +from ray.rllib.algorithms.ppo import PPOConfig +from ray.rllib.algorithms.ppo.ppo_catalog import PPOCatalog +from ray.rllib.algorithms.ppo.torch.ppo_torch_rl_module import PPOTorchRLModule +from ray.rllib.core import ( + COMPONENT_LEARNER, + COMPONENT_LEARNER_GROUP, + COMPONENT_RL_MODULE, + DEFAULT_MODULE_ID, +) +from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig +from ray.rllib.core.rl_module.multi_rl_module import MultiRLModuleSpec +from 
ray.rllib.core.rl_module.rl_module import RLModuleSpec +from ray.rllib.examples.envs.classes.multi_agent import MultiAgentCartPole +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.tune import register_env + +# Read in common example script command line arguments. +parser = add_rllib_example_script_args( + # Use fewer training steps for the main training run. + default_timesteps=50000, + default_reward=200.0, + default_iters=20, +) +parser.set_defaults( + checkpoint_freq=1, + checkpoint_at_end=True, +) +parser.add_argument( + "--stop-reward-pretraining", + type=float, + default=250.0, + help="The min. episode return to reach during pre-training.", +) + + +if __name__ == "__main__": + + # Parse the command line arguments. + args = parser.parse_args() + + # Ensure that the user has set the number of agents. + if args.num_agents == 0: + raise ValueError( + "This pre-training example script requires at least 1 agent. " + "Try setting the command line argument `--num-agents` to the " + "number of agents you want to use." + ) + + # Store the user's stopping criteria for the later training run. + stop_iters = args.stop_iters + stop_timesteps = args.stop_timesteps + stop_reward = args.stop_reward + num_agents = args.num_agents + as_test = args.as_test + + # Override these criteria for the pre-training run. + args.stop_iters = 10000 + args.stop_timesteps = 100000000 + args.stop_reward = args.stop_reward_pretraining + args.num_agents = 0 + args.as_test = False + + # Define our pre-training single-agent algorithm. We will use the same module + # configuration for the pre-training and the training. + config = ( + PPOConfig() + .environment("CartPole-v1") + .rl_module( + # Use a different number of hidden units for the pre-trained module. + model_config=DefaultModelConfig(fcnet_hiddens=[64]), + ) + ) + + # Run the pre-training.
+ results = run_rllib_example_script_experiment(config, args, keep_ray_up=True) + # Get the checkpoint path. + module_chkpt_path = ( + Path(results.get_best_result().checkpoint.path) + / COMPONENT_LEARNER_GROUP + / COMPONENT_LEARNER + / COMPONENT_RL_MODULE + / DEFAULT_MODULE_ID + ) + assert module_chkpt_path.is_dir() + + # Restore the user's stopping criteria for the training run. + args.stop_iters = stop_iters + args.stop_timesteps = stop_timesteps + args.stop_reward = stop_reward + args.num_agents = num_agents + args.as_test = as_test + + # Create a new MultiRLModule using the pre-trained module for policy 0. + env = gym.make("CartPole-v1") + module_specs = {} + module_class = PPOTorchRLModule + for i in range(args.num_agents): + module_specs[f"p{i}"] = RLModuleSpec( + module_class=PPOTorchRLModule, + observation_space=env.observation_space, + action_space=env.action_space, + model_config=DefaultModelConfig(fcnet_hiddens=[32]), + catalog_class=PPOCatalog, + ) + + # Swap in the pre-trained module for policy 0. + module_specs["p0"] = RLModuleSpec( + module_class=PPOTorchRLModule, + observation_space=env.observation_space, + action_space=env.action_space, + model_config=DefaultModelConfig(fcnet_hiddens=[64]), + catalog_class=PPOCatalog, + # Note, we load here the module directly from the checkpoint. + load_state_path=module_chkpt_path, + ) + multi_rl_module_spec = MultiRLModuleSpec(rl_module_specs=module_specs) + + # Register our environment with tune if we use multiple agents. + register_env( + "multi-cart", + lambda _: MultiAgentCartPole(config={"num_agents": args.num_agents}), + ) + + # Configure the main (multi-agent) training run. + config = ( + PPOConfig() + .environment("multi-cart") + .multi_agent( + policies={f"p{i}" for i in range(args.num_agents)}, + policy_mapping_fn=lambda aid, eps, **kw: f"p{aid}", + ) + .rl_module(rl_module_spec=multi_rl_module_spec) + ) + + # Run the main training run. + run_rllib_example_script_experiment(config, args)