diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9b9fcf62061fcd127d82e39682683ec9eff3b441 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/__pycache__/centralized_critic.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/__pycache__/centralized_critic.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e2cbc69685b54de79d1648106ea6eaaa544a2e1b Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/__pycache__/centralized_critic.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/__pycache__/compute_adapted_gae_on_postprocess_trajectory.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/__pycache__/compute_adapted_gae_on_postprocess_trajectory.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f0aa8e2197aad66c093af34b193dacb2d2d5f885 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/__pycache__/compute_adapted_gae_on_postprocess_trajectory.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/__pycache__/quadx_waypoints.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/__pycache__/quadx_waypoints.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..04fb7a71026ce236f71b4d07e5b17ce775801870 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/__pycache__/quadx_waypoints.cpython-311.pyc differ diff --git 
a/.venv/lib/python3.11/site-packages/ray/rllib/examples/__pycache__/replay_buffer_api.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/__pycache__/replay_buffer_api.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7f0995f945759e04fb54548f80e15d86574143de Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/__pycache__/replay_buffer_api.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/policy/__pycache__/cliff_walking_wall_policy.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/policy/__pycache__/cliff_walking_wall_policy.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c0fdab5667db51eaf83de541baf54899cab36d68 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/policy/__pycache__/cliff_walking_wall_policy.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/actions/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/actions/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/actions/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/actions/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e868f479d3977ea062144e6f35bfc0cddc165716 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/actions/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/actions/__pycache__/autoregressive_actions.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/actions/__pycache__/autoregressive_actions.cpython-311.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..0f10bb65798fb819a14e6bd06c3e935db6480acc Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/actions/__pycache__/autoregressive_actions.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/actions/autoregressive_actions.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/actions/autoregressive_actions.py new file mode 100644 index 0000000000000000000000000000000000000000..abb9f21c3333e9ce75249104a1f18457d517cbfb --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/actions/autoregressive_actions.py @@ -0,0 +1,109 @@ +"""Example on how to define and run with an RLModule with a dependent action space. + +This examples: + - Shows how to write a custom RLModule outputting autoregressive actions. + The RLModule class used here implements a prior distribution for the first couple + of actions and then uses the sampled actions to compute the parameters for and + sample from a posterior distribution. + - Shows how to configure a PPO algorithm to use the custom RLModule. + - Stops the training after 100k steps or when the mean episode return + exceeds -0.012 in evaluation, i.e. if the agent has learned to + synchronize its actions. + +For details on the environment used, take a look at the `CorrelatedActionsEnv` +class. To receive an episode return over 100, the agent must learn how to synchronize +its actions. + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack --num-env-runners 2` + +Control the number of `EnvRunner`s with the `--num-env-runners` flag. This +will increase the sampling speed. + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. 
+ +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- +You should reach an episode return of better than -0.5 quickly through a simple PPO +policy. The logic behind beating the env is roughly: + +OBS: optimal a1: r1: optimal a2: r2: +-1 2 0 -1.0 0 +-0.5 1/2 -0.5 -0.5/-1.5 0 +0 1 0 -1.0 0 +0.5 0/1 -0.5 -0.5/-1.5 0 +1 0 0 -1.0 0 + +Meaning, most of the time, you would receive a reward better than -0.5, but worse than +0.0. + ++--------------------------------------+------------+--------+------------------+ +| Trial name | status | iter | total time (s) | +| | | | | +|--------------------------------------+------------+--------+------------------+ +| PPO_CorrelatedActionsEnv_6660d_00000 | TERMINATED | 76 | 132.438 | ++--------------------------------------+------------+--------+------------------+ ++------------------------+------------------------+------------------------+ +| episode_return_mean | num_env_steps_sample | ...env_steps_sampled | +| | d_lifetime | _lifetime_throughput | +|------------------------+------------------------+------------------------| +| -0.43 | 152000 | 1283.48 | ++------------------------+------------------------+------------------------+ +""" + +from ray.rllib.algorithms.ppo import PPOConfig +from ray.rllib.core.rl_module.rl_module import RLModuleSpec +from ray.rllib.examples.envs.classes.correlated_actions_env import CorrelatedActionsEnv +from ray.rllib.examples.rl_modules.classes.autoregressive_actions_rlm import ( + AutoregressiveActionsRLM, +) +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) + + +parser = add_rllib_example_script_args( + default_iters=1000, + default_timesteps=2000000, + default_reward=-0.45, +) +parser.set_defaults(enable_new_api_stack=True) + + +if __name__ == "__main__": + 
args = parser.parse_args() + + if args.algo != "PPO": + raise ValueError( + "This example script only runs with PPO! Set --algo=PPO on the command " + "line." + ) + + base_config = ( + PPOConfig() + .environment(CorrelatedActionsEnv) + .training( + train_batch_size_per_learner=2000, + num_epochs=12, + minibatch_size=256, + entropy_coeff=0.005, + lr=0.0003, + ) + # Specify the RLModule class to be used. + .rl_module( + rl_module_spec=RLModuleSpec(module_class=AutoregressiveActionsRLM), + ) + ) + + run_rllib_example_script_experiment(base_config, args) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/actions/nested_action_spaces.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/actions/nested_action_spaces.py new file mode 100644 index 0000000000000000000000000000000000000000..5d28b3f622fdd5753f1f5ab3ef669faedd9db962 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/actions/nested_action_spaces.py @@ -0,0 +1,86 @@ +from gymnasium.spaces import Dict, Tuple, Box, Discrete, MultiDiscrete + +from ray.tune.registry import register_env +from ray.rllib.connectors.env_to_module import FlattenObservations +from ray.rllib.examples.envs.classes.multi_agent import ( + MultiAgentNestedSpaceRepeatAfterMeEnv, +) +from ray.rllib.examples.envs.classes.nested_space_repeat_after_me_env import ( + NestedSpaceRepeatAfterMeEnv, +) +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.tune.registry import get_trainable_cls + + +# Read in common example script command line arguments. +parser = add_rllib_example_script_args(default_timesteps=200000, default_reward=-500.0) +parser.set_defaults(enable_new_api_stack=True) + + +if __name__ == "__main__": + args = parser.parse_args() + + # Define env-to-module-connector pipeline for the new stack. 
+ def _env_to_module_pipeline(env): + return FlattenObservations(multi_agent=args.num_agents > 0) + + # Register our environment with tune. + if args.num_agents > 0: + register_env( + "env", + lambda c: MultiAgentNestedSpaceRepeatAfterMeEnv( + config=dict(c, **{"num_agents": args.num_agents}) + ), + ) + else: + register_env("env", lambda c: NestedSpaceRepeatAfterMeEnv(c)) + + # Define the AlgorithmConfig used. + base_config = ( + get_trainable_cls(args.algo) + .get_default_config() + .environment( + "env", + env_config={ + "space": Dict( + { + "a": Tuple( + [Dict({"d": Box(-15.0, 3.0, ()), "e": Discrete(3)})] + ), + "b": Box(-10.0, 10.0, (2,)), + "c": MultiDiscrete([3, 3]), + "d": Discrete(2), + } + ), + "episode_len": 100, + }, + ) + .env_runners(env_to_module_connector=_env_to_module_pipeline) + # No history in Env (bandit problem). + .training( + gamma=0.0, + lr=0.0005, + ) + ) + + # Add a simple multi-agent setup. + if args.num_agents > 0: + base_config.multi_agent( + policies={f"p{i}" for i in range(args.num_agents)}, + policy_mapping_fn=lambda aid, *a, **kw: f"p{aid}", + ) + + # Fix some PPO-specific settings. + if args.algo == "PPO": + base_config.training( + # We don't want high entropy in this Env. + entropy_coeff=0.00005, + num_epochs=4, + vf_loss_coeff=0.01, + ) + + # Run everything as configured. 
+ run_rllib_example_script_experiment(base_config, args) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8da4317de4a24a77d9bab9d823d646824c56f58e Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/count_based_curiosity.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/count_based_curiosity.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1828ede5a40d36666bccd88d100d04fd43dc7880 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/count_based_curiosity.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/euclidian_distance_based_curiosity.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/euclidian_distance_based_curiosity.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..18274a8c64598fb13b5ea717fe320175d67a2cbd Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/euclidian_distance_based_curiosity.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/flatten_observations_dict_space.cpython-311.pyc 
b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/flatten_observations_dict_space.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bc418a1fe230ca2b8c2615f4bf2f08257a421920 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/flatten_observations_dict_space.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/frame_stacking.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/frame_stacking.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cbb39532e2a82c212a1880865a84773e737429cd Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/frame_stacking.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/mean_std_filtering.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/mean_std_filtering.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ea038ef644619c93d8169f5aee66c103e33b5dab Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/mean_std_filtering.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/prev_actions_prev_rewards.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/prev_actions_prev_rewards.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0997013693fead49f007582cd037cffd9cef1ca9 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/prev_actions_prev_rewards.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/__init__.py 
b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3d211228568beee9d82c837c3dfd4f03ef5d4c74 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/__pycache__/count_based_curiosity.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/__pycache__/count_based_curiosity.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..907fc442cd515432fffb4a963db5b4211fcf97a3 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/__pycache__/count_based_curiosity.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/__pycache__/euclidian_distance_based_curiosity.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/__pycache__/euclidian_distance_based_curiosity.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7b86bc505ff3f2e556fa588b872c6aa578ee3286 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/__pycache__/euclidian_distance_based_curiosity.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/__pycache__/protobuf_cartpole_observation_decoder.cpython-311.pyc 
b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/__pycache__/protobuf_cartpole_observation_decoder.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ae7f158afddd5b30e6ac2445575a1b18aa2e1704 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/__pycache__/protobuf_cartpole_observation_decoder.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/count_based_curiosity.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/count_based_curiosity.py new file mode 100644 index 0000000000000000000000000000000000000000..1f865e3a8ae8f0b91161ca41449b3aa97e98776d --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/count_based_curiosity.py @@ -0,0 +1,92 @@ +from collections import Counter +from typing import Any, List, Optional + +import gymnasium as gym + +from ray.rllib.connectors.connector_v2 import ConnectorV2 +from ray.rllib.core.rl_module.rl_module import RLModule +from ray.rllib.utils.typing import EpisodeType + + +class CountBasedCuriosity(ConnectorV2): + """Learner ConnectorV2 piece to compute intrinsic rewards based on obs counts. + + Add this connector piece to your Learner pipeline, through your algo config: + ``` + config.training( + learner_connector=lambda obs_sp, act_sp: CountBasedCuriosity() + ) + ``` + + Intrinsic rewards are computed on the Learner side based on naive observation + counts, which is why this connector should only be used for simple environments + with a reasonable number of possible observations. The intrinsic reward for a given + timestep is: + r(i) = intrinsic_reward_coeff * (1 / C(obs(i))) + where C is the total (lifetime) count of the obs at timestep i. + + The intrinsic reward is added to the extrinsic reward and saved back into the + episode (under the main "rewards" key). 
+ + Note that the computation and saving back to the episode all happens before the + actual train batch is generated from the episode data. Thus, the Learner and the + RLModule used do not take notice of the extra reward added. + + If you would like to use a more sophisticated mechanism for intrinsic reward + computations, take a look at the `EuclidianDistanceBasedCuriosity` connector piece + at `ray.rllib.examples.connectors.classes.euclidian_distance_based_curiosity` + """ + + def __init__( + self, + input_observation_space: Optional[gym.Space] = None, + input_action_space: Optional[gym.Space] = None, + *, + intrinsic_reward_coeff: float = 1.0, + **kwargs, + ): + """Initializes a CountBasedCuriosity instance. + + Args: + intrinsic_reward_coeff: The weight with which to multiply the intrinsic + reward before adding (and saving) it back to the main (extrinsic) + reward of the episode at each timestep. + """ + super().__init__(input_observation_space, input_action_space) + + # Naive observation counter. + self._counts = Counter() + self.intrinsic_reward_coeff = intrinsic_reward_coeff + + def __call__( + self, + *, + rl_module: RLModule, + batch: Any, + episodes: List[EpisodeType], + explore: Optional[bool] = None, + shared_data: Optional[dict] = None, + **kwargs, + ) -> Any: + # Loop through all episodes and change the reward to + # [reward + intrinsic reward] + for sa_episode in self.single_agent_episode_iterator( + episodes=episodes, agents_that_stepped_only=False + ): + # Loop through all obs, except the last one. + observations = sa_episode.get_observations(slice(None, -1)) + # Get all respective (extrinsic) rewards. + rewards = sa_episode.get_rewards() + + for i, (obs, rew) in enumerate(zip(observations, rewards)): + obs = tuple(obs) + # Add 1 to obs counter. + self._counts[obs] += 1 + # Compute our count-based intrinsic reward and add it to the main + # (extrinsic) reward. 
+ rew += self.intrinsic_reward_coeff * (1 / self._counts[obs]) + # Store the new reward back to the episode (under the correct + # timestep/index). + sa_episode.set_rewards(new_data=rew, at_indices=i) + + return batch diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/euclidian_distance_based_curiosity.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/euclidian_distance_based_curiosity.py new file mode 100644 index 0000000000000000000000000000000000000000..c50a2caae5d744e40a95f1c78608ca0f99050398 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/euclidian_distance_based_curiosity.py @@ -0,0 +1,122 @@ +from collections import deque +from typing import Any, List, Optional + +import gymnasium as gym +import numpy as np + +from ray.rllib.connectors.connector_v2 import ConnectorV2 +from ray.rllib.core.rl_module.rl_module import RLModule +from ray.rllib.utils.typing import EpisodeType + + +class EuclidianDistanceBasedCuriosity(ConnectorV2): + """Learner ConnectorV2 piece computing intrinsic rewards with euclidian distance. + + Add this connector piece to your Learner pipeline, through your algo config: + ``` + config.training( + learner_connector=lambda obs_sp, act_sp: EuclidianDistanceBasedCuriosity() + ) + ``` + + Intrinsic rewards are computed on the Learner side based on comparing the euclidian + distance of observations vs already seen ones. A configurable number of observations + will be stored in a FIFO buffer and all incoming observations have their distance + measured against those. + + The minimum distance measured is the intrinsic reward for the incoming obs + (multiplied by a fixed coeffieicnt and added to the "main" extrinsic reward): + r(i) = intrinsic_reward_coeff * min(ED(o, o(i)) for o in stored_obs)) + where `ED` is the euclidian distance and `stored_obs` is the buffer. 
+ + The intrinsic reward is then added to the extrinsic reward and saved back into the + episode (under the main "rewards" key). + + Note that the computation and saving back to the episode all happens before the + actual train batch is generated from the episode data. Thus, the Learner and the + RLModule used do not take notice of the extra reward added. + + Only one observation per incoming episode will be stored as a new one in the buffer. + Thereby, we pick the observation with the largest `min(ED)` value over all already + stored observations to be stored per episode. + + If you would like to use a simpler, count-based mechanism for intrinsic reward + computations, take a look at the `CountBasedCuriosity` connector piece + at `ray.rllib.examples.connectors.classes.count_based_curiosity` + """ + + def __init__( + self, + input_observation_space: Optional[gym.Space] = None, + input_action_space: Optional[gym.Space] = None, + *, + intrinsic_reward_coeff: float = 1.0, + max_buffer_size: int = 100, + **kwargs, + ): + """Initializes a CountBasedCuriosity instance. + + Args: + intrinsic_reward_coeff: The weight with which to multiply the intrinsic + reward before adding (and saving) it back to the main (extrinsic) + reward of the episode at each timestep. + """ + super().__init__(input_observation_space, input_action_space) + + # Create an observation buffer + self.obs_buffer = deque(maxlen=max_buffer_size) + self.intrinsic_reward_coeff = intrinsic_reward_coeff + + self._test = 0 + + def __call__( + self, + *, + rl_module: RLModule, + batch: Any, + episodes: List[EpisodeType], + explore: Optional[bool] = None, + shared_data: Optional[dict] = None, + **kwargs, + ) -> Any: + if self._test > 10: + return batch + self._test += 1 + # Loop through all episodes and change the reward to + # [reward + intrinsic reward] + for sa_episode in self.single_agent_episode_iterator( + episodes=episodes, agents_that_stepped_only=False + ): + # Loop through all obs, except the last one. 
+ observations = sa_episode.get_observations(slice(None, -1)) + # Get all respective (extrinsic) rewards. + rewards = sa_episode.get_rewards() + + max_dist_obs = None + max_dist = float("-inf") + for i, (obs, rew) in enumerate(zip(observations, rewards)): + # Compare obs to all stored observations and compute euclidian distance. + min_dist = 0.0 + if self.obs_buffer: + min_dist = min( + np.sqrt(np.sum((obs - stored_obs) ** 2)) + for stored_obs in self.obs_buffer + ) + if min_dist > max_dist: + max_dist = min_dist + max_dist_obs = obs + + # Compute our euclidian distance-based intrinsic reward and add it to + # the main (extrinsic) reward. + rew += self.intrinsic_reward_coeff * min_dist + # Store the new reward back to the episode (under the correct + # timestep/index). + sa_episode.set_rewards(new_data=rew, at_indices=i) + + # Add the one observation of this episode with the largest (min) euclidian + # dist to all already stored obs to the buffer (maybe throwing out the + # oldest obs in there). 
+ if max_dist_obs is not None: + self.obs_buffer.append(max_dist_obs) + + return batch diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/protobuf_cartpole_observation_decoder.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/protobuf_cartpole_observation_decoder.py new file mode 100644 index 0000000000000000000000000000000000000000..2ed4a891afcd4e64cb50148446ecca5cc1e1e06c --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/protobuf_cartpole_observation_decoder.py @@ -0,0 +1,80 @@ +from typing import Any, List, Optional + +import gymnasium as gym +import numpy as np + +from ray.rllib.connectors.connector_v2 import ConnectorV2 +from ray.rllib.core.rl_module.rl_module import RLModule +from ray.rllib.examples.envs.classes.utils.cartpole_observations_proto import ( + CartPoleObservation, +) +from ray.rllib.utils.annotations import override +from ray.rllib.utils.typing import EpisodeType + + +class ProtobufCartPoleObservationDecoder(ConnectorV2): + """Env-to-module ConnectorV2 piece decoding protobuf obs into CartPole-v1 obs. + + Add this connector piece to your env-to-module pipeline, through your algo config: + ``` + config.env_runners( + env_to_module_connector=lambda env: ProtobufCartPoleObservationDecoder() + ) + ``` + + The incoming observation space must be a 1D Box of dtype uint8 + (which is the same as a binary string). The outgoing observation space is the + normal CartPole-v1 1D space: Box(-inf, inf, (4,), float32). + """ + + @override(ConnectorV2) + def recompute_output_observation_space( + self, + input_observation_space: gym.Space, + input_action_space: gym.Space, + ) -> gym.Space: + # Make sure the incoming observation space is a protobuf (binary string). 
+ assert ( + isinstance(input_observation_space, gym.spaces.Box) + and len(input_observation_space.shape) == 1 + and input_observation_space.dtype.name == "uint8" + ) + # Return CartPole-v1's natural observation space. + return gym.spaces.Box(float("-inf"), float("inf"), (4,), np.float32) + + def __call__( + self, + *, + rl_module: RLModule, + batch: Any, + episodes: List[EpisodeType], + explore: Optional[bool] = None, + shared_data: Optional[dict] = None, + **kwargs, + ) -> Any: + # Loop through all episodes and change the observation from a binary string + # to an actual 1D np.ndarray (normal CartPole-v1 obs). + for sa_episode in self.single_agent_episode_iterator(episodes=episodes): + # Get last obs (binary string). + obs = sa_episode.get_observations(-1) + obs_bytes = obs.tobytes() + obs_protobuf = CartPoleObservation() + obs_protobuf.ParseFromString(obs_bytes) + + # Set up the natural CartPole-v1 observation tensor from the protobuf + # values. + new_obs = np.array( + [ + obs_protobuf.x_pos, + obs_protobuf.x_veloc, + obs_protobuf.angle_pos, + obs_protobuf.angle_veloc, + ], + np.float32, + ) + + # Write the new observation (1D tensor) back into the Episode. + sa_episode.set_observations(new_data=new_obs, at_indices=-1) + + # Return `data` as-is. + return batch diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/count_based_curiosity.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/count_based_curiosity.py new file mode 100644 index 0000000000000000000000000000000000000000..ad09e4ceb6bf2665d0766ebe218d071ca2ada19f --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/count_based_curiosity.py @@ -0,0 +1,14 @@ +"""Placeholder for training with count-based curiosity. + +The actual script can be found at a different location (see code below). 
+""" + +if __name__ == "__main__": + import subprocess + import sys + + # Forward to "python ../curiosity/[same script name].py [same options]" + command = [sys.executable, "../curiosity/", sys.argv[0]] + sys.argv[1:] + + # Run the script. + subprocess.run(command, capture_output=True) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/euclidian_distance_based_curiosity.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/euclidian_distance_based_curiosity.py new file mode 100644 index 0000000000000000000000000000000000000000..6e52de76791304545eebcf1aea76fe93cb2c6f39 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/euclidian_distance_based_curiosity.py @@ -0,0 +1,14 @@ +"""Placeholder for training with euclidian distance-based curiosity. + +The actual script can be found at a different location (see code below). +""" + +if __name__ == "__main__": + import subprocess + import sys + + # Forward to "python ../curiosity/[same script name].py [same options]" + command = [sys.executable, "../curiosity/", sys.argv[0]] + sys.argv[1:] + + # Run the script. + subprocess.run(command, capture_output=True) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/flatten_observations_dict_space.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/flatten_observations_dict_space.py new file mode 100644 index 0000000000000000000000000000000000000000..564df75c6b9d76cd86e260556609140a6adf47d0 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/flatten_observations_dict_space.py @@ -0,0 +1,154 @@ +"""Example using a ConnectorV2 to flatten arbitrarily nested dict or tuple observations. + +An RLlib Algorithm has 3 distinct connector pipelines: +- An env-to-module pipeline in an EnvRunner accepting a list of episodes and producing +a batch for an RLModule to compute actions (`forward_inference()` or +`forward_exploration()`). 
+- A module-to-env pipeline in an EnvRunner taking the RLModule's output and converting +it into an action readable by the environment. +- A learner connector pipeline on a Learner taking a list of episodes and producing +a batch for an RLModule to perform the training forward pass (`forward_train()`). + +Each of these pipelines has a fixed set of default ConnectorV2 pieces that RLlib +adds/prepends to these pipelines in order to perform the most basic functionalities. +For example, RLlib adds the `AddObservationsFromEpisodesToBatch` ConnectorV2 into any +env-to-module pipeline to make sure the batch for computing actions contains - at the +minimum - the most recent observation. + +On top of these default ConnectorV2 pieces, users can define their own ConnectorV2 +pieces (or use the ones available already in RLlib) and add them to one of the 3 +different pipelines described above, as required. + +This example: + - shows how the `FlattenObservation` ConnectorV2 piece can be added to the + env-to-module pipeline. + - demonstrates that by using this connector, any arbitrarily nested dict or tuple + observations is properly flattened into a simple 1D tensor, for easier RLModule + processing. + - shows how - in a multi-agent setup - individual agents can be specified, whose + observations should be flattened (while other agents' observations will always + be left as-is). 
+ - uses a variant of the CartPole-v1 environment, in which the 4 observation items + (x-pos, x-veloc, angle, and angle-veloc) are taken apart and put into a nested dict + with the structure: + { + "x-pos": [x-pos], + "angular-pos": { + "value": [angle], + "some_random_stuff": [random Discrete(3)], # <- should be ignored by algo + }, + "velocs": Tuple([x-veloc], [angle-veloc]), + } + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack` + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- + ++---------------------+------------+----------------+--------+------------------+ +| Trial name | status | loc | iter | total time (s) | +| | | | | | +|---------------------+------------+----------------+--------+------------------+ +| PPO_env_a2fd6_00000 | TERMINATED | 127.0.0.1:7409 | 25 | 24.1426 | ++---------------------+------------+----------------+--------+------------------+ ++------------------------+------------------------+------------------------+ +| num_env_steps_sample | num_env_steps_traine | episode_return_mean | +| d_lifetime | d_lifetime | | ++------------------------+------------------------+------------------------| +| 100000 | 100000 | 421.42 | ++------------------------+------------------------+------------------------+ +""" +from ray.tune.registry import register_env +from ray.rllib.connectors.env_to_module import FlattenObservations +from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig +from ray.rllib.examples.envs.classes.cartpole_with_dict_observation_space import ( + 
CartPoleWithDictObservationSpace, +) +from ray.rllib.examples.envs.classes.multi_agent import ( + MultiAgentCartPoleWithDictObservationSpace, +) +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.tune.registry import get_trainable_cls + + +# Read in common example script command line arguments. +parser = add_rllib_example_script_args(default_timesteps=200000, default_reward=400.0) +parser.set_defaults(enable_new_api_stack=True) + + +if __name__ == "__main__": + args = parser.parse_args() + + # Define env-to-module-connector pipeline for the new stack. + def _env_to_module_pipeline(env): + return FlattenObservations(multi_agent=args.num_agents > 0) + + # Register our environment with tune. + if args.num_agents > 0: + register_env( + "env", + lambda _: MultiAgentCartPoleWithDictObservationSpace( + config={"num_agents": args.num_agents} + ), + ) + else: + register_env("env", lambda _: CartPoleWithDictObservationSpace()) + + # Define the AlgorithmConfig used. + base_config = ( + get_trainable_cls(args.algo) + .get_default_config() + .environment("env") + .env_runners(env_to_module_connector=_env_to_module_pipeline) + .training( + gamma=0.99, + lr=0.0003, + ) + .rl_module( + model_config=DefaultModelConfig( + fcnet_hiddens=[32], + fcnet_activation="linear", + vf_share_layers=True, + ), + ) + ) + + # Add a simple multi-agent setup. + if args.num_agents > 0: + base_config.multi_agent( + policies={f"p{i}" for i in range(args.num_agents)}, + policy_mapping_fn=lambda aid, *a, **kw: f"p{aid}", + ) + + # PPO-specific settings (for better learning behavior only). + if args.algo == "PPO": + base_config.training( + num_epochs=6, + vf_loss_coeff=0.01, + ) + # IMPALA-specific settings (for better learning behavior only). + elif args.algo == "IMPALA": + base_config.training( + lr=0.0005, + vf_loss_coeff=0.05, + entropy_coeff=0.0, + ) + + # Run everything as configured. 
+ run_rllib_example_script_experiment(base_config, args) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/frame_stacking.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/frame_stacking.py new file mode 100644 index 0000000000000000000000000000000000000000..a22868c374cfa6cca835c0efd636f38d21f85c87 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/frame_stacking.py @@ -0,0 +1,228 @@ +"""Example using 2 ConnectorV2 for observation frame-stacking in Atari environments. + +An RLlib Algorithm has 3 distinct connector pipelines: +- An env-to-module pipeline in an EnvRunner accepting a list of episodes and producing +a batch for an RLModule to compute actions (`forward_inference()` or +`forward_exploration()`). +- A module-to-env pipeline in an EnvRunner taking the RLModule's output and converting +it into an action readable by the environment. +- A learner connector pipeline on a Learner taking a list of episodes and producing +a batch for an RLModule to perform the training forward pass (`forward_train()`). + +Each of these pipelines has a fixed set of default ConnectorV2 pieces that RLlib +adds/prepends to these pipelines in order to perform the most basic functionalities. +For example, RLlib adds the `AddObservationsFromEpisodesToBatch` ConnectorV2 into any +env-to-module pipeline to make sure the batch for computing actions contains - at the +minimum - the most recent observation. + +On top of these default ConnectorV2 pieces, users can define their own ConnectorV2 +pieces (or use the ones available already in RLlib) and add them to one of the 3 +different pipelines described above, as required. + +This example: + - shows how the `FrameStackingEnvToModule` ConnectorV2 piece can be added to the + env-to-module pipeline. + - shows how the `FrameStackingLearner` ConnectorV2 piece can be added to the + learner connector pipeline. 
+ - demonstrates that using these two pieces (rather than performing framestacking + already inside the environment using a gymnasium wrapper) increases overall + performance by about 5%. + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack --num-frames=4 --env=ALE/Pong-v5` + +Use the `--num-frames` option to define the number of observations to framestack. +If you don't want to use Connectors to perform the framestacking, set the +`--use-gym-wrapper-framestacking` flag to perform framestacking already inside a +gymnasium observation wrapper. In this case though, be aware that the tensors being +sent through the network are `--num-frames` x larger than if you use the Connector +setup. + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- + +With `--num-frames=4` and using the two extra ConnectorV2 pieces (in the env-to-module +and learner connector pipelines), you should see something like this using: +`--env ALE/Pong-v5 --num-learners=4 --num-gpus-per-learner=1 --num-env-runners=95` ++---------------------------+------------+--------+------------------+... +| Trial name | status | iter | total time (s) | +| | | | | +|---------------------------+------------+--------+------------------+... +| PPO_atari-env_2fc4a_00000 | TERMINATED | 200 | 335.837 | ++---------------------------+------------+--------+------------------+... 
+ +Note that the time to run these 200 iterations is about ~5% faster than when +performing framestacking already inside the environment (using a +`gymnasium.wrappers.ObservationWrapper`), due to the additional network traffic +needed (sending back 4x[obs] batches instead of 1x[obs] to the learners). + +Thus, with the `--use-gym-wrapper-framestacking` option (all other options being equal), +the output looks like this: ++---------------------------+------------+--------+------------------+... +| Trial name | status | iter | total time (s) | +| | | | | +|---------------------------+------------+--------+------------------+... +| PPO_atari-env_2fc4a_00000 | TERMINATED | 200 | 351.505 | ++---------------------------+------------+--------+------------------+... +""" +import gymnasium as gym + +from ray.rllib.connectors.env_to_module.frame_stacking import FrameStackingEnvToModule +from ray.rllib.connectors.learner.frame_stacking import FrameStackingLearner +from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig +from ray.rllib.env.wrappers.atari_wrappers import wrap_atari_for_new_api_stack +from ray.rllib.examples.envs.classes.multi_agent import make_multi_agent +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.tune.registry import get_trainable_cls + +# Read in common example script command line arguments. +parser = add_rllib_example_script_args( + default_timesteps=5000000, default_reward=20.0, default_iters=200 +) +# Use Pong by default. 
+parser.set_defaults( + enable_new_api_stack=True, + env="ale_py:ALE/Pong-v5", +) +parser.add_argument( + "--num-frames", + type=int, + default=4, + help="The number of observation frames to stack.", +) +parser.add_argument( + "--use-gym-wrapper-framestacking", + action="store_true", + help="Whether to use RLlib's Atari wrapper's framestacking capabilities (as " + "opposed to doing it via a specific ConenctorV2 pipeline).", +) + + +if __name__ == "__main__": + from ray import tune + + args = parser.parse_args() + + # Define our custom connector pipelines. + def _make_env_to_module_connector(env): + # Create the env-to-module connector. We return an individual connector piece + # here, which RLlib automatically integrates into a pipeline (and + # add its default connector piece to the end of that pipeline). + # The default pipeline automatically fixes the input- and output spaces of the + # individual connector pieces in it. + # Note that since the frame stacking connector does NOT write information + # back to the episode (in order to save memory and network traffic), we + # also need to perform the same procedure on the Learner end (see below + # where we set up the Learner pipeline). + return FrameStackingEnvToModule( + num_frames=args.num_frames, + multi_agent=args.num_agents > 0, + ) + + def _make_learner_connector(input_observation_space, input_action_space): + # Create the learner connector. + return FrameStackingLearner( + num_frames=args.num_frames, + multi_agent=args.num_agents > 0, + ) + + # Create a custom Atari setup (w/o the usual RLlib-hard-coded framestacking in it). + # We would like our frame stacking connector to do this job. + def _env_creator(cfg): + return wrap_atari_for_new_api_stack( + gym.make(args.env, **cfg, **{"render_mode": "rgb_array"}), + # Perform framestacking either through ConnectorV2 or right here through + # the observation wrapper. 
+ framestack=( + args.num_frames if args.use_gym_wrapper_framestacking else None + ), + ) + + if args.num_agents > 0: + tune.register_env( + "atari-env", + lambda cfg: make_multi_agent(_env_creator)( + dict(cfg, **{"num_agents": args.num_agents}) + ), + ) + else: + tune.register_env("atari-env", _env_creator) + + base_config = ( + get_trainable_cls(args.algo) + .get_default_config() + .environment( + "atari-env", + env_config={ + # Make analogous to old v4 + NoFrameskip. + "frameskip": 1, + "full_action_space": False, + "repeat_action_probability": 0.0, + }, + clip_rewards=True, + ) + .env_runners( + # ... new EnvRunner and our frame stacking env-to-module connector. + env_to_module_connector=( + None + if args.use_gym_wrapper_framestacking + else _make_env_to_module_connector + ), + num_envs_per_env_runner=1 if args.num_agents > 0 else 2, + ) + .training( + # Use our frame stacking learner connector. + learner_connector=( + None if args.use_gym_wrapper_framestacking else _make_learner_connector + ), + entropy_coeff=0.01, + # Linearly adjust learning rate based on number of GPUs. + lr=0.00015 * (args.num_learners or 1), + grad_clip=100.0, + grad_clip_by="global_norm", + ) + .rl_module( + model_config=DefaultModelConfig( + vf_share_layers=True, + conv_filters=[(16, 4, 2), (32, 4, 2), (64, 4, 2), (128, 4, 2)], + conv_activation="relu", + head_fcnet_hiddens=[256], + ), + ) + ) + + # PPO specific settings. + if args.algo == "PPO": + base_config.training( + num_epochs=10, + minibatch_size=64, + lambda_=0.95, + kl_coeff=0.5, + clip_param=0.1, + vf_clip_param=10.0, + ) + + # Add a simple multi-agent setup. + if args.num_agents > 0: + base_config.multi_agent( + policies={f"p{i}" for i in range(args.num_agents)}, + policy_mapping_fn=lambda aid, *a, **kw: f"p{aid}", + ) + + # Run everything as configured. 
+ run_rllib_example_script_experiment(base_config, args) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/mean_std_filtering.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/mean_std_filtering.py new file mode 100644 index 0000000000000000000000000000000000000000..aaccbf02cddbbd88eb3b341edcfc3d0777744b09 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/mean_std_filtering.py @@ -0,0 +1,198 @@ +"""Example using a ConnectorV2 for processing observations with a mean/std filter. + +An RLlib Algorithm has 3 distinct connector pipelines: +- An env-to-module pipeline in an EnvRunner accepting a list of episodes and producing +a batch for an RLModule to compute actions (`forward_inference()` or +`forward_exploration()`). +- A module-to-env pipeline in an EnvRunner taking the RLModule's output and converting +it into an action readable by the environment. +- A learner connector pipeline on a Learner taking a list of episodes and producing +a batch for an RLModule to perform the training forward pass (`forward_train()`). + +Each of these pipelines has a fixed set of default ConnectorV2 pieces that RLlib +adds/prepends to these pipelines in order to perform the most basic functionalities. +For example, RLlib adds the `AddObservationsFromEpisodesToBatch` ConnectorV2 into any +env-to-module pipeline to make sure the batch for computing actions contains - at the +minimum - the most recent observation. + +On top of these default ConnectorV2 pieces, users can define their own ConnectorV2 +pieces (or use the ones available already in RLlib) and add them to one of the 3 +different pipelines described above, as required. + +This example: + - shows how the `MeanStdFilter` ConnectorV2 piece can be added to the env-to-module + pipeline. 
+ - demonstrates that using such a filter enhances learning behavior (or even makes + if possible to learn overall) in some environments, especially those with lopsided + observation spaces, for example `Box(-3000, -1000, ...)`. + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack` + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- +Running this example with the mean-std filter results in the normally expected Pendulum +learning behavior: ++-------------------------------+------------+-----------------+--------+ +| Trial name | status | loc | iter | +| | | | | +|-------------------------------+------------+-----------------+--------+ +| PPO_lopsided-pend_f9c96_00000 | TERMINATED | 127.0.0.1:43612 | 77 | ++-------------------------------+------------+-----------------+--------+ ++------------------+------------------------+-----------------------+ +| total time (s) | num_env_steps_sample | episode_return_mean | +| | d_lifetime | | +|------------------+------------------------+-----------------------| +| 30.7466 | 40040 | -276.3 | ++------------------+------------------------+-----------------------+ + +If you try using the `--disable-mean-std-filter` (all other things being equal), you +will either see no learning progress at all (or a very slow one), but more likely some +numerical instability related error will be thrown: + +ValueError: Expected parameter loc (Tensor of shape (64, 1)) of distribution + Normal(loc: torch.Size([64, 1]), scale: torch.Size([64, 1])) to satisfy the + constraint Real(), but 
found invalid values: +tensor([[nan], + [nan], + [nan], + ... +""" +import gymnasium as gym +import numpy as np + +from ray.rllib.connectors.env_to_module.mean_std_filter import MeanStdFilter +from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig +from ray.rllib.examples.envs.classes.multi_agent import MultiAgentPendulum +from ray.rllib.utils.framework import try_import_torch +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.tune.registry import get_trainable_cls, register_env + +torch, _ = try_import_torch() + +parser = add_rllib_example_script_args( + default_iters=500, + default_timesteps=500000, + default_reward=-300.0, +) +parser.add_argument( + "--disable-mean-std-filter", + action="store_true", + help="Run w/o a mean/std env-to-module connector piece (filter).", +) + + +class LopsidedObs(gym.ObservationWrapper): + def __init__(self, env): + super().__init__(env) + self.observation_space = gym.spaces.Box(-4000.0, -1456.0, (3,), np.float32) + + def observation(self, observation): + # Lopside [-1.0, 1.0] Pendulum observations + return ((observation + 1.0) / 2.0) * (4000.0 - 1456.0) - 4000.0 + + +if __name__ == "__main__": + args = parser.parse_args() + + assert ( + args.enable_new_api_stack + ), "Must set --enable-new-api-stack when running this script!" + + # Register our environment with tune. + if args.num_agents > 0: + register_env( + "lopsided-pend", + lambda _: MultiAgentPendulum(config={"num_agents": args.num_agents}), + ) + else: + register_env("lopsided-pend", lambda _: LopsidedObs(gym.make("Pendulum-v1"))) + + base_config = ( + get_trainable_cls(args.algo) + .get_default_config() + .environment("lopsided-pend") + .env_runners( + # TODO (sven): MAEnvRunner does not support vectorized envs yet + # due to gym's env checkers and non-compatability with RLlib's + # MultiAgentEnv API. 
+ num_envs_per_env_runner=1 if args.num_agents > 0 else 20, + # Define a single connector piece to be prepended to the env-to-module + # connector pipeline. + # Alternatively, return a list of n ConnectorV2 pieces (which will then be + # included in an automatically generated EnvToModulePipeline or return a + # EnvToModulePipeline directly. + env_to_module_connector=( + None + if args.disable_mean_std_filter + else lambda env: MeanStdFilter(multi_agent=args.num_agents > 0) + ), + ) + .training( + train_batch_size_per_learner=512, + gamma=0.95, + # Linearly adjust learning rate based on number of GPUs. + lr=0.0003 * (args.num_learners or 1), + vf_loss_coeff=0.01, + ) + .rl_module( + model_config=DefaultModelConfig( + fcnet_activation="relu", + fcnet_kernel_initializer=torch.nn.init.xavier_uniform_, + fcnet_bias_initializer=torch.nn.init.constant_, + fcnet_bias_initializer_kwargs={"val": 0.0}, + ), + ) + # In case you would like to run with a evaluation EnvRunners, make sure your + # `evaluation_config` key contains the `use_worker_filter_stats=False` setting + # (see below). This setting makes sure that the mean/std stats collected by the + # evaluation EnvRunners are NOT used for the training EnvRunners (unless you + # really want to mix these stats). It's normally a good idea to keep the stats + # collected during evaluation completely out of the training data (already for + # better reproducibility alone). + # .evaluation( + # evaluation_num_env_runners=1, + # evaluation_interval=1, + # evaluation_config={ + # "explore": False, + # # Do NOT use the eval EnvRunners' ConnectorV2 states. Instead, before + # # each round of evaluation, broadcast the latest training + # # EnvRunnerGroup's ConnectorV2 states (merged from all training remote + # # EnvRunners) to the eval EnvRunnerGroup (and discard the eval + # # EnvRunners' stats). + # "use_worker_filter_stats": False, + # }, + # ) + ) + + # PPO specific settings. 
+ if args.algo == "PPO": + base_config.training( + minibatch_size=64, + lambda_=0.1, + vf_clip_param=10.0, + ) + + # Add a simple multi-agent setup. + if args.num_agents > 0: + base_config.multi_agent( + policies={f"p{i}" for i in range(args.num_agents)}, + policy_mapping_fn=lambda aid, *a, **kw: f"p{aid}", + ) + + run_rllib_example_script_experiment(base_config, args) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/prev_actions_prev_rewards.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/prev_actions_prev_rewards.py new file mode 100644 index 0000000000000000000000000000000000000000..1fa1e6681b90dfcad83b3f765599d8dd4f724ace --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/prev_actions_prev_rewards.py @@ -0,0 +1,164 @@ +"""Example using a ConnectorV2 to add previous rewards/actions to an RLModule's input. + +An RLlib Algorithm has 3 distinct connector pipelines: +- An env-to-module pipeline in an EnvRunner accepting a list of episodes and producing +a batch for an RLModule to compute actions (`forward_inference()` or +`forward_exploration()`). +- A module-to-env pipeline in an EnvRunner taking the RLModule's output and converting +it into an action readable by the environment. +- A learner connector pipeline on a Learner taking a list of episodes and producing +a batch for an RLModule to perform the training forward pass (`forward_train()`). + +Each of these pipelines has a fixed set of default ConnectorV2 pieces that RLlib +adds/prepends to these pipelines in order to perform the most basic functionalities. +For example, RLlib adds the `AddObservationsFromEpisodesToBatch` ConnectorV2 into any +env-to-module pipeline to make sure the batch for computing actions contains - at the +minimum - the most recent observation. 
+ +On top of these default ConnectorV2 pieces, users can define their own ConnectorV2 +pieces (or use the ones available already in RLlib) and add them to one of the 3 +different pipelines described above, as required. + +This example: + - shows how the `PrevActionsPrevRewards` ConnectorV2 piece can be added to the + env-to-module pipeline to extract previous rewards and/or actions from the ongoing + episodes. + - shows how this connector creates and wraps this new information (rewards and + actions) together with the original observations into the RLModule's input dict + under a new `gym.spaces.Dict` structure (for example, if your observation space + is `O=Box(shape=(3,))` and you add the most recent 1 reward, the new observation + space will be `Dict({"_original_obs": O, "prev_n_rewards": Box(shape=())})`. + - demonstrates how to use RLlib's `FlattenObservations` right after the + `PrevActionsPrevRewards` to flatten that new dict observation structure again into + a single 1D tensor. + - uses the StatelessCartPole environment, a CartPole-v1 derivative that's missing + both x-veloc and angle-veloc observation components and is therefore non-Markovian + (only partially observable). An LSTM default model is used for training. Adding + the additional context to the observations (for example, prev. actions) helps the + LSTM to more quickly learn in this environment. + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack --num-frames=4 --env=ALE/Pong-v5` + +Use the `--num-frames` option to define the number of observations to framestack. +If you don't want to use Connectors to perform the framestacking, set the +`--use-gym-wrapper-framestacking` flag to perform framestacking already inside a +gymnasium observation wrapper. In this case though, be aware that the tensors being +sent through the network are `--num-frames` x larger than if you use the Connector +setup. 
+ +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- + +You should see something similar to this in your terminal output when running +ths script as described above: + ++---------------------+------------+-----------------+--------+------------------+ +| Trial name | status | loc | iter | total time (s) | +| | | | | | +|---------------------+------------+-----------------+--------+------------------+ +| PPO_env_0edd2_00000 | TERMINATED | 127.0.0.1:12632 | 17 | 42.6898 | ++---------------------+------------+-----------------+--------+------------------+ ++------------------------+------------------------+------------------------+ +| num_env_steps_sample | num_env_steps_traine | episode_return_mean | +| d_lifetime | d_lifetime | | +|------------------------+------------------------+------------------------| +| 68000 | 68000 | 205.22 | ++------------------------+------------------------+------------------------+ +""" +from ray.rllib.algorithms.ppo import PPOConfig +from ray.rllib.connectors.env_to_module import ( + FlattenObservations, + PrevActionsPrevRewards, +) +from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig +from ray.rllib.examples.envs.classes.stateless_cartpole import StatelessCartPole +from ray.rllib.examples.envs.classes.multi_agent import MultiAgentStatelessCartPole +from ray.rllib.utils.framework import try_import_torch +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.tune import register_env + +torch, nn = try_import_torch() + + +parser = 
add_rllib_example_script_args( + default_reward=200.0, default_timesteps=1000000, default_iters=2000 +) +parser.set_defaults(enable_new_api_stack=True) +parser.add_argument("--n-prev-rewards", type=int, default=1) +parser.add_argument("--n-prev-actions", type=int, default=1) + + +if __name__ == "__main__": + args = parser.parse_args() + + # Define our custom connector pipelines. + def _env_to_module(env): + # Create the env-to-module connector pipeline. + return [ + PrevActionsPrevRewards( + multi_agent=args.num_agents > 0, + n_prev_rewards=args.n_prev_rewards, + n_prev_actions=args.n_prev_actions, + ), + FlattenObservations(multi_agent=args.num_agents > 0), + ] + + # Register our environment with tune. + if args.num_agents > 0: + register_env( + "env", + lambda _: MultiAgentStatelessCartPole( + config={"num_agents": args.num_agents} + ), + ) + else: + register_env("env", lambda _: StatelessCartPole()) + + config = ( + PPOConfig() + .environment("env") + .env_runners(env_to_module_connector=_env_to_module) + .training( + num_epochs=6, + lr=0.0003, + train_batch_size=4000, + vf_loss_coeff=0.01, + ) + .rl_module( + model_config=DefaultModelConfig( + use_lstm=True, + max_seq_len=20, + fcnet_hiddens=[32], + fcnet_activation="linear", + fcnet_kernel_initializer=nn.init.xavier_uniform_, + fcnet_bias_initializer=nn.init.constant_, + fcnet_bias_initializer_kwargs={"val": 0.0}, + vf_share_layers=True, + ), + ) + ) + + # Add a simple multi-agent setup. 
+ if args.num_agents > 0: + config = config.multi_agent( + policies={f"p{i}" for i in range(args.num_agents)}, + policy_mapping_fn=lambda aid, *a, **kw: f"p{aid}", + ) + + run_rllib_example_script_experiment(config, args) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/evaluation/custom_evaluation.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/evaluation/custom_evaluation.py new file mode 100644 index 0000000000000000000000000000000000000000..9a2a796a08a3778d56c4b4dad7a9b08154abc896 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/evaluation/custom_evaluation.py @@ -0,0 +1,230 @@ +"""Example of customizing the evaluation procedure for an RLlib Algorithm. + +Note, that you should only choose to provide a custom eval function, in case the already +built-in eval options are not sufficient. Normally, though, RLlib's eval utilities +that come with each Algorithm are enough to properly evaluate the learning progress +of your Algorithm. + +This script uses the SimpleCorridor environment, a simple 1D gridworld, in which +the agent can only walk left (action=0) or right (action=1). The goal state is located +at the end of the (1D) corridor. The env exposes an API to change the length of the +corridor on-the-fly. We use this API here to extend the size of the corridor for the +evaluation runs. + +For demonstration purposes only, we define a simple custom evaluation method that does +the following: +- It changes the corridor length of all environments used on the evaluation EnvRunners. +- It runs a defined number of episodes for evaluation purposes. +- It collects the metrics from those runs, summarizes these metrics and returns them. + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack + +You can switch off custom evaluation (and use RLlib's default evaluation procedure) +with the `--no-custom-eval` flag. 
+ +You can switch on parallel evaluation to training using the +`--evaluation-parallel-to-training` flag. See this example script here: +https://github.com/ray-project/ray/blob/master/rllib/examples/evaluation/evaluation_parallel_to_training.py # noqa +for more details on running evaluation parallel to training. + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- +You should see the following (or very similar) console output when running this script. +Note that for each iteration, due to the definition of our custom evaluation function, +we run 3 evaluation rounds per single training round. + +... +Training iteration 1 -> evaluation round 0 +Training iteration 1 -> evaluation round 1 +Training iteration 1 -> evaluation round 2 +... +... 
++--------------------------------+------------+-----------------+--------+ +| Trial name | status | loc | iter | +|--------------------------------+------------+-----------------+--------+ +| PPO_SimpleCorridor_06582_00000 | TERMINATED | 127.0.0.1:69905 | 4 | ++--------------------------------+------------+-----------------+--------+ ++------------------+-------+----------+--------------------+ +| total time (s) | ts | reward | episode_len_mean | +|------------------+-------+----------+--------------------| +| 26.1973 | 16000 | 0.872034 | 13.7966 | ++------------------+-------+----------+--------------------+ +""" +from typing import Tuple + +from ray.air.constants import TRAINING_ITERATION +from ray.rllib.algorithms.algorithm import Algorithm +from ray.rllib.algorithms.algorithm_config import AlgorithmConfig +from ray.rllib.env.env_runner_group import EnvRunnerGroup +from ray.rllib.examples.envs.classes.simple_corridor import SimpleCorridor +from ray.rllib.utils.metrics import ( + ENV_RUNNER_RESULTS, + EVALUATION_RESULTS, + EPISODE_RETURN_MEAN, + NUM_ENV_STEPS_SAMPLED_LIFETIME, +) +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.rllib.utils.typing import ResultDict +from ray.tune.registry import get_trainable_cls + + +parser = add_rllib_example_script_args( + default_iters=50, default_reward=0.7, default_timesteps=50000 +) +parser.add_argument("--no-custom-eval", action="store_true") +parser.add_argument("--corridor-length-training", type=int, default=10) +parser.add_argument("--corridor-length-eval-worker-1", type=int, default=20) +parser.add_argument("--corridor-length-eval-worker-2", type=int, default=30) + + +def custom_eval_function( + algorithm: Algorithm, + eval_workers: EnvRunnerGroup, +) -> Tuple[ResultDict, int, int]: + """Example of a custom evaluation function. + + Args: + algorithm: Algorithm class to evaluate. + eval_workers: Evaluation EnvRunnerGroup. 
+ + Returns: + metrics: Evaluation metrics dict. + """ + # Set different env settings for each (eval) EnvRunner. Here we use the EnvRunner's + # `worker_index` property to figure out the actual length. + # Loop through all workers and all sub-envs (gym.Env) on each worker and call the + # `set_corridor_length` method on these. + eval_workers.foreach_env_runner( + func=lambda worker: ( + env.unwrapped.set_corridor_length( + args.corridor_length_eval_worker_1 + if worker.worker_index == 1 + else args.corridor_length_eval_worker_2 + ) + for env in worker.env.unwrapped.envs + ) + ) + + # Collect metrics results collected by eval workers in this list for later + # processing. + env_runner_metrics = [] + sampled_episodes = [] + # For demonstration purposes, run through some number of evaluation + # rounds within this one call. Note that this function is called once per + # training iteration (`Algorithm.train()` call) OR once per `Algorithm.evaluate()` + # (which can be called manually by the user). + for i in range(3): + print(f"Training iteration {algorithm.iteration} -> evaluation round {i}") + # Sample episodes from the EnvRunners AND have them return only the thus + # collected metrics. + episodes_and_metrics_all_env_runners = eval_workers.foreach_env_runner( + # Return only the metrics, NOT the sampled episodes (we don't need them + # anymore). + func=lambda worker: (worker.sample(), worker.get_metrics()), + local_env_runner=False, + ) + sampled_episodes.extend( + eps + for eps_and_mtrcs in episodes_and_metrics_all_env_runners + for eps in eps_and_mtrcs[0] + ) + env_runner_metrics.extend( + eps_and_mtrcs[1] for eps_and_mtrcs in episodes_and_metrics_all_env_runners + ) + + # You can compute metrics from the episodes manually, or use the Algorithm's + # convenient MetricsLogger to store all evaluation metrics inside the main + # algo. 
+ algorithm.metrics.merge_and_log_n_dicts( + env_runner_metrics, key=(EVALUATION_RESULTS, ENV_RUNNER_RESULTS) + ) + eval_results = algorithm.metrics.reduce( + key=(EVALUATION_RESULTS, ENV_RUNNER_RESULTS) + ) + # Alternatively, you could manually reduce over the n returned `env_runner_metrics` + # dicts, but this would be much harder as you might not know, which metrics + # to sum up, which ones to average over, etc.. + + # Compute env and agent steps from sampled episodes. + env_steps = sum(eps.env_steps() for eps in sampled_episodes) + agent_steps = sum(eps.agent_steps() for eps in sampled_episodes) + + return eval_results, env_steps, agent_steps + + +if __name__ == "__main__": + args = parser.parse_args() + args.local_mode = True + base_config = ( + get_trainable_cls(args.algo) + .get_default_config() + # For training, we use a corridor length of n. For evaluation, we use different + # values, depending on the eval worker index (1 or 2). + .environment( + SimpleCorridor, + env_config={"corridor_length": args.corridor_length_training}, + ) + .evaluation( + # Do we use the custom eval function defined above? + custom_evaluation_function=( + None if args.no_custom_eval else custom_eval_function + ), + # Number of eval EnvRunners to use. + evaluation_num_env_runners=2, + # Enable evaluation, once per training iteration. + evaluation_interval=1, + # Run 10 episodes each time evaluation runs (OR "auto" if parallel to + # training). + evaluation_duration="auto" if args.evaluation_parallel_to_training else 10, + # Evaluate parallelly to training? + evaluation_parallel_to_training=args.evaluation_parallel_to_training, + # Override the env settings for the eval workers. + # Note, though, that this setting here is only used in case --no-custom-eval + # is set, b/c in case the custom eval function IS used, we override the + # length of the eval environments in that custom function, so this setting + # here is simply ignored. 
+ evaluation_config=AlgorithmConfig.overrides( + env_config={"corridor_length": args.corridor_length_training * 2}, + # TODO (sven): Add support for window=float(inf) and reduce=mean for + # evaluation episode_return_mean reductions (identical to old stack + # behavior, which does NOT use a window (100 by default) to reduce + # eval episode returns. + metrics_num_episodes_for_smoothing=5, + ), + ) + ) + + stop = { + TRAINING_ITERATION: args.stop_iters, + f"{EVALUATION_RESULTS}/{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": ( + args.stop_reward + ), + NUM_ENV_STEPS_SAMPLED_LIFETIME: args.stop_timesteps, + } + + run_rllib_example_script_experiment( + base_config, + args, + stop=stop, + success_metric={ + f"{EVALUATION_RESULTS}/{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": ( + args.stop_reward + ), + }, + ) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a3ecd849d295ce6d521ffed1c6a34115db5482fa Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/__pycache__/float16_training_and_inference.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/__pycache__/float16_training_and_inference.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9ac5ef08b4170bba94ddbd71f82769db5f4d6eff Binary files /dev/null and 
b/.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/__pycache__/float16_training_and_inference.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/__pycache__/fractional_gpus_per_learner.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/__pycache__/fractional_gpus_per_learner.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5edfe9a837e94e261c7219abcb5fa7b6fdcde920 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/__pycache__/fractional_gpus_per_learner.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/__pycache__/gpus_on_env_runners.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/__pycache__/gpus_on_env_runners.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a16ea8af2cc4163611f43ad2901ee84f37fec229 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/__pycache__/gpus_on_env_runners.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/__pycache__/mixed_precision_training_float16_inference.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/__pycache__/mixed_precision_training_float16_inference.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ffb38152ec6aaae1a0a6955f68f10c95239f2513 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/__pycache__/mixed_precision_training_float16_inference.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/float16_training_and_inference.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/float16_training_and_inference.py new file mode 100644 index 0000000000000000000000000000000000000000..9a3b7a817aad215f2851619ed0157d2b794c16e2 --- /dev/null +++ 
b/.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/float16_training_and_inference.py @@ -0,0 +1,250 @@ +"""Example of using float16 precision for training and inference. + +This example: + - shows how to write a custom callback for RLlib to convert all RLModules + (on the EnvRunners and Learners) to float16 precision. + - shows how to write a custom env-to-module ConnectorV2 piece to convert all + observations and rewards in the collected trajectories to float16 (numpy) arrays. + - shows how to write a custom grad scaler for torch that is necessary to stabilize + learning with float16 weight matrices and gradients. This custom scaler behaves + exactly like the torch built-in `torch.amp.GradScaler` but also works for float16 + gradients (which the torch built-in one doesn't). + - shows how to write a custom TorchLearner to change the epsilon setting (to the + much larger 1e-4 to stabilize learning) on the default optimizer (Adam) registered + for each RLModule. + - demonstrates how to plug in all the above custom components into an + `AlgorithmConfig` instance and start training (and inference) with float16 + precision. + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + +You can visualize experiment results in ~/ray_results using TensorBoard. 
+ + +Results to expect +----------------- +You should see something similar to the following on your terminal, when running this +script with the above recommended options: + ++-----------------------------+------------+-----------------+--------+ +| Trial name | status | loc | iter | +| | | | | +|-----------------------------+------------+-----------------+--------+ +| PPO_CartPole-v1_437ee_00000 | TERMINATED | 127.0.0.1:81045 | 6 | ++-----------------------------+------------+-----------------+--------+ ++------------------+------------------------+------------------------+ +| total time (s) | episode_return_mean | num_episodes_lifetime | +| | | | +|------------------+------------------------+------------------------+ +| 71.3123 | 153.79 | 358 | ++------------------+------------------------+------------------------+ +""" +import gymnasium as gym +import numpy as np +import torch + +from ray.rllib.algorithms.algorithm import Algorithm +from ray.rllib.algorithms.ppo.torch.ppo_torch_learner import PPOTorchLearner +from ray.rllib.connectors.connector_v2 import ConnectorV2 +from ray.rllib.core.learner.torch.torch_learner import TorchLearner +from ray.rllib.utils.annotations import override +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.tune.registry import get_trainable_cls + +parser = add_rllib_example_script_args( + default_iters=50, default_reward=150.0, default_timesteps=100000 +) +parser.set_defaults( + enable_new_api_stack=True, +) + + +def on_algorithm_init( + algorithm: Algorithm, + **kwargs, +) -> None: + """Callback making sure that all RLModules in the algo are `half()`'ed.""" + + # Switch all Learner RLModules to float16. + algorithm.learner_group.foreach_learner( + lambda learner: learner.module.foreach_module(lambda mid, mod: mod.half()) + ) + # Switch all EnvRunner RLModules (assuming single RLModules) to float16. 
+ algorithm.env_runner_group.foreach_env_runner( + lambda env_runner: env_runner.module.half() + ) + if algorithm.eval_env_runner_group: + algorithm.eval_env_runner_group.foreach_env_runner( + lambda env_runner: env_runner.module.half() + ) + + +class WriteObsAndRewardsAsFloat16(ConnectorV2): + """ConnectorV2 piece preprocessing observations and rewards to be float16. + + Note that users can also write a gymnasium.Wrapper for observations and rewards + to achieve the same thing. + """ + + def recompute_output_observation_space( + self, + input_observation_space, + input_action_space, + ): + return gym.spaces.Box( + input_observation_space.low.astype(np.float16), + input_observation_space.high.astype(np.float16), + input_observation_space.shape, + np.float16, + ) + + def __call__(self, *, rl_module, batch, episodes, **kwargs): + for sa_episode in self.single_agent_episode_iterator(episodes): + obs = sa_episode.get_observations(-1) + float16_obs = obs.astype(np.float16) + sa_episode.set_observations(new_data=float16_obs, at_indices=-1) + if len(sa_episode) > 0: + rew = sa_episode.get_rewards(-1).astype(np.float16) + sa_episode.set_rewards(new_data=rew, at_indices=-1) + return batch + + +class Float16GradScaler: + """Custom grad scaler for `TorchLearner`. + + This class is utilizing the experimental support for the `TorchLearner`'s support + for loss/gradient scaling (analogous to how a `torch.amp.GradScaler` would work). 
+ + TorchLearner performs the following steps using this class (`scaler`): + - loss_per_module = TorchLearner.compute_losses() + - for L in loss_per_module: L = scaler.scale(L) + - grads = TorchLearner.compute_gradients() # L.backward() on scaled loss + - TorchLearner.apply_gradients(grads): + for optim in optimizers: + scaler.step(optim) # <- grads should get unscaled + scaler.update() # <- update scaling factor + """ + + def __init__( + self, + init_scale=1000.0, + growth_factor=2.0, + backoff_factor=0.5, + growth_interval=2000, + ): + self._scale = init_scale + self.growth_factor = growth_factor + self.backoff_factor = backoff_factor + self.growth_interval = growth_interval + self._found_inf_or_nan = False + self.steps_since_growth = 0 + + def scale(self, loss): + # Scale the loss by `self._scale`. + return loss * self._scale + + def get_scale(self): + return self._scale + + def step(self, optimizer): + # Unscale the gradients for all model parameters and apply. + for group in optimizer.param_groups: + for param in group["params"]: + if param.grad is not None: + param.grad.data.div_(self._scale) + if torch.isinf(param.grad).any() or torch.isnan(param.grad).any(): + self._found_inf_or_nan = True + break + if self._found_inf_or_nan: + break + # Only step if no inf/NaN grad found. + if not self._found_inf_or_nan: + optimizer.step() + + def update(self): + # If gradients are found to be inf/NaN, reduce the scale. + if self._found_inf_or_nan: + self._scale *= self.backoff_factor + self.steps_since_growth = 0 + # Increase the scale after a set number of steps without inf/NaN. + else: + self.steps_since_growth += 1 + if self.steps_since_growth >= self.growth_interval: + self._scale *= self.growth_factor + self.steps_since_growth = 0 + # Reset inf/NaN flag. 
+ self._found_inf_or_nan = False + + +class LargeEpsAdamTorchLearner(PPOTorchLearner): + """A TorchLearner overriding the default optimizer (Adam) to use non-default eps.""" + + @override(TorchLearner) + def configure_optimizers_for_module(self, module_id, config): + """Registers an Adam optimizer with a larg epsilon under the given module_id.""" + params = list(self._module[module_id].parameters()) + + # Register one Adam optimizer (under the default optimizer name: + # DEFAULT_OPTIMIZER) for the `module_id`. + self.register_optimizer( + module_id=module_id, + # Create an Adam optimizer with a different eps for better float16 + # stability. + optimizer=torch.optim.Adam(params, eps=1e-4), + params=params, + # Let RLlib handle the learning rate/learning rate schedule. + # You can leave `lr_or_lr_schedule` at None, but then you should + # pass a fixed learning rate into the Adam constructor above. + lr_or_lr_schedule=config.lr, + ) + + +if __name__ == "__main__": + args = parser.parse_args() + + base_config = ( + get_trainable_cls(args.algo) + .get_default_config() + .environment("CartPole-v1") + # Plug in our custom callback (on_algorithm_init) to make all RLModules + # float16 models. + .callbacks(on_algorithm_init=on_algorithm_init) + # Plug in our custom loss scaler class to stabilize gradient computations + # (by scaling the loss, then unscaling the gradients before applying them). + # This is using the built-in, experimental feature of TorchLearner. + .experimental(_torch_grad_scaler_class=Float16GradScaler) + # Plug in our custom env-to-module ConnectorV2 piece to convert all observations + # and reward in the episodes (permanently) to float16. + .env_runners(env_to_module_connector=lambda env: WriteObsAndRewardsAsFloat16()) + .training( + # Plug in our custom TorchLearner (using a much larger, stabilizing epsilon + # on the Adam optimizer). 
+ learner_class=LargeEpsAdamTorchLearner, + # Switch off grad clipping entirely b/c we use our custom grad scaler with + # built-in inf/nan detection (see `step` method of `Float16GradScaler`). + grad_clip=None, + # Typical CartPole-v1 hyperparams known to work well: + gamma=0.99, + lr=0.0003, + num_epochs=6, + vf_loss_coeff=0.01, + use_kl_loss=True, + ) + ) + + run_rllib_example_script_experiment(base_config, args) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/fractional_gpus_per_learner.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/fractional_gpus_per_learner.py new file mode 100644 index 0000000000000000000000000000000000000000..374a7ec139e966a44911a82909ad97bf87f617bc --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/fractional_gpus_per_learner.py @@ -0,0 +1,119 @@ +"""Example of using fractional GPUs (< 1.0) per Learner worker. + +The number of GPUs required, just for learning (excluding those maybe needed on your +EnvRunners, if applicable) can be computed by: +`num_gpus = config.num_learners * config.num_gpus_per_learner` + +This example: + - shows how to set up an Algorithm that uses one or more Learner workers ... + - ... and how to assign a fractional (< 1.0) number of GPUs to each of these Learners. + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack --num-learners= +[number of Learners, e.g. 1] --num-gpus-per-learner [some fraction <1.0]` + +The following command line combinations been tested on a 4 NVIDIA T4 GPUs (16 vCPU) +machine. +Note that for each run, 4 tune trials will be setup; see tune.grid_search over 4 +learning rates in the `base_config` below: +1) --num-learners=1 --num-gpus-per-learner=0.5 (2.0 GPUs used). +2) --num-learners=1 --num-gpus-per-learner=0.3 (1.2 GPUs used). +3) --num-learners=1 --num-gpus-per-learner=0.25 (1.0 GPU used). +4) --num-learners=2 --num-gpus-per-learner=1 (8 GPUs used). 
+5) non-sensical setting: --num-learners=2 --num-gpus-per-learner=0.5 (expect an +NCCL-related error due to the fact that torch will try to perform DDP sharding, +but notices that the shards sit on the same GPU). + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + +Note that the shown GPU settings in this script also work in case you are not +running via tune, but instead are using the `--no-tune` command line option. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + +You can visualize experiment results in ~/ray_results using TensorBoard. + + +Results to expect +----------------- +In the console output, you can see that only fractional GPUs are being used by RLlib: + +== Status == +... +Logical resource usage: 12.0/16 CPUs, 1.0/4 GPUs (...) +... 
+Number of trials: 4/4 (4 RUNNING) + +The final output should look something like this: ++-----------------------------+------------+-----------------+--------+--------+ +| Trial name | status | loc | lr | iter | +| | | | | | +|-----------------------------+------------+-----------------+--------+--------+ +| PPO_CartPole-v1_7104b_00000 | TERMINATED | 10.0.0.39:31197 | 0.005 | 10 | +| PPO_CartPole-v1_7104b_00001 | TERMINATED | 10.0.0.39:31202 | 0.003 | 11 | +| PPO_CartPole-v1_7104b_00002 | TERMINATED | 10.0.0.39:31203 | 0.001 | 10 | +| PPO_CartPole-v1_7104b_00003 | TERMINATED | 10.0.0.39:31204 | 0.0001 | 11 | ++-----------------------------+------------+-----------------+--------+--------+ + ++----------------+----------------------+----------------------+----------------------+ +| total time (s) | num_env_steps_sample | num_env_steps_traine | num_episodes_lifetim | +| | d_lifetime | d_lifetime | e | +|----------------+----------------------+----------------------+----------------------| +| 101.002 | 40000 | 40000 | 346 | +| 110.03 | 44000 | 44000 | 395 | +| 101.171 | 40000 | 40000 | 328 | +| 110.091 | 44000 | 44000 | 478 | ++----------------+----------------------+----------------------+----------------------+ +""" +from ray import tune +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.tune.registry import get_trainable_cls + +parser = add_rllib_example_script_args( + default_iters=50, default_reward=180, default_timesteps=100000 +) +parser.set_defaults( + enable_new_api_stack=True, + num_env_runners=2, +) + + +if __name__ == "__main__": + args = parser.parse_args() + + base_config = ( + get_trainable_cls(args.algo) + .get_default_config() + # This script only works on the new API stack. + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) + .environment("CartPole-v1") + # Define EnvRunner scaling. 
+ .env_runners(num_env_runners=args.num_env_runners) + # Define Learner scaling. + .learners( + # How many Learner workers do we need? If you have more than 1 GPU, + # set this parameter to the number of GPUs available. + num_learners=args.num_learners, + # How many GPUs does each Learner need? If you have more than 1 GPU or only + # one Learner, you should set this to 1, otherwise, set this to some + # fraction. + num_gpus_per_learner=args.num_gpus_per_learner, + ) + # 4 tune trials altogether. + .training(lr=tune.grid_search([0.005, 0.003, 0.001, 0.0001])) + ) + + run_rllib_example_script_experiment(base_config, args, keep_config=True) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/gpus_on_env_runners.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/gpus_on_env_runners.py new file mode 100644 index 0000000000000000000000000000000000000000..92a5bd1f53b335943777e320aaafee363068144f --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/gpus_on_env_runners.py @@ -0,0 +1,85 @@ +"""Example of using GPUs on the EnvRunners (b/c Env and/or RLModule require these). + +The number of GPUs required, just for your EnvRunners (excluding those needed for +training your RLModule) can be computed by: +`num_gpus = config.num_env_runners * config.num_gpus_per_env_runner` + +This example: + - shows how to write an Env that uses the GPU. + - shows how to configure your algorithm such that it allocates any number of GPUs + (including fractional < 1.0) to each (remote) EnvRunner worker. + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack --num-env_runners= +[number of EnvRunners, e.g. 2] --num-gpus-per-env-runner [int or some fraction <1.0]` + +The following command line combinations been tested on a 4 NVIDIA T4 GPUs (16 vCPU) +machine. 
+TODO (sven): Fix these +Note that for each run, 4 tune trials will be setup; see tune.grid_search over 4 +learning rates in the `base_config` below: +1) --num-learners=1 --num-gpus-per-learner=0.5 (2.0 GPUs used). +2) --num-learners=1 --num-gpus-per-learner=0.3 (1.2 GPUs used). +3) --num-learners=1 --num-gpus-per-learner=0.25 (1.0 GPU used). +4) non-sensical setting: --num-learners=2 --num-gpus-per-learner=0.5 (expect an +NCCL-related error due to the fact that torch will try to perform DDP sharding, +but notices that the shards sit on the same GPU). + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + +Note that the shown GPU settings in this script also work in case you are not +running via tune, but instead are using the `--no-tune` command line option. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + +You can visualize experiment results in ~/ray_results using TensorBoard. 
+ + +Results to expect +----------------- +In the console output, you can see that only fractional GPUs are being used by RLlib: + +""" +from ray.rllib.examples.envs.classes.gpu_requiring_env import GPURequiringEnv +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.tune.registry import get_trainable_cls + +parser = add_rllib_example_script_args( + default_iters=50, default_reward=0.9, default_timesteps=100000 +) +parser.set_defaults( + enable_new_api_stack=True, + num_env_runners=2, +) +parser.add_argument("--num-gpus-per-env-runner", type=float, default=0.5) + + +if __name__ == "__main__": + args = parser.parse_args() + + base_config = ( + get_trainable_cls(args.algo) + .get_default_config() + .environment(GPURequiringEnv) + # Define Learner scaling. + .env_runners( + # How many EnvRunner workers do we need? + num_env_runners=args.num_env_runners, + # How many GPUs does each EnvRunner require? Note that the memory on (a + # possibly fractional GPU) must be enough to accommodate the RLModule AND + # if applicable also the Env's GPU needs). + num_gpus_per_env_runner=args.num_gpus_per_env_runner, + ) + ) + + run_rllib_example_script_experiment(base_config, args) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/mixed_precision_training_float16_inference.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/mixed_precision_training_float16_inference.py new file mode 100644 index 0000000000000000000000000000000000000000..c32887146bfd5ca1a3fecc33d3de38cafac8d479 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/mixed_precision_training_float16_inference.py @@ -0,0 +1,170 @@ +"""Example of using automatic mixed precision training on a torch RLModule. + +This example: + - shows how to write a custom callback for RLlib to convert those RLModules + only(!) on the EnvRunners to float16 precision. 
+ - shows how to write a custom env-to-module ConnectorV2 piece to add float16 + observations to the action computing forward batch on the EnvRunners, but NOT + permanently write these changes into the episodes, such that on the + Learner side, the original float32 observations will be used (for the mixed + precision `forward_train` and `loss` computations). + - shows how to plugin torch's built-in `GradScaler` class to be used by the + TorchLearner to scale losses and unscale gradients in order to gain more stability + when training with mixed precision. + - shows how to write a custom TorchLearner to run the update step (overrides + `_update()`) within a `torch.amp.autocast()` context. This makes sure that . + - demonstrates how to plug in all the above custom components into an + `AlgorithmConfig` instance and start training with mixed-precision while + performing the inference on the EnvRunners with float16 precision. + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + +Note that the shown GPU settings in this script also work in case you are not +running via tune, but instead are using the `--no-tune` command line option. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + +You can visualize experiment results in ~/ray_results using TensorBoard. 
+ + +Results to expect +----------------- +In the console output, you should see something like this: + ++-----------------------------+------------+-----------------+--------+ +| Trial name | status | loc | iter | +| | | | | +|-----------------------------+------------+-----------------+--------+ +| PPO_CartPole-v1_485af_00000 | TERMINATED | 127.0.0.1:81045 | 22 | ++-----------------------------+------------+-----------------+--------+ ++------------------+------------------------+------------------------+ +| total time (s) | episode_return_mean | num_episodes_lifetime | +| | | | +|------------------+------------------------+------------------------+ +| 281.3231 | 455.81 | 1426 | ++------------------+------------------------+------------------------+ +""" +import gymnasium as gym +import numpy as np +import torch + +from ray.rllib.algorithms.algorithm import Algorithm +from ray.rllib.algorithms.ppo import PPOConfig +from ray.rllib.algorithms.ppo.torch.ppo_torch_learner import PPOTorchLearner +from ray.rllib.connectors.connector_v2 import ConnectorV2 +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) + + +parser = add_rllib_example_script_args( + default_iters=200, default_reward=450.0, default_timesteps=200000 +) +parser.set_defaults( + algo="PPO", + enable_new_api_stack=True, +) + + +def on_algorithm_init( + algorithm: Algorithm, + **kwargs, +) -> None: + """Callback making sure that all RLModules in the algo are `half()`'ed.""" + + # Switch all EnvRunner RLModules (assuming single RLModules) to float16. + algorithm.env_runner_group.foreach_env_runner( + lambda env_runner: env_runner.module.half() + ) + if algorithm.eval_env_runner_group: + algorithm.eval_env_runner_group.foreach_env_runner( + lambda env_runner: env_runner.module.half() + ) + + +class Float16Connector(ConnectorV2): + """ConnectorV2 piece preprocessing observations and rewards to be float16. 
+ + Note that users can also write a gymnasium.Wrapper for observations and rewards + to achieve the same thing. + """ + + def recompute_output_observation_space( + self, + input_observation_space, + input_action_space, + ): + return gym.spaces.Box( + input_observation_space.low.astype(np.float16), + input_observation_space.high.astype(np.float16), + input_observation_space.shape, + np.float16, + ) + + def __call__(self, *, rl_module, batch, episodes, **kwargs): + for sa_episode in self.single_agent_episode_iterator(episodes): + obs = sa_episode.get_observations(-1) + float16_obs = obs.astype(np.float16) + self.add_batch_item( + batch, + column="obs", + item_to_add=float16_obs, + single_agent_episode=sa_episode, + ) + return batch + + +class PPOTorchMixedPrecisionLearner(PPOTorchLearner): + def _update(self, *args, **kwargs): + with torch.cuda.amp.autocast(): + results = super()._update(*args, **kwargs) + return results + + +if __name__ == "__main__": + args = parser.parse_args() + + assert ( + args.enable_new_api_stack + ), "Must set --enable-new-api-stack when running this script!" + assert args.algo == "PPO", "Must set --algo=PPO when running this script!" + + base_config = ( + (PPOConfig().environment("CartPole-v1")) + .env_runners(env_to_module_connector=lambda env: Float16Connector()) + # Plug in our custom callback (on_algorithm_init) to make EnvRunner RLModules + # float16 models. + .callbacks(on_algorithm_init=on_algorithm_init) + # Plug in the torch built-int loss scaler class to stabilize gradient + # computations (by scaling the loss, then unscaling the gradients before + # applying them). This is using the built-in, experimental feature of + # TorchLearner. + .experimental(_torch_grad_scaler_class=torch.cuda.amp.GradScaler) + .training( + # Plug in the custom Learner class to activate mixed-precision training for + # our torch RLModule (uses `torch.amp.autocast()`). 
+ learner_class=PPOTorchMixedPrecisionLearner, + # Switch off grad clipping entirely b/c we use our custom grad scaler with + # built-in inf/nan detection (see `step` method of `Float16GradScaler`). + grad_clip=None, + # Typical CartPole-v1 hyperparams known to work well: + gamma=0.99, + lr=0.0003, + num_epochs=6, + vf_loss_coeff=0.01, + use_kl_loss=True, + ) + ) + + run_rllib_example_script_experiment(base_config, args) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/__pycache__/ppo_load_rl_modules.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/__pycache__/ppo_load_rl_modules.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ce29ca31728d25af476b57b877766726ccafec86 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/__pycache__/ppo_load_rl_modules.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/classes/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/classes/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/classes/__pycache__/custom_ppo_loss_fn_learner.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/classes/__pycache__/custom_ppo_loss_fn_learner.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c8464202293a7d3edc35d22b991d8bdbec1e59c5 Binary files /dev/null and 
b/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/classes/__pycache__/custom_ppo_loss_fn_learner.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/classes/__pycache__/separate_vf_lr_and_optimizer_learner.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/classes/__pycache__/separate_vf_lr_and_optimizer_learner.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0a161aa3acce08c59e054e0c7a510a7fbff73dcf Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/classes/__pycache__/separate_vf_lr_and_optimizer_learner.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/classes/__pycache__/vpg_torch_learner.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/classes/__pycache__/vpg_torch_learner.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fbc33a256bcdbf9eacd28c8f101bda37f68e1906 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/classes/__pycache__/vpg_torch_learner.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/classes/custom_ppo_loss_fn_learner.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/classes/custom_ppo_loss_fn_learner.py new file mode 100644 index 0000000000000000000000000000000000000000..e63cd3c563e0e8d8434d7a8f1dce3bdc9313a060 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/classes/custom_ppo_loss_fn_learner.py @@ -0,0 +1,54 @@ +from typing import Any, Dict + +from ray.rllib.algorithms.ppo import PPOConfig +from ray.rllib.algorithms.ppo.torch.ppo_torch_learner import PPOTorchLearner +from ray.rllib.utils.annotations import override +from ray.rllib.utils.framework import try_import_torch +from ray.rllib.utils.typing import ModuleID, 
TensorType + +torch, _ = try_import_torch() + + +class PPOTorchLearnerWithWeightRegularizerLoss(PPOTorchLearner): + """A custom PPO torch learner adding a weight regularizer term to the loss. + + We compute a naive regularizer term averaging over all parameters of the RLModule + and add this mean value (multiplied by the regularizer coefficient) to the base PPO + loss. + The experiment shows that even with a large learning rate, our custom Learner is + still able to learn properly as it's forced to keep the weights small. + """ + + @override(PPOTorchLearner) + def compute_loss_for_module( + self, + *, + module_id: ModuleID, + config: PPOConfig, + batch: Dict[str, Any], + fwd_out: Dict[str, TensorType], + ) -> TensorType: + + base_total_loss = super().compute_loss_for_module( + module_id=module_id, + config=config, + batch=batch, + fwd_out=fwd_out, + ) + + # Compute the mean of all the RLModule's weights. + parameters = self.get_parameters(self.module[module_id]) + mean_weight = torch.mean(torch.stack([w.mean() for w in parameters])) + + self.metrics.log_value( + key=(module_id, "mean_weight"), + value=mean_weight, + window=1, + ) + + total_loss = ( + base_total_loss + + config.learner_config_dict["regularizer_coeff"] * mean_weight + ) + + return total_loss diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/classes/intrinsic_curiosity_learners.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/classes/intrinsic_curiosity_learners.py new file mode 100644 index 0000000000000000000000000000000000000000..dd37dab0cb114d637a94d35897d11ef4082b629e --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/classes/intrinsic_curiosity_learners.py @@ -0,0 +1,162 @@ +from typing import Any, List, Optional + +import gymnasium as gym +import torch + +from ray.rllib.algorithms.dqn.torch.dqn_torch_learner import DQNTorchLearner +from ray.rllib.algorithms.ppo.torch.ppo_torch_learner import PPOTorchLearner +from 
ray.rllib.connectors.common.add_observations_from_episodes_to_batch import ( + AddObservationsFromEpisodesToBatch, +) +from ray.rllib.connectors.common.numpy_to_tensor import NumpyToTensor +from ray.rllib.connectors.learner.add_next_observations_from_episodes_to_train_batch import ( # noqa + AddNextObservationsFromEpisodesToTrainBatch, +) +from ray.rllib.connectors.connector_v2 import ConnectorV2 +from ray.rllib.core import Columns, DEFAULT_MODULE_ID +from ray.rllib.core.learner.torch.torch_learner import TorchLearner +from ray.rllib.core.rl_module.rl_module import RLModule +from ray.rllib.utils.typing import EpisodeType + +ICM_MODULE_ID = "_intrinsic_curiosity_model" + + +class DQNTorchLearnerWithCuriosity(DQNTorchLearner): + def build(self) -> None: + super().build() + add_intrinsic_curiosity_connectors(self) + + +class PPOTorchLearnerWithCuriosity(PPOTorchLearner): + def build(self) -> None: + super().build() + add_intrinsic_curiosity_connectors(self) + + +def add_intrinsic_curiosity_connectors(torch_learner: TorchLearner) -> None: + """Adds two connector pieces to the Learner pipeline, needed for ICM training. + + - The `AddNextObservationsFromEpisodesToTrainBatch` connector makes sure the train + batch contains the NEXT_OBS for ICM's forward- and inverse dynamics net training. + - The `IntrinsicCuriosityModelConnector` piece computes intrinsic rewards from the + ICM and adds the results to the extrinsic reward of the main module's train batch. + + Args: + torch_learner: The TorchLearner, to whose Learner pipeline the two ICM connector + pieces should be added. + """ + learner_config_dict = torch_learner.config.learner_config_dict + + # Assert, we are only training one policy (RLModule) and we have the ICM + # in our MultiRLModule. 
    # Exactly two sub-modules are expected: the single trainable policy
    # (DEFAULT_MODULE_ID) and the ICM (ICM_MODULE_ID) - nothing else.
    assert (
        len(torch_learner.module) == 2
        and DEFAULT_MODULE_ID in torch_learner.module
        and ICM_MODULE_ID in torch_learner.module
    )

    # Make sure both curiosity loss settings are explicitly set in the
    # `learner_config_dict`.
    if (
        "forward_loss_weight" not in learner_config_dict
        or "intrinsic_reward_coeff" not in learner_config_dict
    ):
        raise KeyError(
            "When using the IntrinsicCuriosityTorchLearner, both `forward_loss_weight` "
            " and `intrinsic_reward_coeff` must be part of your config's "
            "`learner_config_dict`! Add these values through: `config.training("
            "learner_config_dict={'forward_loss_weight': .., 'intrinsic_reward_coeff': "
            "..})`."
        )

    # Only extend the pipeline when RLlib added the default connector pieces;
    # both inserts below anchor on default pieces.
    if torch_learner.config.add_default_connectors_to_learner_pipeline:
        # Prepend a "add-NEXT_OBS-from-episodes-to-train-batch" connector piece
        # (right after the corresponding "add-OBS-..." default piece).
        torch_learner._learner_connector.insert_after(
            AddObservationsFromEpisodesToBatch,
            AddNextObservationsFromEpisodesToTrainBatch(),
        )
        # Append the ICM connector, computing intrinsic rewards and adding these to
        # the main model's extrinsic rewards. Placement after `NumpyToTensor` matches
        # the requirement stated in `IntrinsicCuriosityModelConnector`'s docstring.
        torch_learner._learner_connector.insert_after(
            NumpyToTensor,
            IntrinsicCuriosityModelConnector(
                intrinsic_reward_coeff=(
                    torch_learner.config.learner_config_dict["intrinsic_reward_coeff"]
                )
            ),
        )


class IntrinsicCuriosityModelConnector(ConnectorV2):
    """Learner ConnectorV2 piece to compute intrinsic rewards based on an ICM.

    For more details, see here:
    [1] Curiosity-driven Exploration by Self-supervised Prediction
    Pathak, Agrawal, Efros, and Darrell - UC Berkeley - ICML 2017.
    https://arxiv.org/pdf/1705.05363.pdf

    This connector piece:
    - requires two RLModules to be present in the MultiRLModule:
    DEFAULT_MODULE_ID (the policy model to be trained) and ICM_MODULE_ID (the intrinsic
    curiosity architecture).
    - must be located toward the end of your Learner pipeline (after the
    `NumpyToTensor` piece) in order to perform a forward pass on the ICM model with the
    readily compiled batch and a following forward-loss computation to get the intrinsic
    rewards.
    - these intrinsic rewards will then be added to the (extrinsic) rewards in the main
    model's train batch.
    """

    def __init__(
        self,
        input_observation_space: Optional[gym.Space] = None,
        input_action_space: Optional[gym.Space] = None,
        *,
        intrinsic_reward_coeff: float,
        **kwargs,
    ):
        """Initializes an IntrinsicCuriosityModelConnector instance.

        Args:
            input_observation_space: Optional input observation space, passed through
                to the `ConnectorV2` base class.
            input_action_space: Optional input action space, passed through to the
                `ConnectorV2` base class.
            intrinsic_reward_coeff: The weight with which to multiply the intrinsic
                reward before adding it to the extrinsic rewards of the main model.
        """
        super().__init__(input_observation_space, input_action_space)

        self.intrinsic_reward_coeff = intrinsic_reward_coeff

    def __call__(
        self,
        *,
        rl_module: RLModule,
        batch: Any,
        episodes: List[EpisodeType],
        explore: Optional[bool] = None,
        shared_data: Optional[dict] = None,
        **kwargs,
    ) -> Any:
        # Assert that the batch is ready: the main module's data is already compiled
        # and the ICM has not received its (duplicated) batch yet.
        assert DEFAULT_MODULE_ID in batch and ICM_MODULE_ID not in batch
        assert (
            Columns.OBS in batch[DEFAULT_MODULE_ID]
            and Columns.NEXT_OBS in batch[DEFAULT_MODULE_ID]
        )
        # TODO (sven): We are performing two forward passes per update right now.
        # Once here in the connector (w/o grad) to just get the intrinsic rewards
        # and once in the learner to actually compute the ICM loss and update the ICM.
        # Maybe we can save one of these, but this would currently harm the DDP-setup
        # for multi-GPU training.
        with torch.no_grad():
            # Perform ICM forward pass (no gradients needed; only the rewards are
            # used here - the ICM itself is updated in the learner).
            fwd_out = rl_module[ICM_MODULE_ID].forward_train(batch[DEFAULT_MODULE_ID])

        # Add the intrinsic rewards to the main module's extrinsic rewards.
+ batch[DEFAULT_MODULE_ID][Columns.REWARDS] += ( + self.intrinsic_reward_coeff * fwd_out[Columns.INTRINSIC_REWARDS] + ) + + # Duplicate the batch such that the ICM also has data to learn on. + batch[ICM_MODULE_ID] = batch[DEFAULT_MODULE_ID] + + return batch diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/classes/separate_vf_lr_and_optimizer_learner.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/classes/separate_vf_lr_and_optimizer_learner.py new file mode 100644 index 0000000000000000000000000000000000000000..7095503f5f620bb7d83ec50d4b2494bbf0a7ea3b --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/classes/separate_vf_lr_and_optimizer_learner.py @@ -0,0 +1,83 @@ +from ray.rllib.algorithms.algorithm_config import AlgorithmConfig +from ray.rllib.algorithms.ppo.torch.ppo_torch_learner import PPOTorchLearner +from ray.rllib.core.learner.torch.torch_learner import TorchLearner +from ray.rllib.utils.annotations import override +from ray.rllib.utils.framework import try_import_torch +from ray.rllib.utils.typing import ModuleID + +torch, _ = try_import_torch() + + +class PPOTorchLearnerWithSeparateVfOptimizer(PPOTorchLearner): + """A custom PPO torch learner with 2 optimizers, for policy and value function. + + Overrides the Learner's standard `configure_optimizers_for_module()` method to + register the additional vf optimizer. + + The standard PPOLearner only uses a single optimizer (and single learning rate) to + update the model, regardless of whether the value function network + is separate from the policy network or whether they have shared components. + + We may leave the loss function of PPO completely untouched. 
It already returns a
    sum of policy loss and vf loss (and entropy loss), and thus - given that the neural
    networks used to compute each of these terms are separate and don't share any
    components - gradients are computed separately per neural network (policy vs vf)
    and applied separately through the two optimizers.
    """

    @override(TorchLearner)
    def configure_optimizers_for_module(
        self,
        module_id: ModuleID,
        config: "AlgorithmConfig" = None,
    ) -> None:
        """Registers 2 optimizers for the given ModuleID with this Learner.

        Args:
            module_id: The ID of the (PPO) RLModule to register the two optimizers
                (policy and value function) for.
            config: The AlgorithmConfig providing the learning rate(s). `config.lr`
                is used for the policy optimizer. Note that despite the `None`
                default, a valid config is required (it is dereferenced below).
        """
        # Make sure the RLModule has the correct properties.
        module = self.module[module_id]
        # TODO (sven): We should move this into a new `ValueFunction` API, which
        # should have a `get_value_function_params` method. This way, any custom
        # RLModule that implements this API can be used here, not just the standard
        # PPO one.
        assert (
            hasattr(module, "pi")
            and hasattr(module, "vf")
            and hasattr(module, "encoder")
            and hasattr(module.encoder, "actor_encoder")
            and hasattr(module.encoder, "critic_encoder")
        )
        # This example requires a fully separate value function network (no layers
        # shared with the policy).
        assert config.model_config["vf_share_layers"] is False

        # Get all policy-related parameters from the RLModule.
        pi_params = (
            # Actor encoder and policy head.
            self.get_parameters(self.module[module_id].encoder.actor_encoder)
            + self.get_parameters(self.module[module_id].pi)
        )
        # Register the policy optimizer.
        self.register_optimizer(
            module_id=module_id,
            optimizer_name="optim_for_pi",
            optimizer=torch.optim.Adam(params=pi_params),
            params=pi_params,
            # For the policy learning rate, we use the "main" lr in the AlgorithmConfig.
            lr_or_lr_schedule=config.lr,
        )

        # Get all value function-related parameters from the RLModule.
        vf_params = (
            # Critic encoder and value head.
            self.get_parameters(self.module[module_id].encoder.critic_encoder)
            + self.get_parameters(self.module[module_id].vf)
        )
        # Register the value function optimizer.
+ self.register_optimizer( + module_id=module_id, + optimizer_name="optim_for_vf", + optimizer=torch.optim.Adam(params=vf_params), + params=vf_params, + # For the value function learning rate, we use a user-provided custom + # setting in the `learner_config_dict` in the AlgorithmConfig. If this + # is not provided, use the same lr as for the policy optimizer. + lr_or_lr_schedule=config.learner_config_dict.get("lr_vf", config.lr), + ) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/classes/vpg_torch_learner.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/classes/vpg_torch_learner.py new file mode 100644 index 0000000000000000000000000000000000000000..f5aca70e135429407be06d975adac878a4abe8e7 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/classes/vpg_torch_learner.py @@ -0,0 +1,73 @@ +import torch +from typing import Any, Dict, TYPE_CHECKING + +import numpy as np + +from ray.rllib.connectors.learner import ComputeReturnsToGo +from ray.rllib.core.columns import Columns +from ray.rllib.core.learner.learner import Learner +from ray.rllib.core.learner.torch.torch_learner import TorchLearner +from ray.rllib.utils.annotations import override +from ray.rllib.utils.numpy import convert_to_numpy +from ray.rllib.utils.typing import ModuleID, TensorType + +if TYPE_CHECKING: + from ray.rllib.algorithms.algorithm_config import AlgorithmConfig + + +class VPGTorchLearner(TorchLearner): + @override(TorchLearner) + def build(self) -> None: + super().build() + + # Prepend the returns-to-go connector piece to have that information + # available in the train batch. 
+ if self.config.add_default_connectors_to_learner_pipeline: + self._learner_connector.prepend(ComputeReturnsToGo(gamma=self.config.gamma)) + + @override(TorchLearner) + def compute_loss_for_module( + self, + *, + module_id: ModuleID, + config: "AlgorithmConfig", + batch: Dict[str, Any], + fwd_out: Dict[str, TensorType], + ) -> TensorType: + rl_module = self.module[module_id] + + # Create the action distribution from the parameters output by the RLModule. + action_dist_inputs = fwd_out[Columns.ACTION_DIST_INPUTS] + action_dist_class = rl_module.get_train_action_dist_cls() + action_dist = action_dist_class.from_logits(action_dist_inputs) + + # Compute log probabilities of the actions taken during sampling. + log_probs = action_dist.logp(batch[Columns.ACTIONS]) + + # Compute the policy gradient loss. + # Since we're not using a baseline, we use returns to go directly. + loss = -torch.mean(log_probs * batch[Columns.RETURNS_TO_GO]) + + # Just for exercise, log the average return to go per discrete action. + for act, ret_to_go in zip(batch[Columns.ACTIONS], batch[Columns.RETURNS_TO_GO]): + self.metrics.log_value( + key=(module_id, f"action_{act}_return_to_go_mean"), + value=ret_to_go, + # Mean over the batch size. + reduce="mean", + window=len(batch[Columns.RETURNS_TO_GO]), + ) + + return loss + + @override(Learner) + def after_gradient_based_update(self, *, timesteps): + # This is to check if in the multi-gpu case, the weights across workers are + # the same. Only for testing purposes. 
+ if self.config.report_mean_weights: + for module_id in self.module.keys(): + parameters = convert_to_numpy( + self.get_parameters(self.module[module_id]) + ) + mean_ws = np.mean([w.mean() for w in parameters]) + self.metrics.log_value((module_id, "mean_weight"), mean_ws, window=1) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/ppo_load_rl_modules.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/ppo_load_rl_modules.py new file mode 100644 index 0000000000000000000000000000000000000000..61cfe730a46523cabe9c62542a49b86356a4cdb2 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/ppo_load_rl_modules.py @@ -0,0 +1,78 @@ +import argparse +import gymnasium as gym +import shutil +import tempfile + +import ray +from ray import air, tune +from ray.air.constants import TRAINING_ITERATION +from ray.rllib.algorithms.ppo import PPOConfig +from ray.rllib.algorithms.ppo.ppo_catalog import PPOCatalog +from ray.rllib.algorithms.ppo.tf.ppo_tf_rl_module import PPOTfRLModule +from ray.rllib.algorithms.ppo.torch.ppo_torch_rl_module import PPOTorchRLModule +from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig +from ray.rllib.core.rl_module.rl_module import RLModuleSpec + + +def _parse_args(): + + parser = argparse.ArgumentParser() + + parser.add_argument( + "--framework", + choices=["tf2", "torch"], # tf will be deprecated with the new Learner stack + default="torch", + ) + + return parser.parse_args() + + +if __name__ == "__main__": + args = _parse_args() + + ray.init() + + # Create a module to load and save it to a checkpoint for testing purposes + # (this is not necessary in a real use case) + # In a real case you would just load the checkpoint from a rllib training run + # where you had enabled checkpointing, the learner api and the rl module api + module_class = PPOTfRLModule if args.framework == "tf2" else PPOTorchRLModule + env = gym.make("CartPole-v1") + module_to_load = 
RLModuleSpec( + module_class=module_class, + model_config=DefaultModelConfig(fcnet_hiddens=[32]), + catalog_class=PPOCatalog, + observation_space=env.observation_space, + action_space=env.action_space, + ).build() + + CHECKPOINT_DIR = tempfile.mkdtemp() + module_to_load.save_to_path(CHECKPOINT_DIR) + + # Create a module spec to load the checkpoint + module_to_load_spec = RLModuleSpec( + module_class=module_class, + model_config=DefaultModelConfig(fcnet_hiddens=[32]), + catalog_class=PPOCatalog, + load_state_path=CHECKPOINT_DIR, + ) + + # train a PPO algorithm with the loaded module + config = ( + PPOConfig() + .api_stack(enable_rl_module_and_learner=True) + .framework(args.framework) + .rl_module(rl_module_spec=module_to_load_spec) + .environment("CartPole-v1") + ) + + tuner = tune.Tuner( + "PPO", + param_space=config.to_dict(), + run_config=air.RunConfig( + stop={TRAINING_ITERATION: 1}, + failure_config=air.FailureConfig(fail_fast="raise"), + ), + ) + tuner.fit() + shutil.rmtree(CHECKPOINT_DIR) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/ppo_with_custom_loss_fn.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/ppo_with_custom_loss_fn.py new file mode 100644 index 0000000000000000000000000000000000000000..04cb17c6f8934201172df580a76cc9455e9847b2 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/ppo_with_custom_loss_fn.py @@ -0,0 +1,138 @@ +"""Example of how to write a custom loss function (based on the existing PPO loss). + +This example shows: + - how to subclass an existing (torch) Learner and override its + `compute_loss_for_module()` method. + - how you can add your own loss terms to the subclassed "base loss", in this + case here a weights regularizer term with the intention to keep the learnable + parameters of the RLModule reasonably small. 
  - how to add custom settings (here: the regularizer coefficient) to the
    `AlgorithmConfig` in order to not have to subclass it and write your own config
    class (you could still do that, but are not required to).
  - how to plug in the custom Learner into your config and then run the
    experiment.

See the :py:class:`~ray.rllib.examples.learners.classes.custom_ppo_loss_fn_learner.PPOTorchLearnerWithWeightRegularizerLoss` # noqa
class for details on how to override the main (PPO) loss function.

We compute a naive regularizer term averaging over all parameters of the RLModule and
add this mean value (multiplied by the regularizer coefficient) to the base PPO loss.
The experiment shows that even with a large learning rate, our custom Learner is still
able to learn properly as it's forced to keep the weights small.


How to run this script
----------------------
`python [script file name].py --enable-new-api-stack --regularizer-coeff=0.02
--lr=0.01`

Use the `--regularizer-coeff` option to set the value of the coefficient with which
the mean NN weight is being multiplied (inside the total loss) and the `--lr` option
to set the learning rate. Experiments using a large learning rate and no regularization
(`--regularizer-coeff=0.0`) should NOT learn a decently working policy.

For debugging, use the following additional command line options
`--no-tune --num-env-runners=0`
which should allow you to set breakpoints anywhere in the RLlib code and
have the execution stop there for inspection and debugging.
+ +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- +In the console output, you can see that - given a large learning rate - only with +weight regularization (`--regularizer-coeff` > 0.0), the algo has a chance to learn +a decent policy: + +With --regularizer-coeff=0.02 and --lr=0.01 +(trying to reach 250.0 return on CartPole in 100k env steps): ++-----------------------------+------------+-----------------+--------+ +| Trial name | status | loc | iter | +| | | | | +|-----------------------------+------------+-----------------+--------+ +| PPO_CartPole-v1_4a3a0_00000 | TERMINATED | 127.0.0.1:16845 | 18 | ++-----------------------------+------------+-----------------+--------+ ++------------------+------------------------+---------------------+ +| total time (s) | num_env_steps_sampled_ | episode_return_mean | +| | _lifetime | | +|------------------+------------------------+---------------------+ +| 16.8842 | 72000 | 256.35 | ++------------------+------------------------+---------------------+ + +With --regularizer-coeff=0.0 and --lr=0.01 +(trying to reach 250.0 return on CartPole in 100k env steps): + +[HAS SIGNIFICANT PROBLEMS REACHING THE DESIRED RETURN] +""" + +from ray.rllib.algorithms.ppo import PPOConfig +from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig +from ray.rllib.examples.learners.classes.custom_ppo_loss_fn_learner import ( + PPOTorchLearnerWithWeightRegularizerLoss, +) +from ray.rllib.utils.framework import try_import_torch +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) + +torch, _ = try_import_torch() + + +parser = add_rllib_example_script_args( + default_reward=250.0, + default_timesteps=200000, +) +parser.set_defaults(enable_new_api_stack=True) +parser.add_argument( + 
"--regularizer-coeff", + type=float, + default=0.02, + help="The coefficient with which to multiply the mean NN-weight by (and then add " + "the result of this operation to the main loss term).", +) +parser.add_argument( + "--lr", + type=float, + default=0.01, + help="The learning rate to use.", +) + + +if __name__ == "__main__": + args = parser.parse_args() + + assert ( + args.enable_new_api_stack + ), "Must set --enable-new-api-stack when running this script!" + assert args.algo == "PPO", "Must set --algo=PPO when running this script!" + + base_config = ( + PPOConfig() + .environment("CartPole-v1") + .training( + # This is the most important setting in this script: We point our PPO + # algorithm to use the custom Learner (instead of the default + # PPOTorchLearner). + learner_class=PPOTorchLearnerWithWeightRegularizerLoss, + # We use this simple method here to inject a new setting that our + # custom Learner class uses in its loss function. This is convenient + # and avoids having to subclass `PPOConfig` only to add a few new settings + # to it. Within our Learner, we can access this new setting through: + # `self.config.learner_config_dict['regularizer_coeff']` + learner_config_dict={"regularizer_coeff": args.regularizer_coeff}, + # Some settings to make this example learn better. + num_epochs=6, + vf_loss_coeff=0.01, + # The learning rate, settable through the command line `--lr` arg. 
+ lr=args.lr, + ) + .rl_module( + model_config=DefaultModelConfig(vf_share_layers=True), + ) + ) + + run_rllib_example_script_experiment(base_config, args) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/ppo_with_torch_lr_schedulers.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/ppo_with_torch_lr_schedulers.py new file mode 100644 index 0000000000000000000000000000000000000000..2051076613c3dbc85e1ba9590bd09ef3d0a30236 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/ppo_with_torch_lr_schedulers.py @@ -0,0 +1,209 @@ +"""Example of how to use PyTorch's learning rate schedulers to design a complex +learning rate schedule for training. + +Two learning rate schedules are applied in sequence to the learning rate of the +optimizer. In this way even more complex learning rate schedules can be assembled. + +This example shows: + - how to configure multiple learning rate schedulers, as a chained pipeline, in + PyTorch using partial initialization with `functools.partial`. + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack --lr-const-factor=0.9 +--lr-const-iters=10 --lr-exp-decay=0.9` + +Use the `--lr-const-factor` to define the facotr by which to multiply the +learning rate in the first `--lr-const-iters` iterations. Use the +`--lr-const-iters` to set the number of iterations in which the learning rate +should be adapted by the `--lr-const-factor`. Use `--lr-exp-decay` to define +the learning rate decay to be applied after the constant factor multiplication. + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. 
+ +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- +You should expect to observe decent learning behavior from your console output: + +With `--lr-const-factor=0.1`, `--lr-const-iters=10, and `--lr-exp_decay=0.3`. ++-----------------------------+------------+--------+------------------+ +| Trial name | status | iter | total time (s) | +| | | | | +|-----------------------------+------------+--------+------------------+ +| PPO_CartPole-v1_7fc44_00000 | TERMINATED | 50 | 59.6542 | ++-----------------------------+------------+--------+------------------+ ++------------------------+------------------------+------------------------+ +| episode_return_mean | num_episodes_lifetime | num_env_steps_traine | +| | | d_lifetime | ++------------------------+------------------------+------------------------| +| 451.2 | 9952 | 210047 | ++------------------------+------------------------+------------------------+ +""" +import functools +import numpy as np +from typing import Optional + +from ray.rllib.algorithms.algorithm import Algorithm +from ray.rllib.callbacks.callbacks import RLlibCallback +from ray.rllib.algorithms.ppo import PPOConfig +from ray.rllib.core import DEFAULT_MODULE_ID +from ray.rllib.core.learner.learner import DEFAULT_OPTIMIZER +from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig +from ray.rllib.utils.metrics import ( + ENV_RUNNER_RESULTS, + EPISODE_RETURN_MEAN, + EVALUATION_RESULTS, + NUM_ENV_STEPS_SAMPLED_LIFETIME, +) +from ray.rllib.utils.metrics.metrics_logger import MetricsLogger +from ray.rllib.utils.framework import try_import_torch +from ray.rllib.utils.test_utils import add_rllib_example_script_args + +torch, _ = try_import_torch() + + +class LRChecker(RLlibCallback): + def on_algorithm_init( + self, + *, + algorithm: "Algorithm", + metrics_logger: 
Optional[MetricsLogger] = None,
        **kwargs,
    ) -> None:
        # Store the expected learning rates for each iteration.
        self.lr = []
        # Retrieve the chosen configuration parameters from the config (the two
        # `functools.partial`-wrapped torch LR scheduler classes; `.keywords` holds
        # the kwargs they were partial'ed with).
        lr_factor = algorithm.config._torch_lr_scheduler_classes[0].keywords["factor"]
        lr_total_iters = algorithm.config._torch_lr_scheduler_classes[0].keywords[
            "total_iters"
        ]
        lr_gamma = algorithm.config._torch_lr_scheduler_classes[1].keywords["gamma"]
        # Precompute the expected learning rates for all iterations up to
        # `lr_total_iters`.
        for i in range(1, lr_total_iters + 1):
            # The initial learning rate.
            lr = algorithm.config.lr
            # In the iterations before `lr_total_iters`, we multiply by `lr_factor`
            # (mirrors torch's `ConstantLR`, which restores the base lr once
            # `total_iters` is reached).
            if i < lr_total_iters:
                lr *= lr_factor
            # Finally, we have an exponential decay of `lr_gamma` per iteration
            # (compounded: gamma**i after i iterations).
            lr *= lr_gamma**i
            self.lr.append(lr)

    def on_train_result(
        self,
        *,
        algorithm: "Algorithm",
        metrics_logger: Optional[MetricsLogger] = None,
        result: dict,
        **kwargs,
    ) -> None:

        # Check for the first `lr_total_iters + 1` iterations, if expected
        # and actual learning rates correspond.
        if (
            algorithm.training_iteration
            <= algorithm.config._torch_lr_scheduler_classes[0].keywords["total_iters"]
        ):
            # Read the actual lr straight from the Learner's default optimizer.
            actual_lr = algorithm.learner_group._learner.get_optimizer(
                DEFAULT_MODULE_ID, DEFAULT_OPTIMIZER
            ).param_groups[0]["lr"]
            # Assert the learning rates are close enough.
            assert np.isclose(
                actual_lr,
                self.lr[algorithm.training_iteration - 1],
                atol=1e-9,
                rtol=1e-9,
            )


parser = add_rllib_example_script_args(default_reward=450.0, default_timesteps=250000)
parser.set_defaults(enable_new_api_stack=True)
parser.add_argument(
    "--lr-const-factor",
    type=float,
    default=0.9,
    help="The factor by which the learning rate should be multiplied.",
)
parser.add_argument(
    "--lr-const-iters",
    type=int,
    default=10,
    help=(
        "The number of iterations by which the learning rate should be "
        "multiplied by the factor."
+ ), +) +parser.add_argument( + "--lr-exp-decay", + type=float, + default=0.99, + help="The rate by which the learning rate should exponentially decay.", +) + +if __name__ == "__main__": + # Use `parser` to add your own custom command line options to this script + # and (if needed) use their values to set up `config` below. + args = parser.parse_args() + + config = ( + PPOConfig() + .environment("CartPole-v1") + .training( + lr=0.03, + num_sgd_iter=6, + vf_loss_coeff=0.01, + ) + .rl_module( + model_config=DefaultModelConfig( + fcnet_hiddens=[32], + fcnet_activation="linear", + vf_share_layers=True, + ), + ) + .evaluation( + evaluation_num_env_runners=1, + evaluation_interval=1, + evaluation_parallel_to_training=True, + evaluation_config=PPOConfig.overrides(exploration=False), + ) + .experimental( + # Add two learning rate schedulers to be applied in sequence. + _torch_lr_scheduler_classes=[ + # Multiplies the learning rate by a factor of 0.1 for 10 iterations. + functools.partial( + torch.optim.lr_scheduler.ConstantLR, + factor=args.lr_const_factor, + total_iters=args.lr_const_iters, + ), + # Decays the learning rate after each gradients step by + # `args.lr_exp_decay`. 
+ functools.partial( + torch.optim.lr_scheduler.ExponentialLR, gamma=args.lr_exp_decay + ), + ] + ) + .callbacks( + LRChecker, + ) + ) + + stop = { + f"{NUM_ENV_STEPS_SAMPLED_LIFETIME}": args.stop_timesteps, + f"{EVALUATION_RESULTS}/{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": ( + args.stop_reward + ), + } + + if __name__ == "__main__": + from ray.rllib.utils.test_utils import run_rllib_example_script_experiment + + run_rllib_example_script_experiment(config, args, stop=stop) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/separate_vf_lr_and_optimizer.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/separate_vf_lr_and_optimizer.py new file mode 100644 index 0000000000000000000000000000000000000000..1e5359f1162b9b280e08273c81cb10284d1d8d19 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/separate_vf_lr_and_optimizer.py @@ -0,0 +1,135 @@ +"""Example of how to run any value function based algo (e.g. PPO) with 2 optimizers. + +One optimizer (with its own learning rate and other configurations) is responsible for +updating the policy network, the other (with its own learning rate and other +configurations) for updating the value function network. + +This example shows: + - how to subclass an existing (torch) Learner and override its + `configure_optimizers_for_module()` method. + - how to call `Learner.register_optimizer()` from within your custom + `configure_optimizers_for_module()` method in order to specify, which optimizer + (type, learning rate, other settings) is responsible for which neural network + parameters. + - how to add custom settings (here: the additional learning rate for the + vf-optimizer) to the `AlgorithmConfig` in order to not have to subclass and write + your own (you could still do that, but are not required to). + - how to plug in the custom Learner into your config and then run the + experiment. 
+ +See the :py:class:`~ray.rllib.examples.learners.classes.separate_vf_lr_and_optimizer_learner.PPOTorchLearnerWithSeparateVfOptimizer` # noqa +class for details on how to override the main (torch) `configure_optimizers_for_module` +function. + +We assume here that the users properly sets up their RLModule to have separate policy- +and value function networks. If any model pieces are shared between the two optimizers, +you should experience learning instability up to the point where your algorithm can't +learn any useful policy anymore. + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack --lr-vf=0.001 --lr-policy=0.0005` + +Use the `--lr-policy` option to set the policy learning rate (used by the policy +optimizer) and the `--lr-vf` option to set the value function learning rate (used by the +value function optimizer). + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. 
+ +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- +You should expect to observe decent learning behavior from your console output: + +With --lr-vf=0.0005 and --lr-policy=0.001 ++-----------------------------+------------+-----------------+--------+ +| Trial name | status | loc | iter | +| | | | | +|-----------------------------+------------+-----------------+--------+ +| PPO_CartPole-v1_7b404_00000 | TERMINATED | 127.0.0.1:16845 | 19 | ++-----------------------------+------------+-----------------+--------+ ++------------------+------------------------+---------------------+ +| total time (s) | num_env_steps_sampled_ | episode_return_mean | +| | _lifetime | | +|------------------+------------------------+---------------------+ +| 19.4179 | 76000 | 459.94 | ++------------------+------------------------+---------------------+ +""" + +from ray.rllib.algorithms.ppo import PPOConfig +from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig +from ray.rllib.examples.learners.classes.separate_vf_lr_and_optimizer_learner import ( + PPOTorchLearnerWithSeparateVfOptimizer, +) +from ray.rllib.utils.framework import try_import_torch +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) + +torch, _ = try_import_torch() + + +parser = add_rllib_example_script_args(default_reward=450.0) +parser.set_defaults(enable_new_api_stack=True) +parser.add_argument( + "--lr-vf", + type=float, + default=0.0005, + help="The learning rate used in the value function optimizer.", +) +parser.add_argument( + "--lr-policy", + type=float, + default=0.001, + help="The learning rate used in the policy optimizer.", +) + + +if __name__ == "__main__": + args = parser.parse_args() + + assert ( + args.enable_new_api_stack + ), "Must set 
--enable-new-api-stack when running this script!" + assert args.algo == "PPO", "Must set --algo=PPO when running this script!" + + base_config = ( + PPOConfig() + .environment("CartPole-v1") + .training( + # This is the most important setting in this script: We point our PPO + # algorithm to use the custom Learner (instead of the default + # PPOTorchLearner). + learner_class=PPOTorchLearnerWithSeparateVfOptimizer, + # We use this simple method here to inject a new setting that our + # custom Learner class uses in its `configure_optimizers_for_module` + # method. This is convenient and avoids having to subclass `PPOConfig` only + # to add a few new settings to it. Within our Learner, we can access this + # new setting through: + # `self.config.learner_config_dict['lr_vf']` + learner_config_dict={"lr_vf": args.lr_vf}, + # Some settings to make this example learn better. + num_epochs=6, + # Since we are using separate optimizers for the two NN components, the + # value of `vf_loss_coeff` does not matter anymore. We set this to 1.0 here. + vf_loss_coeff=1.0, + # The policy learning rate, settable through the command line `--lr` arg. + lr=args.lr_policy, + ) + .rl_module( + # Another very important setting is this here. Make sure you use + # completely separate NNs for policy and value-functions. 
+ model_config=DefaultModelConfig(vf_share_layers=False), + ) + ) + + run_rllib_example_script_experiment(base_config, args) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..338b7e115c41286dd2ddae59a801d57eb6f4b022 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/__pycache__/cartpole_recording.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/__pycache__/cartpole_recording.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..50cbb9d416c7f77693f1cabf7859804b36dec33b Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/__pycache__/cartpole_recording.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/__pycache__/custom_input_api.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/__pycache__/custom_input_api.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ff9c77c18542a42c5fd0e9c5686443df15015506 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/__pycache__/custom_input_api.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/__pycache__/offline_rl.cpython-311.pyc 
b/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/__pycache__/offline_rl.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9192020b4ab9f9e6534f7bbc6e24524c65b3ef96 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/__pycache__/offline_rl.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/__pycache__/offline_rl_with_image_data.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/__pycache__/offline_rl_with_image_data.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..91f967836a1553fc15426922b57a03884e4d2ef8 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/__pycache__/offline_rl_with_image_data.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/__pycache__/pretrain_bc_single_agent_evaluate_as_multi_agent.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/__pycache__/pretrain_bc_single_agent_evaluate_as_multi_agent.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c77868e595c8e5d126c1e7df9447b8d6f4480ef7 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/__pycache__/pretrain_bc_single_agent_evaluate_as_multi_agent.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/__pycache__/saving_experiences.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/__pycache__/saving_experiences.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..df24b9257002faf07a3f1a2e20bbd5b6171c2143 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/__pycache__/saving_experiences.cpython-311.pyc differ diff --git 
a/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/__pycache__/train_w_bc_finetune_w_ppo.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/__pycache__/train_w_bc_finetune_w_ppo.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..af6b9a73b4177be1d0918da1aeadd415b5a55529 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/__pycache__/train_w_bc_finetune_w_ppo.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/classes/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/classes/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/classes/__pycache__/image_offline_data.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/classes/__pycache__/image_offline_data.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..111ef7d48d2ee29b8bed3e6910beda10ad5e1a63 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/classes/__pycache__/image_offline_data.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/classes/__pycache__/image_offline_prelearner.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/classes/__pycache__/image_offline_prelearner.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5e606c31a156bd30ab51159d098e33ce8a5a0fe4 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/classes/__pycache__/image_offline_prelearner.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/classes/image_offline_data.py 
b/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/classes/image_offline_data.py new file mode 100644 index 0000000000000000000000000000000000000000..4f4ab5f5116fa3e625aa5440bb36ed28c4728e29 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/classes/image_offline_data.py @@ -0,0 +1,73 @@ +import io +import logging +import numpy as np + +from PIL import Image +from typing import Any, Dict + +from ray import data +from ray.rllib.algorithms.algorithm_config import AlgorithmConfig +from ray.rllib.offline.offline_data import OfflineData +from ray.rllib.offline.offline_prelearner import OfflinePreLearner +from ray.rllib.utils.annotations import override + +logger = logging.getLogger(__name__) + + +class ImageOfflineData(OfflineData): + """This class overrides `OfflineData` to read in raw image data. + + The image data is from Ray Data`s S3 example bucket, namely + `ray-example-data/batoidea/JPEGImages/`. + To read in this data the raw bytes have to be decoded and then + converted to `numpy` arrays. Each image array has a dimension + (32, 32, 3). + + To just read in the raw image data and convert it to arrays it + suffices to override the `OfflineData.__init__` method only. + Note, that further transformations of the data - specifically + into `SingleAgentEpisode` data - will be performed in a custom + `OfflinePreLearner` defined in the `image_offline_prelearner` + file. You could hard-code the usage of this prelearner here, + but you will use the `prelearner_class` attribute in the + `AlgorithmConfig` instead. + """ + + @override(OfflineData) + def __init__(self, config: AlgorithmConfig): + + # Set class attributes. + self.config = config + self.is_multi_agent = self.config.is_multi_agent + self.materialize_mapped_data = False + self.path = self.config.input_ + + self.data_read_batch_size = self.config.input_read_batch_size + self.data_is_mapped = False + + # Define your function to map images to numpy arrays. 
+ def map_to_numpy(row: Dict[str, Any]) -> Dict[str, Any]: + # Convert to byte stream. + bytes_stream = io.BytesIO(row["bytes"]) + # Convert to image. + image = Image.open(bytes_stream) + # Return an array of the image. + return {"array": np.array(image)} + + try: + # Load the dataset and transform to arrays on-the-fly. + self.data = data.read_binary_files(self.path).map(map_to_numpy) + except Exception as e: + logger.error(e) + + # Define further attributes needed in the `sample` method. + self.batch_iterator = None + self.map_batches_kwargs = self.config.map_batches_kwargs + self.iter_batches_kwargs = self.config.iter_batches_kwargs + # Use a custom OfflinePreLearner if needed. + self.prelearner_class = self.config.prelearner_class or OfflinePreLearner + + # For remote learner setups. + self.locality_hints = None + self.learner_handles = None + self.module_spec = None diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/classes/image_offline_prelearner.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/classes/image_offline_prelearner.py new file mode 100644 index 0000000000000000000000000000000000000000..001af304929ec2554fb6f3936ec788f42ab2f261 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/classes/image_offline_prelearner.py @@ -0,0 +1,101 @@ +import gymnasium as gym +import numpy as np +import random +import uuid + +from typing import Any, Dict, List, Optional, Tuple, Union + +from ray.actor import ActorHandle +from ray.rllib.algorithms.algorithm_config import AlgorithmConfig +from ray.rllib.core.learner.learner import Learner +from ray.rllib.core.rl_module.multi_rl_module import MultiRLModuleSpec +from ray.rllib.env.single_agent_episode import SingleAgentEpisode +from ray.rllib.offline.offline_prelearner import OfflinePreLearner, SCHEMA +from ray.rllib.utils.annotations import override +from ray.rllib.utils.typing import EpisodeType, ModuleID + + +class 
ImageOfflinePreLearner(OfflinePreLearner): + """This class transforms image data to `MultiAgentBatch`es. + + While the `ImageOfflineData` class transforms raw image + bytes to `numpy` arrays, this class maps these data in + `SingleAgentEpisode` instances through the learner connector + pipeline and finally outputs a >`MultiAgentBatch` ready for + training in RLlib's `Learner`s. + + Note, the basic transformation from images to `SingleAgentEpisode` + instances creates synthetic data that does not rely on any MDP + and therefore no agent can learn from it. However, this example + should show how to transform data into this form through + overriding the `OfflinePreLearner`. + """ + + def __init__( + self, + config: "AlgorithmConfig", + learner: Union[Learner, List[ActorHandle]], + spaces: Optional[Tuple[gym.Space, gym.Space]] = None, + module_spec: Optional[MultiRLModuleSpec] = None, + module_state: Optional[Dict[ModuleID, Any]] = None, + **kwargs: Dict[str, Any], + ): + # Set up necessary class attributes. + self.config = config + self.action_space = spaces[1] + self.observation_space = spaces[0] + self.input_read_episodes = self.config.input_read_episodes + self.input_read_sample_batches = self.config.input_read_sample_batches + self._policies_to_train = "default_policy" + self._is_multi_agent = False + + # Build the `MultiRLModule` needed for the learner connector. + self._module = module_spec.build() + + # Build the learner connector pipeline. 
+ self._learner_connector = self.config.build_learner_connector( + input_observation_space=self.observation_space, + input_action_space=self.action_space, + ) + + @override(OfflinePreLearner) + @staticmethod + def _map_to_episodes( + is_multi_agent: bool, + batch: Dict[str, Union[list, np.ndarray]], + schema: Dict[str, str] = SCHEMA, + to_numpy: bool = False, + input_compress_columns: Optional[List[str]] = None, + observation_space: gym.Space = None, + action_space: gym.Space = None, + **kwargs: Dict[str, Any], + ) -> Dict[str, List[EpisodeType]]: + + # Define a container for the episodes. + episodes = [] + + # Batches come in as numpy arrays. + for i, obs in enumerate(batch["array"]): + + # Construct your episode. + episode = SingleAgentEpisode( + id_=uuid.uuid4().hex, + observations=[obs, obs], + observation_space=observation_space, + actions=[action_space.sample()], + action_space=action_space, + rewards=[random.random()], + terminated=True, + truncated=False, + len_lookback_buffer=0, + t_started=0, + ) + + # Numpy'ize, if necessary. + if to_numpy: + episode.to_numpy() + + # Store the episode in the container. + episodes.append(episode) + + return {"episodes": episodes} diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/custom_input_api.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/custom_input_api.py new file mode 100644 index 0000000000000000000000000000000000000000..789e64a2a357ee64069081cdf2d0a69ac975d5c7 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/custom_input_api.py @@ -0,0 +1,134 @@ +# @OldAPIStack + +"""Example of creating a custom input API + +Custom input apis are useful when your data source is in a custom format or +when it is necessary to use an external data loading mechanism. +In this example, we train an rl agent on user specified input data. 
+Instead of using the built in JsonReader, we will create our own custom input +api, and show how to pass config arguments to it. + +To train CQL on the pendulum environment: +$ python custom_input_api.py --input-files=../tests/data/pendulum/enormous.zip +""" + +import argparse +import os + +import ray +from ray import air, tune +from ray.air.constants import TRAINING_ITERATION +from ray.rllib.offline import JsonReader, ShuffledInput, IOContext, InputReader +from ray.rllib.utils.metrics import ( + ENV_RUNNER_RESULTS, + EPISODE_RETURN_MEAN, + EVALUATION_RESULTS, +) +from ray.tune.registry import get_trainable_cls, register_input + +parser = argparse.ArgumentParser() +parser.add_argument( + "--run", type=str, default="CQL", help="The RLlib-registered algorithm to use." +) +parser.add_argument( + "--framework", + choices=["tf", "tf2", "torch"], + default="torch", + help="The DL framework specifier.", +) +parser.add_argument("--stop-iters", type=int, default=100) +parser.add_argument( + "--input-files", + type=str, + default=os.path.join( + os.path.dirname(os.path.abspath(__file__)), + "../../tests/data/pendulum/small.json", + ), +) + + +class CustomJsonReader(JsonReader): + """ + Example custom InputReader implementation (extended from JsonReader). + + This gets wrapped in ShuffledInput to comply with offline rl algorithms. + """ + + def __init__(self, ioctx: IOContext): + """ + The constructor must take an IOContext to be used in the input config. + Args: + ioctx: use this to access the `input_config` arguments. + """ + super().__init__(ioctx.input_config["input_files"], ioctx) + + +def input_creator(ioctx: IOContext) -> InputReader: + """ + The input creator method can be used in the input registry or set as the + config["input"] parameter. + + Args: + ioctx: use this to access the `input_config` arguments. 
+ + Returns: + instance of ShuffledInput to work with some offline rl algorithms + """ + return ShuffledInput(CustomJsonReader(ioctx)) + + +if __name__ == "__main__": + ray.init() + args = parser.parse_args() + + # make absolute path because relative path looks in result directory + args.input_files = os.path.abspath(args.input_files) + + # we register our custom input creator with this convenient function + register_input("custom_input", input_creator) + + # Config modified from rllib/tuned_examples/cql/pendulum-cql.yaml + default_config = get_trainable_cls(args.run).get_default_config() + config = ( + default_config.environment("Pendulum-v1", clip_actions=True) + .framework(args.framework) + .offline_data( + # We can either use the tune registry ... + input_="custom_input", + # ... full classpath + # input_: "ray.rllib.examples.offline_rl.custom_input_api.CustomJsonReader" + # ... or a direct function to connect our input api. + # input_: input_creator + input_config={"input_files": args.input_files}, # <- passed to IOContext + actions_in_input_normalized=True, + ) + .training(train_batch_size=2000) + .evaluation( + evaluation_interval=1, + evaluation_num_env_runners=2, + evaluation_duration=10, + evaluation_parallel_to_training=True, + evaluation_config=default_config.overrides( + input_="sampler", + explore=False, + ), + ) + .reporting(metrics_num_episodes_for_smoothing=5) + ) + + if args.run == "CQL": + config.training( + twin_q=True, + num_steps_sampled_before_learning_starts=0, + bc_iters=100, + ) + + stop = { + TRAINING_ITERATION: args.stop_iters, + f"{EVALUATION_RESULTS}/{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": -600, + } + + tuner = tune.Tuner( + args.run, param_space=config, run_config=air.RunConfig(stop=stop, verbose=1) + ) + tuner.fit() diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/offline_rl.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/offline_rl.py new file mode 100644 index 
0000000000000000000000000000000000000000..5679fc1ac63b3eb403864ef94d1245736623388b --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/offline_rl.py @@ -0,0 +1,167 @@ +# @OldAPIStack + +"""Example on how to use CQL to learn from an offline JSON file. + +Important node: Make sure that your offline data file contains only +a single timestep per line to mimic the way SAC pulls samples from +the buffer. + +Generate the offline json file by running an SAC algo until it reaches expert +level on your command line. For example: +$ cd ray +$ rllib train -f rllib/tuned_examples/sac/pendulum-sac.yaml --no-ray-ui + +Also make sure that in the above SAC yaml file (pendulum-sac.yaml), +you specify an additional "output" key with any path on your local +file system. In that path, the offline json files will be written to. + +Use the generated file(s) as "input" in the CQL config below +(`config["input"] = [list of your json files]`), then run this script. +""" + +import argparse +import numpy as np + +from ray.rllib.policy.sample_batch import convert_ma_batch_to_sample_batch +from ray.rllib.algorithms import cql as cql +from ray.rllib.execution.rollout_ops import ( + synchronous_parallel_sample, +) +from ray.rllib.utils.framework import try_import_torch +from ray.rllib.utils.metrics import ( + ENV_RUNNER_RESULTS, + EPISODE_RETURN_MEAN, + EVALUATION_RESULTS, +) + +torch, _ = try_import_torch() + +parser = argparse.ArgumentParser() +parser.add_argument( + "--as-test", + action="store_true", + help="Whether this script should be run as a test: --stop-reward must " + "be achieved within --stop-timesteps AND --stop-iters.", +) +parser.add_argument( + "--stop-iters", type=int, default=5, help="Number of iterations to train." +) +parser.add_argument( + "--stop-reward", type=float, default=50.0, help="Reward at which we stop training." 
+) + + +if __name__ == "__main__": + args = parser.parse_args() + + # See rllib/tuned_examples/cql/pendulum-cql.yaml for comparison. + config = ( + cql.CQLConfig() + .api_stack( + enable_env_runner_and_connector_v2=False, + enable_rl_module_and_learner=False, + ) + .framework(framework="torch") + .env_runners(num_env_runners=0) + .training( + n_step=3, + bc_iters=0, + clip_actions=False, + tau=0.005, + target_entropy="auto", + q_model_config={ + "fcnet_hiddens": [256, 256], + "fcnet_activation": "relu", + }, + policy_model_config={ + "fcnet_hiddens": [256, 256], + "fcnet_activation": "relu", + }, + optimization_config={ + "actor_learning_rate": 3e-4, + "critic_learning_rate": 3e-4, + "entropy_learning_rate": 3e-4, + }, + train_batch_size=256, + target_network_update_freq=1, + num_steps_sampled_before_learning_starts=256, + ) + .reporting(min_train_timesteps_per_iteration=1000) + .debugging(log_level="INFO") + .environment("Pendulum-v1", normalize_actions=True) + .offline_data( + input_config={ + "paths": ["tests/data/pendulum/enormous.zip"], + "format": "json", + } + ) + .evaluation( + evaluation_num_env_runners=1, + evaluation_interval=1, + evaluation_duration=10, + evaluation_parallel_to_training=False, + evaluation_config=cql.CQLConfig.overrides(input_="sampler"), + ) + ) + # evaluation_parallel_to_training should be False b/c iterations are very long + # and this would cause evaluation to lag one iter behind training. + + # Check, whether we can learn from the given file in `num_iterations` + # iterations, up to a reward of `min_reward`. + num_iterations = 5 + min_reward = -300 + + cql_algorithm = cql.CQL(config=config) + learnt = False + for i in range(num_iterations): + print(f"Iter {i}") + eval_results = cql_algorithm.train().get(EVALUATION_RESULTS) + if eval_results: + print( + "... R={}".format(eval_results[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN]) + ) + # Learn until some reward is reached on an actual live env. 
+ if eval_results[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN] >= min_reward: + # Test passed gracefully. + if args.as_test: + print("Test passed after {} iterations.".format(i)) + quit(0) + learnt = True + break + + # Get policy and model. + cql_policy = cql_algorithm.get_policy() + cql_model = cql_policy.model + + # If you would like to query CQL's learnt Q-function for arbitrary + # (cont.) actions, do the following: + obs_batch = torch.from_numpy(np.random.random(size=(5, 3))) + action_batch = torch.from_numpy(np.random.random(size=(5, 1))) + q_values = cql_model.get_q_values(obs_batch, action_batch)[0] + # If you are using the "twin_q", there'll be 2 Q-networks and + # we usually consider the min of the 2 outputs, like so: + twin_q_values = cql_model.get_twin_q_values(obs_batch, action_batch)[0] + final_q_values = torch.min(q_values, twin_q_values)[0] + print(f"final_q_values={final_q_values.detach().numpy()}") + + # Example on how to do evaluation on the trained Algorithm. + # using the data from our buffer. + # Get a sample (MultiAgentBatch). + + batch = synchronous_parallel_sample(worker_set=cql_algorithm.env_runner_group) + batch = convert_ma_batch_to_sample_batch(batch) + obs = torch.from_numpy(batch["obs"]) + # Pass the observations through our model to get the + # features, which then to pass through the Q-head. + model_out, _ = cql_model({"obs": obs}) + # The estimated Q-values from the (historic) actions in the batch. + q_values_old = cql_model.get_q_values( + model_out, torch.from_numpy(batch["actions"]) + )[0] + # The estimated Q-values for the new actions computed by our policy. 
+ actions_new = cql_policy.compute_actions_from_input_dict({"obs": obs})[0] + q_values_new = cql_model.get_q_values(model_out, torch.from_numpy(actions_new))[0] + print(f"Q-val batch={q_values_old.detach().numpy()}") + print(f"Q-val policy={q_values_new.detach().numpy()}") + + cql_algorithm.stop() diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/train_w_bc_finetune_w_ppo.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/train_w_bc_finetune_w_ppo.py new file mode 100644 index 0000000000000000000000000000000000000000..f3e52b35526dcd95abdc4efd39769b3d86a8a5b2 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/train_w_bc_finetune_w_ppo.py @@ -0,0 +1,300 @@ +"""Example of training a custom RLModule with BC first, then finetuning it with PPO. + +This example: + - demonstrates how to write a very simple custom BC RLModule. + - run a quick BC training experiment with the custom module and learn CartPole + until some episode return A, while checkpointing each iteration. + - shows how subclass the custom BC RLModule, add the ValueFunctionAPI to the + new class, and add a value-function branch and an implementation of + `compute_values` to the original model to make it work with a value-based algo + like PPO. + - shows how to plug this new PPO-capable RLModule (including its checkpointed state + from the BC run) into your algorithm's config. + - confirms that even after 1-2 training iterations with PPO, no catastrophic + forgetting occurs (due to the additional value function branch and the switched + optimizer). + - uses Tune and RLlib to continue training the model until a higher return of B + is reached. 
+ + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack` + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- +In the console output, you can first see BC's performance until return A is reached: ++----------------------------+------------+----------------+--------+ +| Trial name | status | loc | iter | +| | | | | +|----------------------------+------------+----------------+--------+ +| BC_CartPole-v1_95ba0_00000 | TERMINATED | 127.0.0.1:1515 | 51 | ++----------------------------+------------+----------------+--------+ ++------------------+------------------------+------------------------+ +| total time (s) | episode_return_mean | num_env_steps_traine | +| | | d_lifetime | +|------------------+------------------------|------------------------| +| 11.4828 | 250.5 | 42394 | ++------------------+------------------------+------------------------+ + +The script should confirm that no catastrophic forgetting has taken place: + +PPO return after initialization: 292.3 +PPO return after 2x training: 276.85 + +Then, after PPO training, you should see something like this (higher return): ++-----------------------------+------------+----------------+--------+ +| Trial name | status | loc | iter | +| | | | | +|-----------------------------+------------+----------------+--------+ +| PPO_CartPole-v1_e07ac_00000 | TERMINATED | 127.0.0.1:6032 | 37 | ++-----------------------------+------------+----------------+--------+ + ++------------------+------------------------+------------------------+ +| total time (s) | 
episode_return_mean    | num_episodes_lifetime  |
|                  |                        |                        |
+------------------+------------------------+------------------------+
|          32.7647 |                 450.76 |                    406 |
+------------------+------------------------+------------------------+
"""
from pathlib import Path

from torch import nn

from ray.rllib.algorithms.bc import BCConfig
from ray.rllib.algorithms.ppo import PPOConfig
from ray.rllib.core import (
    COMPONENT_LEARNER_GROUP,
    COMPONENT_LEARNER,
    COMPONENT_RL_MODULE,
)
from ray.rllib.core.columns import Columns
from ray.rllib.core.models.base import ENCODER_OUT
from ray.rllib.core.models.configs import MLPEncoderConfig, MLPHeadConfig
from ray.rllib.core.rl_module.apis.value_function_api import ValueFunctionAPI
from ray.rllib.core.rl_module.rl_module import RLModule, RLModuleSpec
from ray.rllib.core.rl_module.torch import TorchRLModule
from ray.rllib.utils.annotations import override
from ray.rllib.utils.metrics import (
    ENV_RUNNER_RESULTS,
    EPISODE_RETURN_MEAN,
    EVALUATION_RESULTS,
)
from ray.rllib.utils.test_utils import (
    add_rllib_example_script_args,
    run_rllib_example_script_experiment,
)

# Standard RLlib example-script CLI parser (provides --num-learners,
# --no-tune, --wandb-*, etc.); this example pins env and enables checkpointing
# every iteration so the best BC weights can be reloaded into PPO below.
parser = add_rllib_example_script_args()
parser.set_defaults(
    enable_new_api_stack=True,
    env="CartPole-v1",
    checkpoint_freq=1,
)


class MyBCModel(TorchRLModule):
    """A very simple BC-usable model that only computes action logits."""

    @override(TorchRLModule)
    def setup(self):
        """Builds the encoder trunk and the policy head.

        Called once by RLlib to construct the module's submodels; no value
        function exists here (BC does not need one).
        """
        # Create an encoder trunk.
        # Observations are directly passed through it and feature vectors are output.
        self._encoder = MLPEncoderConfig(
            input_dims=[4],  # CartPole
            hidden_layer_dims=[256, 256],
            hidden_layer_activation="relu",
            output_layer_dim=None,
        ).build(framework="torch")

        # The policy head sitting on top of the encoder. Feature vectors come in as
        # input and action logits are output.
        self._pi = MLPHeadConfig(
            input_dims=[256],  # from encoder
            hidden_layer_dims=[256],  # pi head
            hidden_layer_activation="relu",
            output_layer_dim=2,  # CartPole
            output_layer_activation="linear",
        ).build(framework="torch")

    @override(TorchRLModule)
    def _forward_inference(self, batch, **kwargs):
        """Returns a dict with action-distribution inputs (logits) for `batch`."""
        return {Columns.ACTION_DIST_INPUTS: self._pi(self._encoder(batch)[ENCODER_OUT])}

    @override(RLModule)
    def _forward_exploration(self, batch, **kwargs):
        # Exploration uses the exact same logits as inference.
        return self._forward_inference(batch)

    @override(RLModule)
    def _forward_train(self, batch, **kwargs):
        # BC trains on plain logits; same computation as inference.
        return self._forward_inference(batch)


class MyPPOModel(MyBCModel, ValueFunctionAPI):
    """Subclass of our simple BC model, but implementing the ValueFunctionAPI.

    Implementing the `compute_values` method makes this RLModule usable by algos
    like PPO.
    """

    @override(MyBCModel)
    def setup(self):
        """Builds encoder + pi head (via super) and adds a zero-initialized vf head."""
        # Call super setup to create encoder trunk and policy head.
        super().setup()
        # Create the new value function head and zero-initialize it to not cause too
        # much disruption.
        self._vf = MLPHeadConfig(
            input_dims=[256],  # from encoder
            hidden_layer_dims=[256],  # vf head
            hidden_layer_activation="relu",
            hidden_layer_weights_initializer=nn.init.zeros_,
            hidden_layer_bias_initializer=nn.init.zeros_,
            output_layer_dim=1,  # 1=value node
            output_layer_activation="linear",
            output_layer_weights_initializer=nn.init.zeros_,
            output_layer_bias_initializer=nn.init.zeros_,
        ).build(framework="torch")

    @override(MyBCModel)
    def _forward_train(self, batch, **kwargs):
        """Train forward pass: logits plus value predictions (both from one
        shared encoder pass)."""
        features = self._encoder(batch)[ENCODER_OUT]
        logits = self._pi(features)
        # Squeeze the trailing size-1 value dim so VF_PREDS is (batch,)-shaped.
        vf_out = self._vf(features).squeeze(-1)
        return {
            Columns.ACTION_DIST_INPUTS: logits,
            Columns.VF_PREDS: vf_out,
        }

    @override(ValueFunctionAPI)
    def compute_values(self, batch, embeddings=None):
        """Returns value estimates for `batch`; reuses `embeddings` if provided."""
        # Compute embeddings ...
        if embeddings is None:
            embeddings = self._encoder(batch)[ENCODER_OUT]
        # then values using our value head.
        return self._vf(embeddings).squeeze(-1)


if __name__ == "__main__":
    args = parser.parse_args()

    assert args.env == "CartPole-v1", "This example works only with --env=CartPole-v1!"

    # Define the data paths for our CartPole large dataset.
    # NOTE(review): assumes this script lives two directories below the rllib
    # root so that `tests/data/...` resolves — verify if the file is relocated.
    base_path = Path(__file__).parents[2]
    assert base_path.is_dir(), base_path
    data_path = base_path / "tests/data/cartpole/cartpole-v1_large"
    assert data_path.is_dir(), data_path
    print(f"data_path={data_path}")

    # Define the BC config.
    base_config = (
        BCConfig()
        # Note, the `input_` argument is the major argument for the
        # new offline API. Via the `input_read_method_kwargs` the
        # arguments for the `ray.data.Dataset` read method can be
        # configured. The read method needs at least as many blocks
        # as remote learners.
        .offline_data(
            input_=[data_path.as_posix()],
            # Define the number of reading blocks, these should be larger than 1
            # and aligned with the data size.
            input_read_method_kwargs={
                "override_num_blocks": max((args.num_learners or 1) * 2, 2)
            },
            # Concurrency defines the number of processes that run the
            # `map_batches` transformations. This should be aligned with the
            # 'prefetch_batches' argument in 'iter_batches_kwargs'.
            map_batches_kwargs={"concurrency": 2, "num_cpus": 2},
            # This data set is small so do not prefetch too many batches and use no
            # local shuffle.
            iter_batches_kwargs={
                "prefetch_batches": 1,
                "local_shuffle_buffer_size": None,
            },
            # The number of iterations to be run per learner when in multi-learner
            # mode in a single RLlib training iteration. Leave this to `None` to
            # run an entire epoch on the dataset during a single RLlib training
            # iteration. For single-learner mode, 1 is the only option.
            dataset_num_iters_per_learner=1 if not args.num_learners else None,
        ).training(
            train_batch_size_per_learner=1024,
            # To increase learning speed with multiple learners,
            # increase the learning rate correspondingly (sqrt scaling rule).
            lr=0.0008 * (args.num_learners or 1) ** 0.5,
        )
        # Plug in our simple custom BC model from above.
        .rl_module(rl_module_spec=RLModuleSpec(module_class=MyBCModel))
        # Run evaluation to observe how good our BC policy already is.
        .evaluation(
            evaluation_interval=3,
            evaluation_num_env_runners=1,
            evaluation_duration=5,
            evaluation_parallel_to_training=True,
        )
    )

    # Run the BC experiment and stop at R=250.0
    metric_key = f"{EVALUATION_RESULTS}/{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}"
    stop = {metric_key: 250.0}
    results = run_rllib_example_script_experiment(base_config, args, stop=stop)

    # Extract the RLModule checkpoint (subdirectory of the best trial's
    # algorithm checkpoint that holds only the default policy's module state).
    best_result = results.get_best_result(metric_key)
    rl_module_checkpoint = (
        Path(best_result.checkpoint.path)
        / COMPONENT_LEARNER_GROUP
        / COMPONENT_LEARNER
        / COMPONENT_RL_MODULE
        / "default_policy"
    )

    # Create a new PPO config.
    base_config = (
        PPOConfig()
        .environment(args.env)
        .training(
            # Keep lr relatively low at the beginning to avoid catastrophic forgetting.
            lr=0.00002,
            num_epochs=6,
            vf_loss_coeff=0.01,
        )
        # Plug in our simple custom PPO model from above. Note that the checkpoint
        # for the BC model is loadable into the PPO model, b/c the BC model is a subset
        # of the PPO model (all weights/biases in the BC model are also found in the PPO
        # model; the PPO model only has an additional value function branch).
        .rl_module(
            rl_module_spec=RLModuleSpec(
                module_class=MyPPOModel,
                load_state_path=rl_module_checkpoint,
            )
        )
    )

    # Quick test, whether initial performance in the loaded (now PPO) model is ok.
    ppo = base_config.build()
    eval_results = ppo.evaluate()
    R = eval_results[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN]
    assert R >= 200.0, f"Initial PPO performance bad! R={R} (expected 200.0+)."
    print(f"PPO return after initialization: {R}")
    # Check, whether training 2 times causes catastrophic forgetting.
    ppo.train()
    train_results = ppo.train()
    R = train_results[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN]
    assert R >= 250.0, f"PPO performance (training) bad! R={R} (expected 250.0+)."
    print(f"PPO return after 2x training: {R}")

    # Perform actual PPO training run (this time until 450.0 return).
    stop = {
        f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": 450.0,
    }
    run_rllib_example_script_experiment(base_config, args, stop=stop)