koichi12 commited on
Commit
c84597e
·
verified ·
1 Parent(s): fb8b131

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .venv/lib/python3.11/site-packages/ray/rllib/examples/__pycache__/__init__.cpython-311.pyc +0 -0
  2. .venv/lib/python3.11/site-packages/ray/rllib/examples/__pycache__/centralized_critic.cpython-311.pyc +0 -0
  3. .venv/lib/python3.11/site-packages/ray/rllib/examples/__pycache__/compute_adapted_gae_on_postprocess_trajectory.cpython-311.pyc +0 -0
  4. .venv/lib/python3.11/site-packages/ray/rllib/examples/__pycache__/quadx_waypoints.cpython-311.pyc +0 -0
  5. .venv/lib/python3.11/site-packages/ray/rllib/examples/__pycache__/replay_buffer_api.cpython-311.pyc +0 -0
  6. .venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/policy/__pycache__/cliff_walking_wall_policy.cpython-311.pyc +0 -0
  7. .venv/lib/python3.11/site-packages/ray/rllib/examples/actions/__init__.py +0 -0
  8. .venv/lib/python3.11/site-packages/ray/rllib/examples/actions/__pycache__/__init__.cpython-311.pyc +0 -0
  9. .venv/lib/python3.11/site-packages/ray/rllib/examples/actions/__pycache__/autoregressive_actions.cpython-311.pyc +0 -0
  10. .venv/lib/python3.11/site-packages/ray/rllib/examples/actions/autoregressive_actions.py +109 -0
  11. .venv/lib/python3.11/site-packages/ray/rllib/examples/actions/nested_action_spaces.py +86 -0
  12. .venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__init__.py +0 -0
  13. .venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/__init__.cpython-311.pyc +0 -0
  14. .venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/count_based_curiosity.cpython-311.pyc +0 -0
  15. .venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/euclidian_distance_based_curiosity.cpython-311.pyc +0 -0
  16. .venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/flatten_observations_dict_space.cpython-311.pyc +0 -0
  17. .venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/frame_stacking.cpython-311.pyc +0 -0
  18. .venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/mean_std_filtering.cpython-311.pyc +0 -0
  19. .venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/prev_actions_prev_rewards.cpython-311.pyc +0 -0
  20. .venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/__init__.py +0 -0
  21. .venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/__pycache__/__init__.cpython-311.pyc +0 -0
  22. .venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/__pycache__/count_based_curiosity.cpython-311.pyc +0 -0
  23. .venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/__pycache__/euclidian_distance_based_curiosity.cpython-311.pyc +0 -0
  24. .venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/__pycache__/protobuf_cartpole_observation_decoder.cpython-311.pyc +0 -0
  25. .venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/count_based_curiosity.py +92 -0
  26. .venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/euclidian_distance_based_curiosity.py +122 -0
  27. .venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/protobuf_cartpole_observation_decoder.py +80 -0
  28. .venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/count_based_curiosity.py +14 -0
  29. .venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/euclidian_distance_based_curiosity.py +14 -0
  30. .venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/flatten_observations_dict_space.py +154 -0
  31. .venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/frame_stacking.py +228 -0
  32. .venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/mean_std_filtering.py +198 -0
  33. .venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/prev_actions_prev_rewards.py +164 -0
  34. .venv/lib/python3.11/site-packages/ray/rllib/examples/evaluation/custom_evaluation.py +230 -0
  35. .venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/__init__.py +0 -0
  36. .venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/__pycache__/__init__.cpython-311.pyc +0 -0
  37. .venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/__pycache__/float16_training_and_inference.cpython-311.pyc +0 -0
  38. .venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/__pycache__/fractional_gpus_per_learner.cpython-311.pyc +0 -0
  39. .venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/__pycache__/gpus_on_env_runners.cpython-311.pyc +0 -0
  40. .venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/__pycache__/mixed_precision_training_float16_inference.cpython-311.pyc +0 -0
  41. .venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/float16_training_and_inference.py +250 -0
  42. .venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/fractional_gpus_per_learner.py +119 -0
  43. .venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/gpus_on_env_runners.py +85 -0
  44. .venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/mixed_precision_training_float16_inference.py +170 -0
  45. .venv/lib/python3.11/site-packages/ray/rllib/examples/learners/__init__.py +0 -0
  46. .venv/lib/python3.11/site-packages/ray/rllib/examples/learners/__pycache__/ppo_load_rl_modules.cpython-311.pyc +0 -0
  47. .venv/lib/python3.11/site-packages/ray/rllib/examples/learners/classes/__init__.py +0 -0
  48. .venv/lib/python3.11/site-packages/ray/rllib/examples/learners/classes/__pycache__/custom_ppo_loss_fn_learner.cpython-311.pyc +0 -0
  49. .venv/lib/python3.11/site-packages/ray/rllib/examples/learners/classes/__pycache__/separate_vf_lr_and_optimizer_learner.cpython-311.pyc +0 -0
  50. .venv/lib/python3.11/site-packages/ray/rllib/examples/learners/classes/__pycache__/vpg_torch_learner.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/ray/rllib/examples/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (191 Bytes). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/examples/__pycache__/centralized_critic.cpython-311.pyc ADDED
Binary file (14.7 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/examples/__pycache__/compute_adapted_gae_on_postprocess_trajectory.cpython-311.pyc ADDED
Binary file (6.98 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/examples/__pycache__/quadx_waypoints.cpython-311.pyc ADDED
Binary file (5.44 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/examples/__pycache__/replay_buffer_api.cpython-311.pyc ADDED
Binary file (3.21 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/policy/__pycache__/cliff_walking_wall_policy.cpython-311.pyc ADDED
Binary file (6.6 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/examples/actions/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/ray/rllib/examples/actions/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (199 Bytes). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/examples/actions/__pycache__/autoregressive_actions.cpython-311.pyc ADDED
Binary file (4.83 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/examples/actions/autoregressive_actions.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Example on how to define and run with an RLModule with a dependent action space.
2
+
3
+ This examples:
4
+ - Shows how to write a custom RLModule outputting autoregressive actions.
5
+ The RLModule class used here implements a prior distribution for the first couple
6
+ of actions and then uses the sampled actions to compute the parameters for and
7
+ sample from a posterior distribution.
8
+ - Shows how to configure a PPO algorithm to use the custom RLModule.
9
+ - Stops the training after 100k steps or when the mean episode return
10
+ exceeds -0.012 in evaluation, i.e. if the agent has learned to
11
+ synchronize its actions.
12
+
13
+ For details on the environment used, take a look at the `CorrelatedActionsEnv`
14
+ class. To receive an episode return over 100, the agent must learn how to synchronize
15
+ its actions.
16
+
17
+
18
+ How to run this script
19
+ ----------------------
20
+ `python [script file name].py --enable-new-api-stack --num-env-runners 2`
21
+
22
+ Control the number of `EnvRunner`s with the `--num-env-runners` flag. This
23
+ will increase the sampling speed.
24
+
25
+ For debugging, use the following additional command line options
26
+ `--no-tune --num-env-runners=0`
27
+ which should allow you to set breakpoints anywhere in the RLlib code and
28
+ have the execution stop there for inspection and debugging.
29
+
30
+ For logging to your WandB account, use:
31
+ `--wandb-key=[your WandB API key] --wandb-project=[some project name]
32
+ --wandb-run-name=[optional: WandB run name (within the defined project)]`
33
+
34
+
35
+ Results to expect
36
+ -----------------
37
+ You should reach an episode return of better than -0.5 quickly through a simple PPO
38
+ policy. The logic behind beating the env is roughly:
39
+
40
+ OBS: optimal a1: r1: optimal a2: r2:
41
+ -1 2 0 -1.0 0
42
+ -0.5 1/2 -0.5 -0.5/-1.5 0
43
+ 0 1 0 -1.0 0
44
+ 0.5 0/1 -0.5 -0.5/-1.5 0
45
+ 1 0 0 -1.0 0
46
+
47
+ Meaning, most of the time, you would receive a reward better than -0.5, but worse than
48
+ 0.0.
49
+
50
+ +--------------------------------------+------------+--------+------------------+
51
+ | Trial name | status | iter | total time (s) |
52
+ | | | | |
53
+ |--------------------------------------+------------+--------+------------------+
54
+ | PPO_CorrelatedActionsEnv_6660d_00000 | TERMINATED | 76 | 132.438 |
55
+ +--------------------------------------+------------+--------+------------------+
56
+ +------------------------+------------------------+------------------------+
57
+ | episode_return_mean | num_env_steps_sample | ...env_steps_sampled |
58
+ | | d_lifetime | _lifetime_throughput |
59
+ |------------------------+------------------------+------------------------|
60
+ | -0.43 | 152000 | 1283.48 |
61
+ +------------------------+------------------------+------------------------+
62
+ """
63
+
64
+ from ray.rllib.algorithms.ppo import PPOConfig
65
+ from ray.rllib.core.rl_module.rl_module import RLModuleSpec
66
+ from ray.rllib.examples.envs.classes.correlated_actions_env import CorrelatedActionsEnv
67
+ from ray.rllib.examples.rl_modules.classes.autoregressive_actions_rlm import (
68
+ AutoregressiveActionsRLM,
69
+ )
70
+ from ray.rllib.utils.test_utils import (
71
+ add_rllib_example_script_args,
72
+ run_rllib_example_script_experiment,
73
+ )
74
+
75
+
76
+ parser = add_rllib_example_script_args(
77
+ default_iters=1000,
78
+ default_timesteps=2000000,
79
+ default_reward=-0.45,
80
+ )
81
+ parser.set_defaults(enable_new_api_stack=True)
82
+
83
+
84
+ if __name__ == "__main__":
85
+ args = parser.parse_args()
86
+
87
+ if args.algo != "PPO":
88
+ raise ValueError(
89
+ "This example script only runs with PPO! Set --algo=PPO on the command "
90
+ "line."
91
+ )
92
+
93
+ base_config = (
94
+ PPOConfig()
95
+ .environment(CorrelatedActionsEnv)
96
+ .training(
97
+ train_batch_size_per_learner=2000,
98
+ num_epochs=12,
99
+ minibatch_size=256,
100
+ entropy_coeff=0.005,
101
+ lr=0.0003,
102
+ )
103
+ # Specify the RLModule class to be used.
104
+ .rl_module(
105
+ rl_module_spec=RLModuleSpec(module_class=AutoregressiveActionsRLM),
106
+ )
107
+ )
108
+
109
+ run_rllib_example_script_experiment(base_config, args)
.venv/lib/python3.11/site-packages/ray/rllib/examples/actions/nested_action_spaces.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from gymnasium.spaces import Dict, Tuple, Box, Discrete, MultiDiscrete
2
+
3
+ from ray.tune.registry import register_env
4
+ from ray.rllib.connectors.env_to_module import FlattenObservations
5
+ from ray.rllib.examples.envs.classes.multi_agent import (
6
+ MultiAgentNestedSpaceRepeatAfterMeEnv,
7
+ )
8
+ from ray.rllib.examples.envs.classes.nested_space_repeat_after_me_env import (
9
+ NestedSpaceRepeatAfterMeEnv,
10
+ )
11
+ from ray.rllib.utils.test_utils import (
12
+ add_rllib_example_script_args,
13
+ run_rllib_example_script_experiment,
14
+ )
15
+ from ray.tune.registry import get_trainable_cls
16
+
17
+
18
+ # Read in common example script command line arguments.
19
+ parser = add_rllib_example_script_args(default_timesteps=200000, default_reward=-500.0)
20
+ parser.set_defaults(enable_new_api_stack=True)
21
+
22
+
23
+ if __name__ == "__main__":
24
+ args = parser.parse_args()
25
+
26
+ # Define env-to-module-connector pipeline for the new stack.
27
+ def _env_to_module_pipeline(env):
28
+ return FlattenObservations(multi_agent=args.num_agents > 0)
29
+
30
+ # Register our environment with tune.
31
+ if args.num_agents > 0:
32
+ register_env(
33
+ "env",
34
+ lambda c: MultiAgentNestedSpaceRepeatAfterMeEnv(
35
+ config=dict(c, **{"num_agents": args.num_agents})
36
+ ),
37
+ )
38
+ else:
39
+ register_env("env", lambda c: NestedSpaceRepeatAfterMeEnv(c))
40
+
41
+ # Define the AlgorithmConfig used.
42
+ base_config = (
43
+ get_trainable_cls(args.algo)
44
+ .get_default_config()
45
+ .environment(
46
+ "env",
47
+ env_config={
48
+ "space": Dict(
49
+ {
50
+ "a": Tuple(
51
+ [Dict({"d": Box(-15.0, 3.0, ()), "e": Discrete(3)})]
52
+ ),
53
+ "b": Box(-10.0, 10.0, (2,)),
54
+ "c": MultiDiscrete([3, 3]),
55
+ "d": Discrete(2),
56
+ }
57
+ ),
58
+ "episode_len": 100,
59
+ },
60
+ )
61
+ .env_runners(env_to_module_connector=_env_to_module_pipeline)
62
+ # No history in Env (bandit problem).
63
+ .training(
64
+ gamma=0.0,
65
+ lr=0.0005,
66
+ )
67
+ )
68
+
69
+ # Add a simple multi-agent setup.
70
+ if args.num_agents > 0:
71
+ base_config.multi_agent(
72
+ policies={f"p{i}" for i in range(args.num_agents)},
73
+ policy_mapping_fn=lambda aid, *a, **kw: f"p{aid}",
74
+ )
75
+
76
+ # Fix some PPO-specific settings.
77
+ if args.algo == "PPO":
78
+ base_config.training(
79
+ # We don't want high entropy in this Env.
80
+ entropy_coeff=0.00005,
81
+ num_epochs=4,
82
+ vf_loss_coeff=0.01,
83
+ )
84
+
85
+ # Run everything as configured.
86
+ run_rllib_example_script_experiment(base_config, args)
.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (202 Bytes). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/count_based_curiosity.cpython-311.pyc ADDED
Binary file (719 Bytes). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/euclidian_distance_based_curiosity.cpython-311.pyc ADDED
Binary file (745 Bytes). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/flatten_observations_dict_space.cpython-311.pyc ADDED
Binary file (7.3 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/frame_stacking.cpython-311.pyc ADDED
Binary file (9.67 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/mean_std_filtering.cpython-311.pyc ADDED
Binary file (8.85 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/prev_actions_prev_rewards.cpython-311.pyc ADDED
Binary file (8.54 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (210 Bytes). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/__pycache__/count_based_curiosity.cpython-311.pyc ADDED
Binary file (4.54 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/__pycache__/euclidian_distance_based_curiosity.cpython-311.pyc ADDED
Binary file (5.88 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/__pycache__/protobuf_cartpole_observation_decoder.cpython-311.pyc ADDED
Binary file (3.86 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/count_based_curiosity.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import Counter
2
+ from typing import Any, List, Optional
3
+
4
+ import gymnasium as gym
5
+
6
+ from ray.rllib.connectors.connector_v2 import ConnectorV2
7
+ from ray.rllib.core.rl_module.rl_module import RLModule
8
+ from ray.rllib.utils.typing import EpisodeType
9
+
10
+
11
+ class CountBasedCuriosity(ConnectorV2):
12
+ """Learner ConnectorV2 piece to compute intrinsic rewards based on obs counts.
13
+
14
+ Add this connector piece to your Learner pipeline, through your algo config:
15
+ ```
16
+ config.training(
17
+ learner_connector=lambda obs_sp, act_sp: CountBasedCuriosity()
18
+ )
19
+ ```
20
+
21
+ Intrinsic rewards are computed on the Learner side based on naive observation
22
+ counts, which is why this connector should only be used for simple environments
23
+ with a reasonable number of possible observations. The intrinsic reward for a given
24
+ timestep is:
25
+ r(i) = intrinsic_reward_coeff * (1 / C(obs(i)))
26
+ where C is the total (lifetime) count of the obs at timestep i.
27
+
28
+ The intrinsic reward is added to the extrinsic reward and saved back into the
29
+ episode (under the main "rewards" key).
30
+
31
+ Note that the computation and saving back to the episode all happens before the
32
+ actual train batch is generated from the episode data. Thus, the Learner and the
33
+ RLModule used do not take notice of the extra reward added.
34
+
35
+ If you would like to use a more sophisticated mechanism for intrinsic reward
36
+ computations, take a look at the `EuclidianDistanceBasedCuriosity` connector piece
37
+ at `ray.rllib.examples.connectors.classes.euclidian_distance_based_curiosity`
38
+ """
39
+
40
+ def __init__(
41
+ self,
42
+ input_observation_space: Optional[gym.Space] = None,
43
+ input_action_space: Optional[gym.Space] = None,
44
+ *,
45
+ intrinsic_reward_coeff: float = 1.0,
46
+ **kwargs,
47
+ ):
48
+ """Initializes a CountBasedCuriosity instance.
49
+
50
+ Args:
51
+ intrinsic_reward_coeff: The weight with which to multiply the intrinsic
52
+ reward before adding (and saving) it back to the main (extrinsic)
53
+ reward of the episode at each timestep.
54
+ """
55
+ super().__init__(input_observation_space, input_action_space)
56
+
57
+ # Naive observation counter.
58
+ self._counts = Counter()
59
+ self.intrinsic_reward_coeff = intrinsic_reward_coeff
60
+
61
+ def __call__(
62
+ self,
63
+ *,
64
+ rl_module: RLModule,
65
+ batch: Any,
66
+ episodes: List[EpisodeType],
67
+ explore: Optional[bool] = None,
68
+ shared_data: Optional[dict] = None,
69
+ **kwargs,
70
+ ) -> Any:
71
+ # Loop through all episodes and change the reward to
72
+ # [reward + intrinsic reward]
73
+ for sa_episode in self.single_agent_episode_iterator(
74
+ episodes=episodes, agents_that_stepped_only=False
75
+ ):
76
+ # Loop through all obs, except the last one.
77
+ observations = sa_episode.get_observations(slice(None, -1))
78
+ # Get all respective (extrinsic) rewards.
79
+ rewards = sa_episode.get_rewards()
80
+
81
+ for i, (obs, rew) in enumerate(zip(observations, rewards)):
82
+ obs = tuple(obs)
83
+ # Add 1 to obs counter.
84
+ self._counts[obs] += 1
85
+ # Compute our count-based intrinsic reward and add it to the main
86
+ # (extrinsic) reward.
87
+ rew += self.intrinsic_reward_coeff * (1 / self._counts[obs])
88
+ # Store the new reward back to the episode (under the correct
89
+ # timestep/index).
90
+ sa_episode.set_rewards(new_data=rew, at_indices=i)
91
+
92
+ return batch
.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/euclidian_distance_based_curiosity.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import deque
2
+ from typing import Any, List, Optional
3
+
4
+ import gymnasium as gym
5
+ import numpy as np
6
+
7
+ from ray.rllib.connectors.connector_v2 import ConnectorV2
8
+ from ray.rllib.core.rl_module.rl_module import RLModule
9
+ from ray.rllib.utils.typing import EpisodeType
10
+
11
+
12
+ class EuclidianDistanceBasedCuriosity(ConnectorV2):
13
+ """Learner ConnectorV2 piece computing intrinsic rewards with euclidian distance.
14
+
15
+ Add this connector piece to your Learner pipeline, through your algo config:
16
+ ```
17
+ config.training(
18
+ learner_connector=lambda obs_sp, act_sp: EuclidianDistanceBasedCuriosity()
19
+ )
20
+ ```
21
+
22
+ Intrinsic rewards are computed on the Learner side based on comparing the euclidian
23
+ distance of observations vs already seen ones. A configurable number of observations
24
+ will be stored in a FIFO buffer and all incoming observations have their distance
25
+ measured against those.
26
+
27
+ The minimum distance measured is the intrinsic reward for the incoming obs
28
+ (multiplied by a fixed coeffieicnt and added to the "main" extrinsic reward):
29
+ r(i) = intrinsic_reward_coeff * min(ED(o, o(i)) for o in stored_obs))
30
+ where `ED` is the euclidian distance and `stored_obs` is the buffer.
31
+
32
+ The intrinsic reward is then added to the extrinsic reward and saved back into the
33
+ episode (under the main "rewards" key).
34
+
35
+ Note that the computation and saving back to the episode all happens before the
36
+ actual train batch is generated from the episode data. Thus, the Learner and the
37
+ RLModule used do not take notice of the extra reward added.
38
+
39
+ Only one observation per incoming episode will be stored as a new one in the buffer.
40
+ Thereby, we pick the observation with the largest `min(ED)` value over all already
41
+ stored observations to be stored per episode.
42
+
43
+ If you would like to use a simpler, count-based mechanism for intrinsic reward
44
+ computations, take a look at the `CountBasedCuriosity` connector piece
45
+ at `ray.rllib.examples.connectors.classes.count_based_curiosity`
46
+ """
47
+
48
+ def __init__(
49
+ self,
50
+ input_observation_space: Optional[gym.Space] = None,
51
+ input_action_space: Optional[gym.Space] = None,
52
+ *,
53
+ intrinsic_reward_coeff: float = 1.0,
54
+ max_buffer_size: int = 100,
55
+ **kwargs,
56
+ ):
57
+ """Initializes a CountBasedCuriosity instance.
58
+
59
+ Args:
60
+ intrinsic_reward_coeff: The weight with which to multiply the intrinsic
61
+ reward before adding (and saving) it back to the main (extrinsic)
62
+ reward of the episode at each timestep.
63
+ """
64
+ super().__init__(input_observation_space, input_action_space)
65
+
66
+ # Create an observation buffer
67
+ self.obs_buffer = deque(maxlen=max_buffer_size)
68
+ self.intrinsic_reward_coeff = intrinsic_reward_coeff
69
+
70
+ self._test = 0
71
+
72
+ def __call__(
73
+ self,
74
+ *,
75
+ rl_module: RLModule,
76
+ batch: Any,
77
+ episodes: List[EpisodeType],
78
+ explore: Optional[bool] = None,
79
+ shared_data: Optional[dict] = None,
80
+ **kwargs,
81
+ ) -> Any:
82
+ if self._test > 10:
83
+ return batch
84
+ self._test += 1
85
+ # Loop through all episodes and change the reward to
86
+ # [reward + intrinsic reward]
87
+ for sa_episode in self.single_agent_episode_iterator(
88
+ episodes=episodes, agents_that_stepped_only=False
89
+ ):
90
+ # Loop through all obs, except the last one.
91
+ observations = sa_episode.get_observations(slice(None, -1))
92
+ # Get all respective (extrinsic) rewards.
93
+ rewards = sa_episode.get_rewards()
94
+
95
+ max_dist_obs = None
96
+ max_dist = float("-inf")
97
+ for i, (obs, rew) in enumerate(zip(observations, rewards)):
98
+ # Compare obs to all stored observations and compute euclidian distance.
99
+ min_dist = 0.0
100
+ if self.obs_buffer:
101
+ min_dist = min(
102
+ np.sqrt(np.sum((obs - stored_obs) ** 2))
103
+ for stored_obs in self.obs_buffer
104
+ )
105
+ if min_dist > max_dist:
106
+ max_dist = min_dist
107
+ max_dist_obs = obs
108
+
109
+ # Compute our euclidian distance-based intrinsic reward and add it to
110
+ # the main (extrinsic) reward.
111
+ rew += self.intrinsic_reward_coeff * min_dist
112
+ # Store the new reward back to the episode (under the correct
113
+ # timestep/index).
114
+ sa_episode.set_rewards(new_data=rew, at_indices=i)
115
+
116
+ # Add the one observation of this episode with the largest (min) euclidian
117
+ # dist to all already stored obs to the buffer (maybe throwing out the
118
+ # oldest obs in there).
119
+ if max_dist_obs is not None:
120
+ self.obs_buffer.append(max_dist_obs)
121
+
122
+ return batch
.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/protobuf_cartpole_observation_decoder.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any, List, Optional
2
+
3
+ import gymnasium as gym
4
+ import numpy as np
5
+
6
+ from ray.rllib.connectors.connector_v2 import ConnectorV2
7
+ from ray.rllib.core.rl_module.rl_module import RLModule
8
+ from ray.rllib.examples.envs.classes.utils.cartpole_observations_proto import (
9
+ CartPoleObservation,
10
+ )
11
+ from ray.rllib.utils.annotations import override
12
+ from ray.rllib.utils.typing import EpisodeType
13
+
14
+
15
+ class ProtobufCartPoleObservationDecoder(ConnectorV2):
16
+ """Env-to-module ConnectorV2 piece decoding protobuf obs into CartPole-v1 obs.
17
+
18
+ Add this connector piece to your env-to-module pipeline, through your algo config:
19
+ ```
20
+ config.env_runners(
21
+ env_to_module_connector=lambda env: ProtobufCartPoleObservationDecoder()
22
+ )
23
+ ```
24
+
25
+ The incoming observation space must be a 1D Box of dtype uint8
26
+ (which is the same as a binary string). The outgoing observation space is the
27
+ normal CartPole-v1 1D space: Box(-inf, inf, (4,), float32).
28
+ """
29
+
30
+ @override(ConnectorV2)
31
+ def recompute_output_observation_space(
32
+ self,
33
+ input_observation_space: gym.Space,
34
+ input_action_space: gym.Space,
35
+ ) -> gym.Space:
36
+ # Make sure the incoming observation space is a protobuf (binary string).
37
+ assert (
38
+ isinstance(input_observation_space, gym.spaces.Box)
39
+ and len(input_observation_space.shape) == 1
40
+ and input_observation_space.dtype.name == "uint8"
41
+ )
42
+ # Return CartPole-v1's natural observation space.
43
+ return gym.spaces.Box(float("-inf"), float("inf"), (4,), np.float32)
44
+
45
+ def __call__(
46
+ self,
47
+ *,
48
+ rl_module: RLModule,
49
+ batch: Any,
50
+ episodes: List[EpisodeType],
51
+ explore: Optional[bool] = None,
52
+ shared_data: Optional[dict] = None,
53
+ **kwargs,
54
+ ) -> Any:
55
+ # Loop through all episodes and change the observation from a binary string
56
+ # to an actual 1D np.ndarray (normal CartPole-v1 obs).
57
+ for sa_episode in self.single_agent_episode_iterator(episodes=episodes):
58
+ # Get last obs (binary string).
59
+ obs = sa_episode.get_observations(-1)
60
+ obs_bytes = obs.tobytes()
61
+ obs_protobuf = CartPoleObservation()
62
+ obs_protobuf.ParseFromString(obs_bytes)
63
+
64
+ # Set up the natural CartPole-v1 observation tensor from the protobuf
65
+ # values.
66
+ new_obs = np.array(
67
+ [
68
+ obs_protobuf.x_pos,
69
+ obs_protobuf.x_veloc,
70
+ obs_protobuf.angle_pos,
71
+ obs_protobuf.angle_veloc,
72
+ ],
73
+ np.float32,
74
+ )
75
+
76
+ # Write the new observation (1D tensor) back into the Episode.
77
+ sa_episode.set_observations(new_data=new_obs, at_indices=-1)
78
+
79
+ # Return `data` as-is.
80
+ return batch
.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/count_based_curiosity.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Placeholder for training with count-based curiosity.
2
+
3
+ The actual script can be found at a different location (see code below).
4
+ """
5
+
6
+ if __name__ == "__main__":
7
+ import subprocess
8
+ import sys
9
+
10
+ # Forward to "python ../curiosity/[same script name].py [same options]"
11
+ command = [sys.executable, "../curiosity/", sys.argv[0]] + sys.argv[1:]
12
+
13
+ # Run the script.
14
+ subprocess.run(command, capture_output=True)
.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/euclidian_distance_based_curiosity.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Placeholder for training with euclidian distance-based curiosity.
2
+
3
+ The actual script can be found at a different location (see code below).
4
+ """
5
+
6
+ if __name__ == "__main__":
7
+ import subprocess
8
+ import sys
9
+
10
+ # Forward to "python ../curiosity/[same script name].py [same options]"
11
+ command = [sys.executable, "../curiosity/", sys.argv[0]] + sys.argv[1:]
12
+
13
+ # Run the script.
14
+ subprocess.run(command, capture_output=True)
.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/flatten_observations_dict_space.py ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Example using a ConnectorV2 to flatten arbitrarily nested dict or tuple observations.
2
+
3
+ An RLlib Algorithm has 3 distinct connector pipelines:
4
+ - An env-to-module pipeline in an EnvRunner accepting a list of episodes and producing
5
+ a batch for an RLModule to compute actions (`forward_inference()` or
6
+ `forward_exploration()`).
7
+ - A module-to-env pipeline in an EnvRunner taking the RLModule's output and converting
8
+ it into an action readable by the environment.
9
+ - A learner connector pipeline on a Learner taking a list of episodes and producing
10
+ a batch for an RLModule to perform the training forward pass (`forward_train()`).
11
+
12
+ Each of these pipelines has a fixed set of default ConnectorV2 pieces that RLlib
13
+ adds/prepends to these pipelines in order to perform the most basic functionalities.
14
+ For example, RLlib adds the `AddObservationsFromEpisodesToBatch` ConnectorV2 into any
15
+ env-to-module pipeline to make sure the batch for computing actions contains - at the
16
+ minimum - the most recent observation.
17
+
18
+ On top of these default ConnectorV2 pieces, users can define their own ConnectorV2
19
+ pieces (or use the ones available already in RLlib) and add them to one of the 3
20
+ different pipelines described above, as required.
21
+
22
+ This example:
23
+ - shows how the `FlattenObservation` ConnectorV2 piece can be added to the
24
+ env-to-module pipeline.
25
+ - demonstrates that by using this connector, any arbitrarily nested dict or tuple
26
+ observations is properly flattened into a simple 1D tensor, for easier RLModule
27
+ processing.
28
+ - shows how - in a multi-agent setup - individual agents can be specified, whose
29
+ observations should be flattened (while other agents' observations will always
30
+ be left as-is).
31
+ - uses a variant of the CartPole-v1 environment, in which the 4 observation items
32
+ (x-pos, x-veloc, angle, and angle-veloc) are taken apart and put into a nested dict
33
+ with the structure:
34
+ {
35
+ "x-pos": [x-pos],
36
+ "angular-pos": {
37
+ "value": [angle],
38
+ "some_random_stuff": [random Discrete(3)], # <- should be ignored by algo
39
+ },
40
+ "velocs": Tuple([x-veloc], [angle-veloc]),
41
+ }
42
+
43
+
44
+ How to run this script
45
+ ----------------------
46
+ `python [script file name].py --enable-new-api-stack`
47
+
48
+ For debugging, use the following additional command line options
49
+ `--no-tune --num-env-runners=0`
50
+ which should allow you to set breakpoints anywhere in the RLlib code and
51
+ have the execution stop there for inspection and debugging.
52
+
53
+ For logging to your WandB account, use:
54
+ `--wandb-key=[your WandB API key] --wandb-project=[some project name]
55
+ --wandb-run-name=[optional: WandB run name (within the defined project)]`
56
+
57
+
58
+ Results to expect
59
+ -----------------
60
+
61
+ +---------------------+------------+----------------+--------+------------------+
62
+ | Trial name | status | loc | iter | total time (s) |
63
+ | | | | | |
64
+ |---------------------+------------+----------------+--------+------------------+
65
+ | PPO_env_a2fd6_00000 | TERMINATED | 127.0.0.1:7409 | 25 | 24.1426 |
66
+ +---------------------+------------+----------------+--------+------------------+
67
+ +------------------------+------------------------+------------------------+
68
+ | num_env_steps_sample | num_env_steps_traine | episode_return_mean |
69
+ | d_lifetime | d_lifetime | |
70
+ +------------------------+------------------------+------------------------|
71
+ | 100000 | 100000 | 421.42 |
72
+ +------------------------+------------------------+------------------------+
73
+ """
74
+ from ray.tune.registry import register_env
75
+ from ray.rllib.connectors.env_to_module import FlattenObservations
76
+ from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig
77
+ from ray.rllib.examples.envs.classes.cartpole_with_dict_observation_space import (
78
+ CartPoleWithDictObservationSpace,
79
+ )
80
+ from ray.rllib.examples.envs.classes.multi_agent import (
81
+ MultiAgentCartPoleWithDictObservationSpace,
82
+ )
83
+ from ray.rllib.utils.test_utils import (
84
+ add_rllib_example_script_args,
85
+ run_rllib_example_script_experiment,
86
+ )
87
+ from ray.tune.registry import get_trainable_cls
88
+
89
+
90
+ # Read in common example script command line arguments.
91
+ parser = add_rllib_example_script_args(default_timesteps=200000, default_reward=400.0)
92
+ parser.set_defaults(enable_new_api_stack=True)
93
+
94
+
95
+ if __name__ == "__main__":
96
+ args = parser.parse_args()
97
+
98
+ # Define env-to-module-connector pipeline for the new stack.
99
+ def _env_to_module_pipeline(env):
100
+ return FlattenObservations(multi_agent=args.num_agents > 0)
101
+
102
+ # Register our environment with tune.
103
+ if args.num_agents > 0:
104
+ register_env(
105
+ "env",
106
+ lambda _: MultiAgentCartPoleWithDictObservationSpace(
107
+ config={"num_agents": args.num_agents}
108
+ ),
109
+ )
110
+ else:
111
+ register_env("env", lambda _: CartPoleWithDictObservationSpace())
112
+
113
+ # Define the AlgorithmConfig used.
114
+ base_config = (
115
+ get_trainable_cls(args.algo)
116
+ .get_default_config()
117
+ .environment("env")
118
+ .env_runners(env_to_module_connector=_env_to_module_pipeline)
119
+ .training(
120
+ gamma=0.99,
121
+ lr=0.0003,
122
+ )
123
+ .rl_module(
124
+ model_config=DefaultModelConfig(
125
+ fcnet_hiddens=[32],
126
+ fcnet_activation="linear",
127
+ vf_share_layers=True,
128
+ ),
129
+ )
130
+ )
131
+
132
+ # Add a simple multi-agent setup.
133
+ if args.num_agents > 0:
134
+ base_config.multi_agent(
135
+ policies={f"p{i}" for i in range(args.num_agents)},
136
+ policy_mapping_fn=lambda aid, *a, **kw: f"p{aid}",
137
+ )
138
+
139
+ # PPO-specific settings (for better learning behavior only).
140
+ if args.algo == "PPO":
141
+ base_config.training(
142
+ num_epochs=6,
143
+ vf_loss_coeff=0.01,
144
+ )
145
+ # IMPALA-specific settings (for better learning behavior only).
146
+ elif args.algo == "IMPALA":
147
+ base_config.training(
148
+ lr=0.0005,
149
+ vf_loss_coeff=0.05,
150
+ entropy_coeff=0.0,
151
+ )
152
+
153
+ # Run everything as configured.
154
+ run_rllib_example_script_experiment(base_config, args)
.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/frame_stacking.py ADDED
@@ -0,0 +1,228 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Example using 2 ConnectorV2 for observation frame-stacking in Atari environments.
2
+
3
+ An RLlib Algorithm has 3 distinct connector pipelines:
4
+ - An env-to-module pipeline in an EnvRunner accepting a list of episodes and producing
5
+ a batch for an RLModule to compute actions (`forward_inference()` or
6
+ `forward_exploration()`).
7
+ - A module-to-env pipeline in an EnvRunner taking the RLModule's output and converting
8
+ it into an action readable by the environment.
9
+ - A learner connector pipeline on a Learner taking a list of episodes and producing
10
+ a batch for an RLModule to perform the training forward pass (`forward_train()`).
11
+
12
+ Each of these pipelines has a fixed set of default ConnectorV2 pieces that RLlib
13
+ adds/prepends to these pipelines in order to perform the most basic functionalities.
14
+ For example, RLlib adds the `AddObservationsFromEpisodesToBatch` ConnectorV2 into any
15
+ env-to-module pipeline to make sure the batch for computing actions contains - at the
16
+ minimum - the most recent observation.
17
+
18
+ On top of these default ConnectorV2 pieces, users can define their own ConnectorV2
19
+ pieces (or use the ones available already in RLlib) and add them to one of the 3
20
+ different pipelines described above, as required.
21
+
22
+ This example:
23
+ - shows how the `FrameStackingEnvToModule` ConnectorV2 piece can be added to the
24
+ env-to-module pipeline.
25
+ - shows how the `FrameStackingLearner` ConnectorV2 piece can be added to the
26
+ learner connector pipeline.
27
+ - demonstrates that using these two pieces (rather than performing framestacking
28
+ already inside the environment using a gymnasium wrapper) increases overall
29
+ performance by about 5%.
30
+
31
+
32
+ How to run this script
33
+ ----------------------
34
+ `python [script file name].py --enable-new-api-stack --num-frames=4 --env=ALE/Pong-v5`
35
+
36
+ Use the `--num-frames` option to define the number of observations to framestack.
37
+ If you don't want to use Connectors to perform the framestacking, set the
38
+ `--use-gym-wrapper-framestacking` flag to perform framestacking already inside a
39
+ gymnasium observation wrapper. In this case though, be aware that the tensors being
40
+ sent through the network are `--num-frames` x larger than if you use the Connector
41
+ setup.
42
+
43
+ For debugging, use the following additional command line options
44
+ `--no-tune --num-env-runners=0`
45
+ which should allow you to set breakpoints anywhere in the RLlib code and
46
+ have the execution stop there for inspection and debugging.
47
+
48
+ For logging to your WandB account, use:
49
+ `--wandb-key=[your WandB API key] --wandb-project=[some project name]
50
+ --wandb-run-name=[optional: WandB run name (within the defined project)]`
51
+
52
+
53
+ Results to expect
54
+ -----------------
55
+
56
+ With `--num-frames=4` and using the two extra ConnectorV2 pieces (in the env-to-module
57
+ and learner connector pipelines), you should see something like this using:
58
+ `--env ALE/Pong-v5 --num-learners=4 --num-gpus-per-learner=1 --num-env-runners=95`
59
+ +---------------------------+------------+--------+------------------+...
60
+ | Trial name | status | iter | total time (s) |
61
+ | | | | |
62
+ |---------------------------+------------+--------+------------------+...
63
+ | PPO_atari-env_2fc4a_00000 | TERMINATED | 200 | 335.837 |
64
+ +---------------------------+------------+--------+------------------+...
65
+
66
+ Note that the time to run these 200 iterations is about ~5% faster than when
67
+ performing framestacking already inside the environment (using a
68
+ `gymnasium.wrappers.ObservationWrapper`), due to the additional network traffic
69
+ needed (sending back 4x[obs] batches instead of 1x[obs] to the learners).
70
+
71
+ Thus, with the `--use-gym-wrapper-framestacking` option (all other options being equal),
72
+ the output looks like this:
73
+ +---------------------------+------------+--------+------------------+...
74
+ | Trial name | status | iter | total time (s) |
75
+ | | | | |
76
+ |---------------------------+------------+--------+------------------+...
77
+ | PPO_atari-env_2fc4a_00000 | TERMINATED | 200 | 351.505 |
78
+ +---------------------------+------------+--------+------------------+...
79
+ """
80
+ import gymnasium as gym
81
+
82
+ from ray.rllib.connectors.env_to_module.frame_stacking import FrameStackingEnvToModule
83
+ from ray.rllib.connectors.learner.frame_stacking import FrameStackingLearner
84
+ from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig
85
+ from ray.rllib.env.wrappers.atari_wrappers import wrap_atari_for_new_api_stack
86
+ from ray.rllib.examples.envs.classes.multi_agent import make_multi_agent
87
+ from ray.rllib.utils.test_utils import (
88
+ add_rllib_example_script_args,
89
+ run_rllib_example_script_experiment,
90
+ )
91
+ from ray.tune.registry import get_trainable_cls
92
+
93
+ # Read in common example script command line arguments.
94
+ parser = add_rllib_example_script_args(
95
+ default_timesteps=5000000, default_reward=20.0, default_iters=200
96
+ )
97
+ # Use Pong by default.
98
+ parser.set_defaults(
99
+ enable_new_api_stack=True,
100
+ env="ale_py:ALE/Pong-v5",
101
+ )
102
+ parser.add_argument(
103
+ "--num-frames",
104
+ type=int,
105
+ default=4,
106
+ help="The number of observation frames to stack.",
107
+ )
108
+ parser.add_argument(
109
+ "--use-gym-wrapper-framestacking",
110
+ action="store_true",
111
+ help="Whether to use RLlib's Atari wrapper's framestacking capabilities (as "
112
+ "opposed to doing it via a specific ConenctorV2 pipeline).",
113
+ )
114
+
115
+
116
+ if __name__ == "__main__":
117
+ from ray import tune
118
+
119
+ args = parser.parse_args()
120
+
121
+ # Define our custom connector pipelines.
122
+ def _make_env_to_module_connector(env):
123
+ # Create the env-to-module connector. We return an individual connector piece
124
+ # here, which RLlib automatically integrates into a pipeline (and
125
+ # add its default connector piece to the end of that pipeline).
126
+ # The default pipeline automatically fixes the input- and output spaces of the
127
+ # individual connector pieces in it.
128
+ # Note that since the frame stacking connector does NOT write information
129
+ # back to the episode (in order to save memory and network traffic), we
130
+ # also need to perform the same procedure on the Learner end (see below
131
+ # where we set up the Learner pipeline).
132
+ return FrameStackingEnvToModule(
133
+ num_frames=args.num_frames,
134
+ multi_agent=args.num_agents > 0,
135
+ )
136
+
137
+ def _make_learner_connector(input_observation_space, input_action_space):
138
+ # Create the learner connector.
139
+ return FrameStackingLearner(
140
+ num_frames=args.num_frames,
141
+ multi_agent=args.num_agents > 0,
142
+ )
143
+
144
+ # Create a custom Atari setup (w/o the usual RLlib-hard-coded framestacking in it).
145
+ # We would like our frame stacking connector to do this job.
146
+ def _env_creator(cfg):
147
+ return wrap_atari_for_new_api_stack(
148
+ gym.make(args.env, **cfg, **{"render_mode": "rgb_array"}),
149
+ # Perform framestacking either through ConnectorV2 or right here through
150
+ # the observation wrapper.
151
+ framestack=(
152
+ args.num_frames if args.use_gym_wrapper_framestacking else None
153
+ ),
154
+ )
155
+
156
+ if args.num_agents > 0:
157
+ tune.register_env(
158
+ "atari-env",
159
+ lambda cfg: make_multi_agent(_env_creator)(
160
+ dict(cfg, **{"num_agents": args.num_agents})
161
+ ),
162
+ )
163
+ else:
164
+ tune.register_env("atari-env", _env_creator)
165
+
166
+ base_config = (
167
+ get_trainable_cls(args.algo)
168
+ .get_default_config()
169
+ .environment(
170
+ "atari-env",
171
+ env_config={
172
+ # Make analogous to old v4 + NoFrameskip.
173
+ "frameskip": 1,
174
+ "full_action_space": False,
175
+ "repeat_action_probability": 0.0,
176
+ },
177
+ clip_rewards=True,
178
+ )
179
+ .env_runners(
180
+ # ... new EnvRunner and our frame stacking env-to-module connector.
181
+ env_to_module_connector=(
182
+ None
183
+ if args.use_gym_wrapper_framestacking
184
+ else _make_env_to_module_connector
185
+ ),
186
+ num_envs_per_env_runner=1 if args.num_agents > 0 else 2,
187
+ )
188
+ .training(
189
+ # Use our frame stacking learner connector.
190
+ learner_connector=(
191
+ None if args.use_gym_wrapper_framestacking else _make_learner_connector
192
+ ),
193
+ entropy_coeff=0.01,
194
+ # Linearly adjust learning rate based on number of GPUs.
195
+ lr=0.00015 * (args.num_learners or 1),
196
+ grad_clip=100.0,
197
+ grad_clip_by="global_norm",
198
+ )
199
+ .rl_module(
200
+ model_config=DefaultModelConfig(
201
+ vf_share_layers=True,
202
+ conv_filters=[(16, 4, 2), (32, 4, 2), (64, 4, 2), (128, 4, 2)],
203
+ conv_activation="relu",
204
+ head_fcnet_hiddens=[256],
205
+ ),
206
+ )
207
+ )
208
+
209
+ # PPO specific settings.
210
+ if args.algo == "PPO":
211
+ base_config.training(
212
+ num_epochs=10,
213
+ minibatch_size=64,
214
+ lambda_=0.95,
215
+ kl_coeff=0.5,
216
+ clip_param=0.1,
217
+ vf_clip_param=10.0,
218
+ )
219
+
220
+ # Add a simple multi-agent setup.
221
+ if args.num_agents > 0:
222
+ base_config.multi_agent(
223
+ policies={f"p{i}" for i in range(args.num_agents)},
224
+ policy_mapping_fn=lambda aid, *a, **kw: f"p{aid}",
225
+ )
226
+
227
+ # Run everything as configured.
228
+ run_rllib_example_script_experiment(base_config, args)
.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/mean_std_filtering.py ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Example using a ConnectorV2 for processing observations with a mean/std filter.
2
+
3
+ An RLlib Algorithm has 3 distinct connector pipelines:
4
+ - An env-to-module pipeline in an EnvRunner accepting a list of episodes and producing
5
+ a batch for an RLModule to compute actions (`forward_inference()` or
6
+ `forward_exploration()`).
7
+ - A module-to-env pipeline in an EnvRunner taking the RLModule's output and converting
8
+ it into an action readable by the environment.
9
+ - A learner connector pipeline on a Learner taking a list of episodes and producing
10
+ a batch for an RLModule to perform the training forward pass (`forward_train()`).
11
+
12
+ Each of these pipelines has a fixed set of default ConnectorV2 pieces that RLlib
13
+ adds/prepends to these pipelines in order to perform the most basic functionalities.
14
+ For example, RLlib adds the `AddObservationsFromEpisodesToBatch` ConnectorV2 into any
15
+ env-to-module pipeline to make sure the batch for computing actions contains - at the
16
+ minimum - the most recent observation.
17
+
18
+ On top of these default ConnectorV2 pieces, users can define their own ConnectorV2
19
+ pieces (or use the ones available already in RLlib) and add them to one of the 3
20
+ different pipelines described above, as required.
21
+
22
+ This example:
23
+ - shows how the `MeanStdFilter` ConnectorV2 piece can be added to the env-to-module
24
+ pipeline.
25
+ - demonstrates that using such a filter enhances learning behavior (or even makes
26
+ if possible to learn overall) in some environments, especially those with lopsided
27
+ observation spaces, for example `Box(-3000, -1000, ...)`.
28
+
29
+
30
+ How to run this script
31
+ ----------------------
32
+ `python [script file name].py --enable-new-api-stack`
33
+
34
+ For debugging, use the following additional command line options
35
+ `--no-tune --num-env-runners=0`
36
+ which should allow you to set breakpoints anywhere in the RLlib code and
37
+ have the execution stop there for inspection and debugging.
38
+
39
+ For logging to your WandB account, use:
40
+ `--wandb-key=[your WandB API key] --wandb-project=[some project name]
41
+ --wandb-run-name=[optional: WandB run name (within the defined project)]`
42
+
43
+
44
+ Results to expect
45
+ -----------------
46
+ Running this example with the mean-std filter results in the normally expected Pendulum
47
+ learning behavior:
48
+ +-------------------------------+------------+-----------------+--------+
49
+ | Trial name | status | loc | iter |
50
+ | | | | |
51
+ |-------------------------------+------------+-----------------+--------+
52
+ | PPO_lopsided-pend_f9c96_00000 | TERMINATED | 127.0.0.1:43612 | 77 |
53
+ +-------------------------------+------------+-----------------+--------+
54
+ +------------------+------------------------+-----------------------+
55
+ | total time (s) | num_env_steps_sample | episode_return_mean |
56
+ | | d_lifetime | |
57
+ |------------------+------------------------+-----------------------|
58
+ | 30.7466 | 40040 | -276.3 |
59
+ +------------------+------------------------+-----------------------+
60
+
61
+ If you try using the `--disable-mean-std-filter` (all other things being equal), you
62
+ will either see no learning progress at all (or a very slow one), but more likely some
63
+ numerical instability related error will be thrown:
64
+
65
+ ValueError: Expected parameter loc (Tensor of shape (64, 1)) of distribution
66
+ Normal(loc: torch.Size([64, 1]), scale: torch.Size([64, 1])) to satisfy the
67
+ constraint Real(), but found invalid values:
68
+ tensor([[nan],
69
+ [nan],
70
+ [nan],
71
+ ...
72
+ """
73
+ import gymnasium as gym
74
+ import numpy as np
75
+
76
+ from ray.rllib.connectors.env_to_module.mean_std_filter import MeanStdFilter
77
+ from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig
78
+ from ray.rllib.examples.envs.classes.multi_agent import MultiAgentPendulum
79
+ from ray.rllib.utils.framework import try_import_torch
80
+ from ray.rllib.utils.test_utils import (
81
+ add_rllib_example_script_args,
82
+ run_rllib_example_script_experiment,
83
+ )
84
+ from ray.tune.registry import get_trainable_cls, register_env
85
+
86
+ torch, _ = try_import_torch()
87
+
88
+ parser = add_rllib_example_script_args(
89
+ default_iters=500,
90
+ default_timesteps=500000,
91
+ default_reward=-300.0,
92
+ )
93
+ parser.add_argument(
94
+ "--disable-mean-std-filter",
95
+ action="store_true",
96
+ help="Run w/o a mean/std env-to-module connector piece (filter).",
97
+ )
98
+
99
+
100
+ class LopsidedObs(gym.ObservationWrapper):
101
+ def __init__(self, env):
102
+ super().__init__(env)
103
+ self.observation_space = gym.spaces.Box(-4000.0, -1456.0, (3,), np.float32)
104
+
105
+ def observation(self, observation):
106
+ # Lopside [-1.0, 1.0] Pendulum observations
107
+ return ((observation + 1.0) / 2.0) * (4000.0 - 1456.0) - 4000.0
108
+
109
+
110
+ if __name__ == "__main__":
111
+ args = parser.parse_args()
112
+
113
+ assert (
114
+ args.enable_new_api_stack
115
+ ), "Must set --enable-new-api-stack when running this script!"
116
+
117
+ # Register our environment with tune.
118
+ if args.num_agents > 0:
119
+ register_env(
120
+ "lopsided-pend",
121
+ lambda _: MultiAgentPendulum(config={"num_agents": args.num_agents}),
122
+ )
123
+ else:
124
+ register_env("lopsided-pend", lambda _: LopsidedObs(gym.make("Pendulum-v1")))
125
+
126
+ base_config = (
127
+ get_trainable_cls(args.algo)
128
+ .get_default_config()
129
+ .environment("lopsided-pend")
130
+ .env_runners(
131
+ # TODO (sven): MAEnvRunner does not support vectorized envs yet
132
+ # due to gym's env checkers and non-compatability with RLlib's
133
+ # MultiAgentEnv API.
134
+ num_envs_per_env_runner=1 if args.num_agents > 0 else 20,
135
+ # Define a single connector piece to be prepended to the env-to-module
136
+ # connector pipeline.
137
+ # Alternatively, return a list of n ConnectorV2 pieces (which will then be
138
+ # included in an automatically generated EnvToModulePipeline or return a
139
+ # EnvToModulePipeline directly.
140
+ env_to_module_connector=(
141
+ None
142
+ if args.disable_mean_std_filter
143
+ else lambda env: MeanStdFilter(multi_agent=args.num_agents > 0)
144
+ ),
145
+ )
146
+ .training(
147
+ train_batch_size_per_learner=512,
148
+ gamma=0.95,
149
+ # Linearly adjust learning rate based on number of GPUs.
150
+ lr=0.0003 * (args.num_learners or 1),
151
+ vf_loss_coeff=0.01,
152
+ )
153
+ .rl_module(
154
+ model_config=DefaultModelConfig(
155
+ fcnet_activation="relu",
156
+ fcnet_kernel_initializer=torch.nn.init.xavier_uniform_,
157
+ fcnet_bias_initializer=torch.nn.init.constant_,
158
+ fcnet_bias_initializer_kwargs={"val": 0.0},
159
+ ),
160
+ )
161
+ # In case you would like to run with a evaluation EnvRunners, make sure your
162
+ # `evaluation_config` key contains the `use_worker_filter_stats=False` setting
163
+ # (see below). This setting makes sure that the mean/std stats collected by the
164
+ # evaluation EnvRunners are NOT used for the training EnvRunners (unless you
165
+ # really want to mix these stats). It's normally a good idea to keep the stats
166
+ # collected during evaluation completely out of the training data (already for
167
+ # better reproducibility alone).
168
+ # .evaluation(
169
+ # evaluation_num_env_runners=1,
170
+ # evaluation_interval=1,
171
+ # evaluation_config={
172
+ # "explore": False,
173
+ # # Do NOT use the eval EnvRunners' ConnectorV2 states. Instead, before
174
+ # # each round of evaluation, broadcast the latest training
175
+ # # EnvRunnerGroup's ConnectorV2 states (merged from all training remote
176
+ # # EnvRunners) to the eval EnvRunnerGroup (and discard the eval
177
+ # # EnvRunners' stats).
178
+ # "use_worker_filter_stats": False,
179
+ # },
180
+ # )
181
+ )
182
+
183
+ # PPO specific settings.
184
+ if args.algo == "PPO":
185
+ base_config.training(
186
+ minibatch_size=64,
187
+ lambda_=0.1,
188
+ vf_clip_param=10.0,
189
+ )
190
+
191
+ # Add a simple multi-agent setup.
192
+ if args.num_agents > 0:
193
+ base_config.multi_agent(
194
+ policies={f"p{i}" for i in range(args.num_agents)},
195
+ policy_mapping_fn=lambda aid, *a, **kw: f"p{aid}",
196
+ )
197
+
198
+ run_rllib_example_script_experiment(base_config, args)
.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/prev_actions_prev_rewards.py ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Example using a ConnectorV2 to add previous rewards/actions to an RLModule's input.
2
+
3
+ An RLlib Algorithm has 3 distinct connector pipelines:
4
+ - An env-to-module pipeline in an EnvRunner accepting a list of episodes and producing
5
+ a batch for an RLModule to compute actions (`forward_inference()` or
6
+ `forward_exploration()`).
7
+ - A module-to-env pipeline in an EnvRunner taking the RLModule's output and converting
8
+ it into an action readable by the environment.
9
+ - A learner connector pipeline on a Learner taking a list of episodes and producing
10
+ a batch for an RLModule to perform the training forward pass (`forward_train()`).
11
+
12
+ Each of these pipelines has a fixed set of default ConnectorV2 pieces that RLlib
13
+ adds/prepends to these pipelines in order to perform the most basic functionalities.
14
+ For example, RLlib adds the `AddObservationsFromEpisodesToBatch` ConnectorV2 into any
15
+ env-to-module pipeline to make sure the batch for computing actions contains - at the
16
+ minimum - the most recent observation.
17
+
18
+ On top of these default ConnectorV2 pieces, users can define their own ConnectorV2
19
+ pieces (or use the ones available already in RLlib) and add them to one of the 3
20
+ different pipelines described above, as required.
21
+
22
+ This example:
23
+ - shows how the `PrevActionsPrevRewards` ConnectorV2 piece can be added to the
24
+ env-to-module pipeline to extract previous rewards and/or actions from the ongoing
25
+ episodes.
26
+ - shows how this connector creates and wraps this new information (rewards and
27
+ actions) together with the original observations into the RLModule's input dict
28
+ under a new `gym.spaces.Dict` structure (for example, if your observation space
29
+ is `O=Box(shape=(3,))` and you add the most recent 1 reward, the new observation
30
+ space will be `Dict({"_original_obs": O, "prev_n_rewards": Box(shape=())})`.
31
+ - demonstrates how to use RLlib's `FlattenObservations` right after the
32
+ `PrevActionsPrevRewards` to flatten that new dict observation structure again into
33
+ a single 1D tensor.
34
+ - uses the StatelessCartPole environment, a CartPole-v1 derivative that's missing
35
+ both x-veloc and angle-veloc observation components and is therefore non-Markovian
36
+ (only partially observable). An LSTM default model is used for training. Adding
37
+ the additional context to the observations (for example, prev. actions) helps the
38
+ LSTM to more quickly learn in this environment.
39
+
40
+
41
+ How to run this script
42
+ ----------------------
43
+ `python [script file name].py --enable-new-api-stack --num-frames=4 --env=ALE/Pong-v5`
44
+
45
+ Use the `--num-frames` option to define the number of observations to framestack.
46
+ If you don't want to use Connectors to perform the framestacking, set the
47
+ `--use-gym-wrapper-framestacking` flag to perform framestacking already inside a
48
+ gymnasium observation wrapper. In this case though, be aware that the tensors being
49
+ sent through the network are `--num-frames` x larger than if you use the Connector
50
+ setup.
51
+
52
+ For debugging, use the following additional command line options
53
+ `--no-tune --num-env-runners=0`
54
+ which should allow you to set breakpoints anywhere in the RLlib code and
55
+ have the execution stop there for inspection and debugging.
56
+
57
+ For logging to your WandB account, use:
58
+ `--wandb-key=[your WandB API key] --wandb-project=[some project name]
59
+ --wandb-run-name=[optional: WandB run name (within the defined project)]`
60
+
61
+
62
+ Results to expect
63
+ -----------------
64
+
65
+ You should see something similar to this in your terminal output when running
66
+ ths script as described above:
67
+
68
+ +---------------------+------------+-----------------+--------+------------------+
69
+ | Trial name | status | loc | iter | total time (s) |
70
+ | | | | | |
71
+ |---------------------+------------+-----------------+--------+------------------+
72
+ | PPO_env_0edd2_00000 | TERMINATED | 127.0.0.1:12632 | 17 | 42.6898 |
73
+ +---------------------+------------+-----------------+--------+------------------+
74
+ +------------------------+------------------------+------------------------+
75
+ | num_env_steps_sample | num_env_steps_traine | episode_return_mean |
76
+ | d_lifetime | d_lifetime | |
77
+ |------------------------+------------------------+------------------------|
78
+ | 68000 | 68000 | 205.22 |
79
+ +------------------------+------------------------+------------------------+
80
+ """
81
+ from ray.rllib.algorithms.ppo import PPOConfig
82
+ from ray.rllib.connectors.env_to_module import (
83
+ FlattenObservations,
84
+ PrevActionsPrevRewards,
85
+ )
86
+ from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig
87
+ from ray.rllib.examples.envs.classes.stateless_cartpole import StatelessCartPole
88
+ from ray.rllib.examples.envs.classes.multi_agent import MultiAgentStatelessCartPole
89
+ from ray.rllib.utils.framework import try_import_torch
90
+ from ray.rllib.utils.test_utils import (
91
+ add_rllib_example_script_args,
92
+ run_rllib_example_script_experiment,
93
+ )
94
+ from ray.tune import register_env
95
+
96
+ torch, nn = try_import_torch()
97
+
98
+
99
+ parser = add_rllib_example_script_args(
100
+ default_reward=200.0, default_timesteps=1000000, default_iters=2000
101
+ )
102
+ parser.set_defaults(enable_new_api_stack=True)
103
+ parser.add_argument("--n-prev-rewards", type=int, default=1)
104
+ parser.add_argument("--n-prev-actions", type=int, default=1)
105
+
106
+
107
+ if __name__ == "__main__":
108
+ args = parser.parse_args()
109
+
110
+ # Define our custom connector pipelines.
111
+ def _env_to_module(env):
112
+ # Create the env-to-module connector pipeline.
113
+ return [
114
+ PrevActionsPrevRewards(
115
+ multi_agent=args.num_agents > 0,
116
+ n_prev_rewards=args.n_prev_rewards,
117
+ n_prev_actions=args.n_prev_actions,
118
+ ),
119
+ FlattenObservations(multi_agent=args.num_agents > 0),
120
+ ]
121
+
122
+ # Register our environment with tune.
123
+ if args.num_agents > 0:
124
+ register_env(
125
+ "env",
126
+ lambda _: MultiAgentStatelessCartPole(
127
+ config={"num_agents": args.num_agents}
128
+ ),
129
+ )
130
+ else:
131
+ register_env("env", lambda _: StatelessCartPole())
132
+
133
+ config = (
134
+ PPOConfig()
135
+ .environment("env")
136
+ .env_runners(env_to_module_connector=_env_to_module)
137
+ .training(
138
+ num_epochs=6,
139
+ lr=0.0003,
140
+ train_batch_size=4000,
141
+ vf_loss_coeff=0.01,
142
+ )
143
+ .rl_module(
144
+ model_config=DefaultModelConfig(
145
+ use_lstm=True,
146
+ max_seq_len=20,
147
+ fcnet_hiddens=[32],
148
+ fcnet_activation="linear",
149
+ fcnet_kernel_initializer=nn.init.xavier_uniform_,
150
+ fcnet_bias_initializer=nn.init.constant_,
151
+ fcnet_bias_initializer_kwargs={"val": 0.0},
152
+ vf_share_layers=True,
153
+ ),
154
+ )
155
+ )
156
+
157
+ # Add a simple multi-agent setup.
158
+ if args.num_agents > 0:
159
+ config = config.multi_agent(
160
+ policies={f"p{i}" for i in range(args.num_agents)},
161
+ policy_mapping_fn=lambda aid, *a, **kw: f"p{aid}",
162
+ )
163
+
164
+ run_rllib_example_script_experiment(config, args)
.venv/lib/python3.11/site-packages/ray/rllib/examples/evaluation/custom_evaluation.py ADDED
@@ -0,0 +1,230 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Example of customizing the evaluation procedure for an RLlib Algorithm.
2
+
3
+ Note, that you should only choose to provide a custom eval function, in case the already
4
+ built-in eval options are not sufficient. Normally, though, RLlib's eval utilities
5
+ that come with each Algorithm are enough to properly evaluate the learning progress
6
+ of your Algorithm.
7
+
8
+ This script uses the SimpleCorridor environment, a simple 1D gridworld, in which
9
+ the agent can only walk left (action=0) or right (action=1). The goal state is located
10
+ at the end of the (1D) corridor. The env exposes an API to change the length of the
11
+ corridor on-the-fly. We use this API here to extend the size of the corridor for the
12
+ evaluation runs.
13
+
14
+ For demonstration purposes only, we define a simple custom evaluation method that does
15
+ the following:
16
+ - It changes the corridor length of all environments used on the evaluation EnvRunners.
17
+ - It runs a defined number of episodes for evaluation purposes.
18
+ - It collects the metrics from those runs, summarizes these metrics and returns them.
19
+
20
+
21
+ How to run this script
22
+ ----------------------
23
+ `python [script file name].py --enable-new-api-stack
24
+
25
+ You can switch off custom evaluation (and use RLlib's default evaluation procedure)
26
+ with the `--no-custom-eval` flag.
27
+
28
+ You can switch on parallel evaluation to training using the
29
+ `--evaluation-parallel-to-training` flag. See this example script here:
30
+ https://github.com/ray-project/ray/blob/master/rllib/examples/evaluation/evaluation_parallel_to_training.py # noqa
31
+ for more details on running evaluation parallel to training.
32
+
33
+ For debugging, use the following additional command line options
34
+ `--no-tune --num-env-runners=0`
35
+ which should allow you to set breakpoints anywhere in the RLlib code and
36
+ have the execution stop there for inspection and debugging.
37
+
38
+ For logging to your WandB account, use:
39
+ `--wandb-key=[your WandB API key] --wandb-project=[some project name]
40
+ --wandb-run-name=[optional: WandB run name (within the defined project)]`
41
+
42
+
43
+ Results to expect
44
+ -----------------
45
+ You should see the following (or very similar) console output when running this script.
46
+ Note that for each iteration, due to the definition of our custom evaluation function,
47
+ we run 3 evaluation rounds per single training round.
48
+
49
+ ...
50
+ Training iteration 1 -> evaluation round 0
51
+ Training iteration 1 -> evaluation round 1
52
+ Training iteration 1 -> evaluation round 2
53
+ ...
54
+ ...
55
+ +--------------------------------+------------+-----------------+--------+
56
+ | Trial name | status | loc | iter |
57
+ |--------------------------------+------------+-----------------+--------+
58
+ | PPO_SimpleCorridor_06582_00000 | TERMINATED | 127.0.0.1:69905 | 4 |
59
+ +--------------------------------+------------+-----------------+--------+
60
+ +------------------+-------+----------+--------------------+
61
+ | total time (s) | ts | reward | episode_len_mean |
62
+ |------------------+-------+----------+--------------------|
63
+ | 26.1973 | 16000 | 0.872034 | 13.7966 |
64
+ +------------------+-------+----------+--------------------+
65
+ """
66
+ from typing import Tuple
67
+
68
+ from ray.air.constants import TRAINING_ITERATION
69
+ from ray.rllib.algorithms.algorithm import Algorithm
70
+ from ray.rllib.algorithms.algorithm_config import AlgorithmConfig
71
+ from ray.rllib.env.env_runner_group import EnvRunnerGroup
72
+ from ray.rllib.examples.envs.classes.simple_corridor import SimpleCorridor
73
+ from ray.rllib.utils.metrics import (
74
+ ENV_RUNNER_RESULTS,
75
+ EVALUATION_RESULTS,
76
+ EPISODE_RETURN_MEAN,
77
+ NUM_ENV_STEPS_SAMPLED_LIFETIME,
78
+ )
79
+ from ray.rllib.utils.test_utils import (
80
+ add_rllib_example_script_args,
81
+ run_rllib_example_script_experiment,
82
+ )
83
+ from ray.rllib.utils.typing import ResultDict
84
+ from ray.tune.registry import get_trainable_cls
85
+
86
+
87
# Command line parser with this script's default stopping criteria.
parser = add_rllib_example_script_args(
    default_iters=50, default_reward=0.7, default_timesteps=50000
)
# Switch off the custom eval function and use RLlib's built-in evaluation.
parser.add_argument("--no-custom-eval", action="store_true")
# Corridor length used by the (training) EnvRunners.
parser.add_argument("--corridor-length-training", type=int, default=10)
# Corridor lengths used by eval EnvRunners 1 and 2, respectively.
parser.add_argument("--corridor-length-eval-worker-1", type=int, default=20)
parser.add_argument("--corridor-length-eval-worker-2", type=int, default=30)
95
+
96
def custom_eval_function(
    algorithm: Algorithm,
    eval_workers: EnvRunnerGroup,
) -> Tuple[ResultDict, int, int]:
    """Example of a custom evaluation function.

    Args:
        algorithm: Algorithm class to evaluate.
        eval_workers: Evaluation EnvRunnerGroup.

    Returns:
        A tuple of (evaluation metrics dict, number of env steps sampled
        during evaluation, number of agent steps sampled during evaluation).
    """

    # Set different env settings for each (eval) EnvRunner. Here we use the
    # EnvRunner's `worker_index` property to pick the corridor length.
    def _set_corridor_lengths(worker):
        length = (
            args.corridor_length_eval_worker_1
            if worker.worker_index == 1
            else args.corridor_length_eval_worker_2
        )
        # BUGFIX: This used to be a generator expression returned from a
        # lambda; a generator is lazy and was never iterated, so
        # `set_corridor_length` never actually ran. Use an explicit loop over
        # all sub-envs (gym.Env) so the side effect really happens.
        for env in worker.env.unwrapped.envs:
            env.unwrapped.set_corridor_length(length)

    eval_workers.foreach_env_runner(func=_set_corridor_lengths)

    # Collect metrics results collected by eval workers in this list for later
    # processing.
    env_runner_metrics = []
    sampled_episodes = []
    # For demonstration purposes, run through some number of evaluation
    # rounds within this one call. Note that this function is called once per
    # training iteration (`Algorithm.train()` call) OR once per
    # `Algorithm.evaluate()` (which can be called manually by the user).
    for i in range(3):
        print(f"Training iteration {algorithm.iteration} -> evaluation round {i}")
        # Sample episodes from the EnvRunners AND have them also return the
        # metrics they collected while sampling (both are used below).
        episodes_and_metrics_all_env_runners = eval_workers.foreach_env_runner(
            func=lambda worker: (worker.sample(), worker.get_metrics()),
            local_env_runner=False,
        )
        sampled_episodes.extend(
            eps
            for eps_and_mtrcs in episodes_and_metrics_all_env_runners
            for eps in eps_and_mtrcs[0]
        )
        env_runner_metrics.extend(
            eps_and_mtrcs[1] for eps_and_mtrcs in episodes_and_metrics_all_env_runners
        )

    # You can compute metrics from the episodes manually, or use the
    # Algorithm's convenient MetricsLogger to store all evaluation metrics
    # inside the main algo.
    algorithm.metrics.merge_and_log_n_dicts(
        env_runner_metrics, key=(EVALUATION_RESULTS, ENV_RUNNER_RESULTS)
    )
    eval_results = algorithm.metrics.reduce(
        key=(EVALUATION_RESULTS, ENV_RUNNER_RESULTS)
    )
    # Alternatively, you could manually reduce over the n returned
    # `env_runner_metrics` dicts, but this would be much harder as you might
    # not know which metrics to sum up, which ones to average over, etc..

    # Compute env and agent steps from the sampled episodes.
    env_steps = sum(eps.env_steps() for eps in sampled_episodes)
    agent_steps = sum(eps.agent_steps() for eps in sampled_episodes)

    return eval_results, env_steps, agent_steps
169
+
170
+
171
if __name__ == "__main__":
    args = parser.parse_args()
    # NOTE: Removed a debugging leftover (`args.local_mode = True`) that
    # unconditionally overrode the user's command-line choice; use the
    # documented `--no-tune --num-env-runners=0` options for debugging instead.

    base_config = (
        get_trainable_cls(args.algo)
        .get_default_config()
        # For training, we use a corridor length of n. For evaluation, we use
        # different values, depending on the eval worker index (1 or 2).
        .environment(
            SimpleCorridor,
            env_config={"corridor_length": args.corridor_length_training},
        )
        .evaluation(
            # Do we use the custom eval function defined above?
            custom_evaluation_function=(
                None if args.no_custom_eval else custom_eval_function
            ),
            # Number of eval EnvRunners to use.
            evaluation_num_env_runners=2,
            # Enable evaluation, once per training iteration.
            evaluation_interval=1,
            # Run 10 episodes each time evaluation runs (OR "auto" if parallel
            # to training).
            evaluation_duration="auto" if args.evaluation_parallel_to_training else 10,
            # Evaluate in parallel to training?
            evaluation_parallel_to_training=args.evaluation_parallel_to_training,
            # Override the env settings for the eval workers.
            # Note, though, that this setting here is only used in case
            # --no-custom-eval is set, b/c in case the custom eval function IS
            # used, we override the length of the eval environments in that
            # custom function, so this setting here is simply ignored.
            evaluation_config=AlgorithmConfig.overrides(
                env_config={"corridor_length": args.corridor_length_training * 2},
                # TODO (sven): Add support for window=float(inf) and reduce=mean
                #  for evaluation episode_return_mean reductions (identical to
                #  old stack behavior, which does NOT use a window (100 by
                #  default) to reduce eval episode returns.
                metrics_num_episodes_for_smoothing=5,
            ),
        )
    )

    stop = {
        TRAINING_ITERATION: args.stop_iters,
        f"{EVALUATION_RESULTS}/{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": (
            args.stop_reward
        ),
        NUM_ENV_STEPS_SAMPLED_LIFETIME: args.stop_timesteps,
    }

    run_rllib_example_script_experiment(
        base_config,
        args,
        stop=stop,
        success_metric={
            f"{EVALUATION_RESULTS}/{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": (
                args.stop_reward
            ),
        },
    )
.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (196 Bytes). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/__pycache__/float16_training_and_inference.cpython-311.pyc ADDED
Binary file (12.6 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/__pycache__/fractional_gpus_per_learner.cpython-311.pyc ADDED
Binary file (5.57 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/__pycache__/gpus_on_env_runners.cpython-311.pyc ADDED
Binary file (3.66 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/__pycache__/mixed_precision_training_float16_inference.cpython-311.pyc ADDED
Binary file (8.71 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/float16_training_and_inference.py ADDED
@@ -0,0 +1,250 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Example of using float16 precision for training and inference.
2
+
3
+ This example:
4
+ - shows how to write a custom callback for RLlib to convert all RLModules
5
+ (on the EnvRunners and Learners) to float16 precision.
6
+ - shows how to write a custom env-to-module ConnectorV2 piece to convert all
7
+ observations and rewards in the collected trajectories to float16 (numpy) arrays.
8
+ - shows how to write a custom grad scaler for torch that is necessary to stabilize
9
+ learning with float16 weight matrices and gradients. This custom scaler behaves
10
+ exactly like the torch built-in `torch.amp.GradScaler` but also works for float16
11
+ gradients (which the torch built-in one doesn't).
12
+ - shows how to write a custom TorchLearner to change the epsilon setting (to the
13
+ much larger 1e-4 to stabilize learning) on the default optimizer (Adam) registered
14
+ for each RLModule.
15
+ - demonstrates how to plug in all the above custom components into an
16
+ `AlgorithmConfig` instance and start training (and inference) with float16
17
+ precision.
18
+
19
+
20
+ How to run this script
21
+ ----------------------
22
+ `python [script file name].py --enable-new-api-stack
23
+
24
+ For debugging, use the following additional command line options
25
+ `--no-tune --num-env-runners=0`
26
+ which should allow you to set breakpoints anywhere in the RLlib code and
27
+ have the execution stop there for inspection and debugging.
28
+
29
+ For logging to your WandB account, use:
30
+ `--wandb-key=[your WandB API key] --wandb-project=[some project name]
31
+ --wandb-run-name=[optional: WandB run name (within the defined project)]`
32
+
33
+ You can visualize experiment results in ~/ray_results using TensorBoard.
34
+
35
+
36
+ Results to expect
37
+ -----------------
38
+ You should see something similar to the following on your terminal, when running this
39
+ script with the above recommended options:
40
+
41
+ +-----------------------------+------------+-----------------+--------+
42
+ | Trial name | status | loc | iter |
43
+ | | | | |
44
+ |-----------------------------+------------+-----------------+--------+
45
+ | PPO_CartPole-v1_437ee_00000 | TERMINATED | 127.0.0.1:81045 | 6 |
46
+ +-----------------------------+------------+-----------------+--------+
47
+ +------------------+------------------------+------------------------+
48
+ | total time (s) | episode_return_mean | num_episodes_lifetime |
49
+ | | | |
50
+ |------------------+------------------------+------------------------+
51
+ | 71.3123 | 153.79 | 358 |
52
+ +------------------+------------------------+------------------------+
53
+ """
54
+ import gymnasium as gym
55
+ import numpy as np
56
+ import torch
57
+
58
+ from ray.rllib.algorithms.algorithm import Algorithm
59
+ from ray.rllib.algorithms.ppo.torch.ppo_torch_learner import PPOTorchLearner
60
+ from ray.rllib.connectors.connector_v2 import ConnectorV2
61
+ from ray.rllib.core.learner.torch.torch_learner import TorchLearner
62
+ from ray.rllib.utils.annotations import override
63
+ from ray.rllib.utils.test_utils import (
64
+ add_rllib_example_script_args,
65
+ run_rllib_example_script_experiment,
66
+ )
67
+ from ray.tune.registry import get_trainable_cls
68
+
69
# Command line parser with this script's default stopping criteria; the new
# API stack is switched on by default.
parser = add_rllib_example_script_args(
    default_iters=50, default_reward=150.0, default_timesteps=100000
)
parser.set_defaults(
    enable_new_api_stack=True,
)
75
+
76
+
77
def on_algorithm_init(
    algorithm: Algorithm,
    **kwargs,
) -> None:
    """Callback that converts every RLModule in the algo to float16.

    Runs `half()` on the RLModules held by all Learners, all training
    EnvRunners, and - if configured - all evaluation EnvRunners.
    """
    # Convert every RLModule inside every Learner to float16.
    algorithm.learner_group.foreach_learner(
        lambda _learner: _learner.module.foreach_module(
            lambda _mid, _mod: _mod.half()
        )
    )
    # Convert the (single) RLModule on each training EnvRunner.
    algorithm.env_runner_group.foreach_env_runner(
        lambda _env_runner: _env_runner.module.half()
    )
    # Same for the evaluation EnvRunners, if there are any.
    if algorithm.eval_env_runner_group:
        algorithm.eval_env_runner_group.foreach_env_runner(
            lambda _env_runner: _env_runner.module.half()
        )
95
+
96
+
97
class WriteObsAndRewardsAsFloat16(ConnectorV2):
    """ConnectorV2 piece that casts observations and rewards to float16.

    The cast is written back into the episodes themselves (permanently), so
    downstream pieces see float16 data. Note that a gymnasium.Wrapper around
    the env could achieve the same effect.
    """

    def recompute_output_observation_space(
        self,
        input_observation_space,
        input_action_space,
    ):
        # Same Box as the input space, but with float16 bounds and dtype.
        low = input_observation_space.low.astype(np.float16)
        high = input_observation_space.high.astype(np.float16)
        return gym.spaces.Box(low, high, input_observation_space.shape, np.float16)

    def __call__(self, *, rl_module, batch, episodes, **kwargs):
        # Walk all single-agent episodes and cast their most recent
        # observation (and reward, if one exists) to float16, in place.
        for episode in self.single_agent_episode_iterator(episodes):
            last_obs = episode.get_observations(-1)
            episode.set_observations(
                new_data=last_obs.astype(np.float16), at_indices=-1
            )
            # Right after a reset, there is no reward yet to convert.
            if len(episode) > 0:
                last_rew = episode.get_rewards(-1).astype(np.float16)
                episode.set_rewards(new_data=last_rew, at_indices=-1)
        return batch
125
+
126
+
127
class Float16GradScaler:
    """Custom grad scaler for `TorchLearner`.

    Implements the loss-/gradient-scaling protocol that `TorchLearner`'s
    experimental scaler support expects (analogous to `torch.amp.GradScaler`),
    but also works for float16 gradients (which the torch built-in one
    doesn't).

    TorchLearner performs the following steps using this class (`scaler`):
    - loss_per_module = TorchLearner.compute_losses()
    - for L in loss_per_module: L = scaler.scale(L)
    - grads = TorchLearner.compute_gradients()  # L.backward() on scaled loss
    - TorchLearner.apply_gradients(grads):
        for optim in optimizers:
            scaler.step(optim)  # <- grads should get unscaled
        scaler.update()  # <- update scaling factor
    """

    def __init__(
        self,
        init_scale=1000.0,
        growth_factor=2.0,
        backoff_factor=0.5,
        growth_interval=2000,
    ):
        # Current loss-scaling factor.
        self._scale = init_scale
        # Factor applied to the scale after `growth_interval` clean updates.
        self.growth_factor = growth_factor
        # Factor applied to the scale whenever an inf/NaN grad is found.
        self.backoff_factor = backoff_factor
        self.growth_interval = growth_interval
        # Whether any inf/NaN gradient was seen since the last `update()`.
        self._found_inf_or_nan = False
        # Number of consecutive clean `update()` calls so far.
        self.steps_since_growth = 0

    def scale(self, loss):
        """Returns the given loss multiplied by the current scaling factor."""
        return loss * self._scale

    def get_scale(self):
        """Returns the current scaling factor."""
        return self._scale

    def step(self, optimizer):
        """Unscales all grads of `optimizer`, then steps it if they are finite."""
        for param_group in optimizer.param_groups:
            for param in param_group["params"]:
                if param.grad is None:
                    continue
                # Undo the loss scaling on this gradient.
                param.grad.data.div_(self._scale)
                # `isfinite` is False exactly for inf and NaN entries.
                if not torch.isfinite(param.grad).all():
                    self._found_inf_or_nan = True
                    break
            if self._found_inf_or_nan:
                break
        # Only apply the optimizer step if no inf/NaN grad was found.
        if not self._found_inf_or_nan:
            optimizer.step()

    def update(self):
        """Adjusts the scaling factor based on whether inf/NaN grads occurred."""
        if self._found_inf_or_nan:
            # Back off: shrink the scale and restart the growth counter.
            self._scale *= self.backoff_factor
            self.steps_since_growth = 0
        else:
            # Grow the scale after `growth_interval` consecutive clean updates.
            self.steps_since_growth += 1
            if self.steps_since_growth >= self.growth_interval:
                self._scale *= self.growth_factor
                self.steps_since_growth = 0
        # Reset the inf/NaN flag for the next round.
        self._found_inf_or_nan = False
192
+
193
+
194
class LargeEpsAdamTorchLearner(PPOTorchLearner):
    """A TorchLearner that overrides the default Adam optimizer's epsilon.

    Uses a much larger eps (1e-4) than torch's default, which improves
    numerical stability when training with float16 weights.
    """

    @override(TorchLearner)
    def configure_optimizers_for_module(self, module_id, config):
        """Registers an Adam optimizer with a large epsilon under `module_id`."""
        module_params = list(self._module[module_id].parameters())

        # Register a single Adam optimizer (under the default optimizer name:
        # DEFAULT_OPTIMIZER) for this `module_id`, using eps=1e-4 for better
        # float16 stability.
        self.register_optimizer(
            module_id=module_id,
            optimizer=torch.optim.Adam(module_params, eps=1e-4),
            params=module_params,
            # Let RLlib handle the learning rate/learning rate schedule.
            # You can leave `lr_or_lr_schedule` at None, but then you should
            # pass a fixed learning rate into the Adam constructor above.
            lr_or_lr_schedule=config.lr,
        )
215
+
216
+
217
if __name__ == "__main__":
    args = parser.parse_args()

    algo_cls = get_trainable_cls(args.algo)
    base_config = (
        algo_cls.get_default_config()
        .environment("CartPole-v1")
        # Convert all RLModules to float16 right after algo initialization
        # (custom on_algorithm_init callback defined above).
        .callbacks(on_algorithm_init=on_algorithm_init)
        # Use our custom loss scaler to stabilize gradient computations (scale
        # the loss, then unscale the gradients before applying them). This
        # hooks into TorchLearner's built-in, experimental grad-scaler support.
        .experimental(_torch_grad_scaler_class=Float16GradScaler)
        # Cast all observations and rewards in the episodes (permanently) to
        # float16 via our custom env-to-module ConnectorV2 piece.
        .env_runners(env_to_module_connector=lambda env: WriteObsAndRewardsAsFloat16())
        .training(
            # Use the custom TorchLearner with the larger, stabilizing epsilon
            # on the Adam optimizer.
            learner_class=LargeEpsAdamTorchLearner,
            # No grad clipping; the custom grad scaler already has built-in
            # inf/nan detection (see `Float16GradScaler.step`).
            grad_clip=None,
            # Typical CartPole-v1 hyperparams known to work well:
            gamma=0.99,
            lr=0.0003,
            num_epochs=6,
            vf_loss_coeff=0.01,
            use_kl_loss=True,
        )
    )

    run_rllib_example_script_experiment(base_config, args)
.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/fractional_gpus_per_learner.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Example of using fractional GPUs (< 1.0) per Learner worker.
2
+
3
+ The number of GPUs required, just for learning (excluding those maybe needed on your
4
+ EnvRunners, if applicable) can be computed by:
5
+ `num_gpus = config.num_learners * config.num_gpus_per_learner`
6
+
7
+ This example:
8
+ - shows how to set up an Algorithm that uses one or more Learner workers ...
9
+ - ... and how to assign a fractional (< 1.0) number of GPUs to each of these Learners.
10
+
11
+
12
+ How to run this script
13
+ ----------------------
14
+ `python [script file name].py --enable-new-api-stack --num-learners=
15
+ [number of Learners, e.g. 1] --num-gpus-per-learner [some fraction <1.0]`
16
+
17
+ The following command line combinations have been tested on a 4 NVIDIA T4 GPUs (16 vCPU)
18
+ machine.
19
+ Note that for each run, 4 tune trials will be setup; see tune.grid_search over 4
20
+ learning rates in the `base_config` below:
21
+ 1) --num-learners=1 --num-gpus-per-learner=0.5 (2.0 GPUs used).
22
+ 2) --num-learners=1 --num-gpus-per-learner=0.3 (1.2 GPUs used).
23
+ 3) --num-learners=1 --num-gpus-per-learner=0.25 (1.0 GPU used).
24
+ 4) --num-learners=2 --num-gpus-per-learner=1 (8 GPUs used).
25
+ 5) non-sensical setting: --num-learners=2 --num-gpus-per-learner=0.5 (expect an
26
+ NCCL-related error due to the fact that torch will try to perform DDP sharding,
27
+ but notices that the shards sit on the same GPU).
28
+
29
+ For debugging, use the following additional command line options
30
+ `--no-tune --num-env-runners=0`
31
+ which should allow you to set breakpoints anywhere in the RLlib code and
32
+ have the execution stop there for inspection and debugging.
33
+
34
+ Note that the shown GPU settings in this script also work in case you are not
35
+ running via tune, but instead are using the `--no-tune` command line option.
36
+
37
+ For logging to your WandB account, use:
38
+ `--wandb-key=[your WandB API key] --wandb-project=[some project name]
39
+ --wandb-run-name=[optional: WandB run name (within the defined project)]`
40
+
41
+ You can visualize experiment results in ~/ray_results using TensorBoard.
42
+
43
+
44
+ Results to expect
45
+ -----------------
46
+ In the console output, you can see that only fractional GPUs are being used by RLlib:
47
+
48
+ == Status ==
49
+ ...
50
+ Logical resource usage: 12.0/16 CPUs, 1.0/4 GPUs (...)
51
+ ...
52
+ Number of trials: 4/4 (4 RUNNING)
53
+
54
+ The final output should look something like this:
55
+ +-----------------------------+------------+-----------------+--------+--------+
56
+ | Trial name | status | loc | lr | iter |
57
+ | | | | | |
58
+ |-----------------------------+------------+-----------------+--------+--------+
59
+ | PPO_CartPole-v1_7104b_00000 | TERMINATED | 10.0.0.39:31197 | 0.005 | 10 |
60
+ | PPO_CartPole-v1_7104b_00001 | TERMINATED | 10.0.0.39:31202 | 0.003 | 11 |
61
+ | PPO_CartPole-v1_7104b_00002 | TERMINATED | 10.0.0.39:31203 | 0.001 | 10 |
62
+ | PPO_CartPole-v1_7104b_00003 | TERMINATED | 10.0.0.39:31204 | 0.0001 | 11 |
63
+ +-----------------------------+------------+-----------------+--------+--------+
64
+
65
+ +----------------+----------------------+----------------------+----------------------+
66
+ | total time (s) | num_env_steps_sample | num_env_steps_traine | num_episodes_lifetim |
67
+ | | d_lifetime | d_lifetime | e |
68
+ |----------------+----------------------+----------------------+----------------------|
69
+ | 101.002 | 40000 | 40000 | 346 |
70
+ | 110.03 | 44000 | 44000 | 395 |
71
+ | 101.171 | 40000 | 40000 | 328 |
72
+ | 110.091 | 44000 | 44000 | 478 |
73
+ +----------------+----------------------+----------------------+----------------------+
74
+ """
75
+ from ray import tune
76
+ from ray.rllib.utils.test_utils import (
77
+ add_rllib_example_script_args,
78
+ run_rllib_example_script_experiment,
79
+ )
80
+ from ray.tune.registry import get_trainable_cls
81
+
82
# Command line parser with this script's default stopping criteria; the new
# API stack and 2 EnvRunners are switched on by default.
parser = add_rllib_example_script_args(
    default_iters=50, default_reward=180, default_timesteps=100000
)
parser.set_defaults(
    enable_new_api_stack=True,
    num_env_runners=2,
)
89
+
90
+
91
if __name__ == "__main__":
    args = parser.parse_args()

    base_config = (
        get_trainable_cls(args.algo)
        .get_default_config()
        # This script only works on the new API stack.
        .api_stack(
            enable_rl_module_and_learner=True,
            enable_env_runner_and_connector_v2=True,
        )
        .environment("CartPole-v1")
        # EnvRunner scaling.
        .env_runners(num_env_runners=args.num_env_runners)
        # Learner scaling: `num_learners` is the number of Learner workers
        # (with more than 1 GPU, set it to the number of GPUs available);
        # `num_gpus_per_learner` may be fractional (< 1.0) to share one GPU.
        .learners(
            num_learners=args.num_learners,
            num_gpus_per_learner=args.num_gpus_per_learner,
        )
        # Grid search over 4 learning rates -> 4 Tune trials altogether.
        .training(lr=tune.grid_search([0.005, 0.003, 0.001, 0.0001]))
    )

    run_rllib_example_script_experiment(base_config, args, keep_config=True)
.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/gpus_on_env_runners.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Example of using GPUs on the EnvRunners (b/c Env and/or RLModule require these).
2
+
3
+ The number of GPUs required, just for your EnvRunners (excluding those needed for
4
+ training your RLModule) can be computed by:
5
+ `num_gpus = config.num_env_runners * config.num_gpus_per_env_runner`
6
+
7
+ This example:
8
+ - shows how to write an Env that uses the GPU.
9
+ - shows how to configure your algorithm such that it allocates any number of GPUs
10
+ (including fractional < 1.0) to each (remote) EnvRunner worker.
11
+
12
+
13
+ How to run this script
14
+ ----------------------
15
+ `python [script file name].py --enable-new-api-stack --num-env_runners=
16
+ [number of EnvRunners, e.g. 2] --num-gpus-per-env-runner [int or some fraction <1.0]`
17
+
18
+ The following command line combinations have been tested on a 4 NVIDIA T4 GPUs (16 vCPU)
19
+ machine.
20
+ TODO (sven): Fix these
21
+ Note that for each run, 4 tune trials will be setup; see tune.grid_search over 4
22
+ learning rates in the `base_config` below:
23
+ 1) --num-learners=1 --num-gpus-per-learner=0.5 (2.0 GPUs used).
24
+ 2) --num-learners=1 --num-gpus-per-learner=0.3 (1.2 GPUs used).
25
+ 3) --num-learners=1 --num-gpus-per-learner=0.25 (1.0 GPU used).
26
+ 4) non-sensical setting: --num-learners=2 --num-gpus-per-learner=0.5 (expect an
27
+ NCCL-related error due to the fact that torch will try to perform DDP sharding,
28
+ but notices that the shards sit on the same GPU).
29
+
30
+ For debugging, use the following additional command line options
31
+ `--no-tune --num-env-runners=0`
32
+ which should allow you to set breakpoints anywhere in the RLlib code and
33
+ have the execution stop there for inspection and debugging.
34
+
35
+ Note that the shown GPU settings in this script also work in case you are not
36
+ running via tune, but instead are using the `--no-tune` command line option.
37
+
38
+ For logging to your WandB account, use:
39
+ `--wandb-key=[your WandB API key] --wandb-project=[some project name]
40
+ --wandb-run-name=[optional: WandB run name (within the defined project)]`
41
+
42
+ You can visualize experiment results in ~/ray_results using TensorBoard.
43
+
44
+
45
+ Results to expect
46
+ -----------------
47
+ In the console output, you can see that only fractional GPUs are being used by RLlib:
48
+
49
+ """
50
+ from ray.rllib.examples.envs.classes.gpu_requiring_env import GPURequiringEnv
51
+ from ray.rllib.utils.test_utils import (
52
+ add_rllib_example_script_args,
53
+ run_rllib_example_script_experiment,
54
+ )
55
+ from ray.tune.registry import get_trainable_cls
56
+
57
# Command line parser with this script's default stopping criteria; the new
# API stack and 2 EnvRunners are switched on by default.
parser = add_rllib_example_script_args(
    default_iters=50, default_reward=0.9, default_timesteps=100000
)
parser.set_defaults(
    enable_new_api_stack=True,
    num_env_runners=2,
)
# Fraction (or whole number) of a GPU to allocate to each EnvRunner.
parser.add_argument("--num-gpus-per-env-runner", type=float, default=0.5)
65
+
66
+
67
if __name__ == "__main__":
    args = parser.parse_args()

    base_config = (
        get_trainable_cls(args.algo)
        .get_default_config()
        .environment(GPURequiringEnv)
        # EnvRunner scaling.
        .env_runners(
            # Number of EnvRunner workers.
            num_env_runners=args.num_env_runners,
            # GPUs per EnvRunner. The memory on a (possibly fractional) GPU
            # must be enough to accommodate the RLModule AND, if applicable,
            # the env's own GPU needs.
            num_gpus_per_env_runner=args.num_gpus_per_env_runner,
        )
    )

    run_rllib_example_script_experiment(base_config, args)
.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/mixed_precision_training_float16_inference.py ADDED
@@ -0,0 +1,170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Example of using automatic mixed precision training on a torch RLModule.
2
+
3
+ This example:
4
+ - shows how to write a custom callback for RLlib to convert those RLModules
5
+ only(!) on the EnvRunners to float16 precision.
6
+ - shows how to write a custom env-to-module ConnectorV2 piece to add float16
7
+ observations to the action computing forward batch on the EnvRunners, but NOT
8
+ permanently write these changes into the episodes, such that on the
9
+ Learner side, the original float32 observations will be used (for the mixed
10
+ precision `forward_train` and `loss` computations).
11
+ - shows how to plugin torch's built-in `GradScaler` class to be used by the
12
+ TorchLearner to scale losses and unscale gradients in order to gain more stability
13
+ when training with mixed precision.
14
+ - shows how to write a custom TorchLearner to run the update step (overrides
15
+ `_update()`) within a `torch.amp.autocast()` context. This makes sure that the
16
+ - demonstrates how to plug in all the above custom components into an
17
+ `AlgorithmConfig` instance and start training with mixed-precision while
18
+ performing the inference on the EnvRunners with float16 precision.
19
+
20
+
21
+ How to run this script
22
+ ----------------------
23
+ `python [script file name].py --enable-new-api-stack`
24
+
25
+ For debugging, use the following additional command line options
26
+ `--no-tune --num-env-runners=0`
27
+ which should allow you to set breakpoints anywhere in the RLlib code and
28
+ have the execution stop there for inspection and debugging.
29
+
30
+ Note that the shown GPU settings in this script also work in case you are not
31
+ running via tune, but instead are using the `--no-tune` command line option.
32
+
33
+ For logging to your WandB account, use:
34
+ `--wandb-key=[your WandB API key] --wandb-project=[some project name]
35
+ --wandb-run-name=[optional: WandB run name (within the defined project)]`
36
+
37
+ You can visualize experiment results in ~/ray_results using TensorBoard.
38
+
39
+
40
+ Results to expect
41
+ -----------------
42
+ In the console output, you should see something like this:
43
+
44
+ +-----------------------------+------------+-----------------+--------+
45
+ | Trial name | status | loc | iter |
46
+ | | | | |
47
+ |-----------------------------+------------+-----------------+--------+
48
+ | PPO_CartPole-v1_485af_00000 | TERMINATED | 127.0.0.1:81045 | 22 |
49
+ +-----------------------------+------------+-----------------+--------+
50
+ +------------------+------------------------+------------------------+
51
+ | total time (s) | episode_return_mean | num_episodes_lifetime |
52
+ | | | |
53
+ |------------------+------------------------+------------------------+
54
+ | 281.3231 | 455.81 | 1426 |
55
+ +------------------+------------------------+------------------------+
56
+ """
57
+ import gymnasium as gym
58
+ import numpy as np
59
+ import torch
60
+
61
+ from ray.rllib.algorithms.algorithm import Algorithm
62
+ from ray.rllib.algorithms.ppo import PPOConfig
63
+ from ray.rllib.algorithms.ppo.torch.ppo_torch_learner import PPOTorchLearner
64
+ from ray.rllib.connectors.connector_v2 import ConnectorV2
65
+ from ray.rllib.utils.test_utils import (
66
+ add_rllib_example_script_args,
67
+ run_rllib_example_script_experiment,
68
+ )
69
+
70
+
71
+ parser = add_rllib_example_script_args(
72
+ default_iters=200, default_reward=450.0, default_timesteps=200000
73
+ )
74
+ parser.set_defaults(
75
+ algo="PPO",
76
+ enable_new_api_stack=True,
77
+ )
78
+
79
+
80
+ def on_algorithm_init(
81
+ algorithm: Algorithm,
82
+ **kwargs,
83
+ ) -> None:
84
+ """Callback making sure that all RLModules in the algo are `half()`'ed."""
85
+
86
+ # Switch all EnvRunner RLModules (assuming single RLModules) to float16.
87
+ algorithm.env_runner_group.foreach_env_runner(
88
+ lambda env_runner: env_runner.module.half()
89
+ )
90
+ if algorithm.eval_env_runner_group:
91
+ algorithm.eval_env_runner_group.foreach_env_runner(
92
+ lambda env_runner: env_runner.module.half()
93
+ )
94
+
95
+
96
+ class Float16Connector(ConnectorV2):
97
+ """ConnectorV2 piece preprocessing observations and rewards to be float16.
98
+
99
+ Note that users can also write a gymnasium.Wrapper for observations and rewards
100
+ to achieve the same thing.
101
+ """
102
+
103
+ def recompute_output_observation_space(
104
+ self,
105
+ input_observation_space,
106
+ input_action_space,
107
+ ):
108
+ return gym.spaces.Box(
109
+ input_observation_space.low.astype(np.float16),
110
+ input_observation_space.high.astype(np.float16),
111
+ input_observation_space.shape,
112
+ np.float16,
113
+ )
114
+
115
+ def __call__(self, *, rl_module, batch, episodes, **kwargs):
116
+ for sa_episode in self.single_agent_episode_iterator(episodes):
117
+ obs = sa_episode.get_observations(-1)
118
+ float16_obs = obs.astype(np.float16)
119
+ self.add_batch_item(
120
+ batch,
121
+ column="obs",
122
+ item_to_add=float16_obs,
123
+ single_agent_episode=sa_episode,
124
+ )
125
+ return batch
126
+
127
+
128
+ class PPOTorchMixedPrecisionLearner(PPOTorchLearner):
129
+ def _update(self, *args, **kwargs):
130
+ with torch.cuda.amp.autocast():
131
+ results = super()._update(*args, **kwargs)
132
+ return results
133
+
134
+
135
+ if __name__ == "__main__":
136
+ args = parser.parse_args()
137
+
138
+ assert (
139
+ args.enable_new_api_stack
140
+ ), "Must set --enable-new-api-stack when running this script!"
141
+ assert args.algo == "PPO", "Must set --algo=PPO when running this script!"
142
+
143
+ base_config = (
144
+ (PPOConfig().environment("CartPole-v1"))
145
+ .env_runners(env_to_module_connector=lambda env: Float16Connector())
146
+ # Plug in our custom callback (on_algorithm_init) to make EnvRunner RLModules
147
+ # float16 models.
148
+ .callbacks(on_algorithm_init=on_algorithm_init)
149
+ # Plug in the torch built-in loss scaler class to stabilize gradient
150
+ # computations (by scaling the loss, then unscaling the gradients before
151
+ # applying them). This is using the built-in, experimental feature of
152
+ # TorchLearner.
153
+ .experimental(_torch_grad_scaler_class=torch.cuda.amp.GradScaler)
154
+ .training(
155
+ # Plug in the custom Learner class to activate mixed-precision training for
156
+ # our torch RLModule (uses `torch.amp.autocast()`).
157
+ learner_class=PPOTorchMixedPrecisionLearner,
158
+ # Switch off grad clipping entirely b/c we use our custom grad scaler with
159
+ # built-in inf/nan detection (see `step` method of `torch.cuda.amp.GradScaler`).
160
+ grad_clip=None,
161
+ # Typical CartPole-v1 hyperparams known to work well:
162
+ gamma=0.99,
163
+ lr=0.0003,
164
+ num_epochs=6,
165
+ vf_loss_coeff=0.01,
166
+ use_kl_loss=True,
167
+ )
168
+ )
169
+
170
+ run_rllib_example_script_experiment(base_config, args)
.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/__pycache__/ppo_load_rl_modules.cpython-311.pyc ADDED
Binary file (3.21 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/classes/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/classes/__pycache__/custom_ppo_loss_fn_learner.cpython-311.pyc ADDED
Binary file (3.08 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/classes/__pycache__/separate_vf_lr_and_optimizer_learner.cpython-311.pyc ADDED
Binary file (4.06 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/classes/__pycache__/vpg_torch_learner.cpython-311.pyc ADDED
Binary file (4.63 kB). View file