Add files using upload-large-folder tool
Browse files
This view is limited to 50 files because it contains too many changes.
See raw diff
- .venv/lib/python3.11/site-packages/ray/rllib/examples/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/__pycache__/centralized_critic.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/__pycache__/compute_adapted_gae_on_postprocess_trajectory.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/__pycache__/quadx_waypoints.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/__pycache__/replay_buffer_api.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/policy/__pycache__/cliff_walking_wall_policy.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/actions/__init__.py +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/actions/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/actions/__pycache__/autoregressive_actions.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/actions/autoregressive_actions.py +109 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/actions/nested_action_spaces.py +86 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__init__.py +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/count_based_curiosity.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/euclidian_distance_based_curiosity.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/flatten_observations_dict_space.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/frame_stacking.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/mean_std_filtering.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/prev_actions_prev_rewards.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/__init__.py +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/__pycache__/count_based_curiosity.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/__pycache__/euclidian_distance_based_curiosity.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/__pycache__/protobuf_cartpole_observation_decoder.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/count_based_curiosity.py +92 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/euclidian_distance_based_curiosity.py +122 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/protobuf_cartpole_observation_decoder.py +80 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/count_based_curiosity.py +14 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/euclidian_distance_based_curiosity.py +14 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/flatten_observations_dict_space.py +154 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/frame_stacking.py +228 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/mean_std_filtering.py +198 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/prev_actions_prev_rewards.py +164 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/evaluation/custom_evaluation.py +230 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/__init__.py +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/__pycache__/float16_training_and_inference.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/__pycache__/fractional_gpus_per_learner.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/__pycache__/gpus_on_env_runners.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/__pycache__/mixed_precision_training_float16_inference.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/float16_training_and_inference.py +250 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/fractional_gpus_per_learner.py +119 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/gpus_on_env_runners.py +85 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/mixed_precision_training_float16_inference.py +170 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/learners/__init__.py +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/learners/__pycache__/ppo_load_rl_modules.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/learners/classes/__init__.py +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/learners/classes/__pycache__/custom_ppo_loss_fn_learner.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/learners/classes/__pycache__/separate_vf_lr_and_optimizer_learner.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/learners/classes/__pycache__/vpg_torch_learner.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/ray/rllib/examples/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (191 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/__pycache__/centralized_critic.cpython-311.pyc
ADDED
|
Binary file (14.7 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/__pycache__/compute_adapted_gae_on_postprocess_trajectory.cpython-311.pyc
ADDED
|
Binary file (6.98 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/__pycache__/quadx_waypoints.cpython-311.pyc
ADDED
|
Binary file (5.44 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/__pycache__/replay_buffer_api.cpython-311.pyc
ADDED
|
Binary file (3.21 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/policy/__pycache__/cliff_walking_wall_policy.cpython-311.pyc
ADDED
|
Binary file (6.6 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/actions/__init__.py
ADDED
|
File without changes
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/actions/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (199 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/actions/__pycache__/autoregressive_actions.cpython-311.pyc
ADDED
|
Binary file (4.83 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/actions/autoregressive_actions.py
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Example on how to define and run with an RLModule with a dependent action space.
|
| 2 |
+
|
| 3 |
+
This example:
|
| 4 |
+
- Shows how to write a custom RLModule outputting autoregressive actions.
|
| 5 |
+
The RLModule class used here implements a prior distribution for the first couple
|
| 6 |
+
of actions and then uses the sampled actions to compute the parameters for and
|
| 7 |
+
sample from a posterior distribution.
|
| 8 |
+
- Shows how to configure a PPO algorithm to use the custom RLModule.
|
| 9 |
+
- Stops the training after 2M steps or when the mean episode return
|
| 10 |
+
exceeds -0.45 in evaluation, i.e. if the agent has learned to
|
| 11 |
+
synchronize its actions.
|
| 12 |
+
|
| 13 |
+
For details on the environment used, take a look at the `CorrelatedActionsEnv`
|
| 14 |
+
class. To receive an episode return over 100, the agent must learn how to synchronize
|
| 15 |
+
its actions.
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
How to run this script
|
| 19 |
+
----------------------
|
| 20 |
+
`python [script file name].py --enable-new-api-stack --num-env-runners 2`
|
| 21 |
+
|
| 22 |
+
Control the number of `EnvRunner`s with the `--num-env-runners` flag. This
|
| 23 |
+
will increase the sampling speed.
|
| 24 |
+
|
| 25 |
+
For debugging, use the following additional command line options
|
| 26 |
+
`--no-tune --num-env-runners=0`
|
| 27 |
+
which should allow you to set breakpoints anywhere in the RLlib code and
|
| 28 |
+
have the execution stop there for inspection and debugging.
|
| 29 |
+
|
| 30 |
+
For logging to your WandB account, use:
|
| 31 |
+
`--wandb-key=[your WandB API key] --wandb-project=[some project name]
|
| 32 |
+
--wandb-run-name=[optional: WandB run name (within the defined project)]`
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
Results to expect
|
| 36 |
+
-----------------
|
| 37 |
+
You should quickly reach an episode return better than -0.5 through a simple PPO
|
| 38 |
+
policy. The logic behind beating the env is roughly:
|
| 39 |
+
|
| 40 |
+
OBS: optimal a1: r1: optimal a2: r2:
|
| 41 |
+
-1 2 0 -1.0 0
|
| 42 |
+
-0.5 1/2 -0.5 -0.5/-1.5 0
|
| 43 |
+
0 1 0 -1.0 0
|
| 44 |
+
0.5 0/1 -0.5 -0.5/-1.5 0
|
| 45 |
+
1 0 0 -1.0 0
|
| 46 |
+
|
| 47 |
+
Meaning, most of the time, you would receive a reward better than -0.5, but worse than
|
| 48 |
+
0.0.
|
| 49 |
+
|
| 50 |
+
+--------------------------------------+------------+--------+------------------+
|
| 51 |
+
| Trial name | status | iter | total time (s) |
|
| 52 |
+
| | | | |
|
| 53 |
+
|--------------------------------------+------------+--------+------------------+
|
| 54 |
+
| PPO_CorrelatedActionsEnv_6660d_00000 | TERMINATED | 76 | 132.438 |
|
| 55 |
+
+--------------------------------------+------------+--------+------------------+
|
| 56 |
+
+------------------------+------------------------+------------------------+
|
| 57 |
+
| episode_return_mean | num_env_steps_sample | ...env_steps_sampled |
|
| 58 |
+
| | d_lifetime | _lifetime_throughput |
|
| 59 |
+
|------------------------+------------------------+------------------------|
|
| 60 |
+
| -0.43 | 152000 | 1283.48 |
|
| 61 |
+
+------------------------+------------------------+------------------------+
|
| 62 |
+
"""
|
| 63 |
+
|
| 64 |
+
from ray.rllib.algorithms.ppo import PPOConfig
|
| 65 |
+
from ray.rllib.core.rl_module.rl_module import RLModuleSpec
|
| 66 |
+
from ray.rllib.examples.envs.classes.correlated_actions_env import CorrelatedActionsEnv
|
| 67 |
+
from ray.rllib.examples.rl_modules.classes.autoregressive_actions_rlm import (
|
| 68 |
+
AutoregressiveActionsRLM,
|
| 69 |
+
)
|
| 70 |
+
from ray.rllib.utils.test_utils import (
|
| 71 |
+
add_rllib_example_script_args,
|
| 72 |
+
run_rllib_example_script_experiment,
|
| 73 |
+
)
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
parser = add_rllib_example_script_args(
|
| 77 |
+
default_iters=1000,
|
| 78 |
+
default_timesteps=2000000,
|
| 79 |
+
default_reward=-0.45,
|
| 80 |
+
)
|
| 81 |
+
parser.set_defaults(enable_new_api_stack=True)
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
if __name__ == "__main__":
|
| 85 |
+
args = parser.parse_args()
|
| 86 |
+
|
| 87 |
+
if args.algo != "PPO":
|
| 88 |
+
raise ValueError(
|
| 89 |
+
"This example script only runs with PPO! Set --algo=PPO on the command "
|
| 90 |
+
"line."
|
| 91 |
+
)
|
| 92 |
+
|
| 93 |
+
base_config = (
|
| 94 |
+
PPOConfig()
|
| 95 |
+
.environment(CorrelatedActionsEnv)
|
| 96 |
+
.training(
|
| 97 |
+
train_batch_size_per_learner=2000,
|
| 98 |
+
num_epochs=12,
|
| 99 |
+
minibatch_size=256,
|
| 100 |
+
entropy_coeff=0.005,
|
| 101 |
+
lr=0.0003,
|
| 102 |
+
)
|
| 103 |
+
# Specify the RLModule class to be used.
|
| 104 |
+
.rl_module(
|
| 105 |
+
rl_module_spec=RLModuleSpec(module_class=AutoregressiveActionsRLM),
|
| 106 |
+
)
|
| 107 |
+
)
|
| 108 |
+
|
| 109 |
+
run_rllib_example_script_experiment(base_config, args)
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/actions/nested_action_spaces.py
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from gymnasium.spaces import Dict, Tuple, Box, Discrete, MultiDiscrete
|
| 2 |
+
|
| 3 |
+
from ray.tune.registry import register_env
|
| 4 |
+
from ray.rllib.connectors.env_to_module import FlattenObservations
|
| 5 |
+
from ray.rllib.examples.envs.classes.multi_agent import (
|
| 6 |
+
MultiAgentNestedSpaceRepeatAfterMeEnv,
|
| 7 |
+
)
|
| 8 |
+
from ray.rllib.examples.envs.classes.nested_space_repeat_after_me_env import (
|
| 9 |
+
NestedSpaceRepeatAfterMeEnv,
|
| 10 |
+
)
|
| 11 |
+
from ray.rllib.utils.test_utils import (
|
| 12 |
+
add_rllib_example_script_args,
|
| 13 |
+
run_rllib_example_script_experiment,
|
| 14 |
+
)
|
| 15 |
+
from ray.tune.registry import get_trainable_cls
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
# Read in common example script command line arguments.
|
| 19 |
+
parser = add_rllib_example_script_args(default_timesteps=200000, default_reward=-500.0)
|
| 20 |
+
parser.set_defaults(enable_new_api_stack=True)
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
if __name__ == "__main__":
|
| 24 |
+
args = parser.parse_args()
|
| 25 |
+
|
| 26 |
+
# Define env-to-module-connector pipeline for the new stack.
|
| 27 |
+
def _env_to_module_pipeline(env):
|
| 28 |
+
return FlattenObservations(multi_agent=args.num_agents > 0)
|
| 29 |
+
|
| 30 |
+
# Register our environment with tune.
|
| 31 |
+
if args.num_agents > 0:
|
| 32 |
+
register_env(
|
| 33 |
+
"env",
|
| 34 |
+
lambda c: MultiAgentNestedSpaceRepeatAfterMeEnv(
|
| 35 |
+
config=dict(c, **{"num_agents": args.num_agents})
|
| 36 |
+
),
|
| 37 |
+
)
|
| 38 |
+
else:
|
| 39 |
+
register_env("env", lambda c: NestedSpaceRepeatAfterMeEnv(c))
|
| 40 |
+
|
| 41 |
+
# Define the AlgorithmConfig used.
|
| 42 |
+
base_config = (
|
| 43 |
+
get_trainable_cls(args.algo)
|
| 44 |
+
.get_default_config()
|
| 45 |
+
.environment(
|
| 46 |
+
"env",
|
| 47 |
+
env_config={
|
| 48 |
+
"space": Dict(
|
| 49 |
+
{
|
| 50 |
+
"a": Tuple(
|
| 51 |
+
[Dict({"d": Box(-15.0, 3.0, ()), "e": Discrete(3)})]
|
| 52 |
+
),
|
| 53 |
+
"b": Box(-10.0, 10.0, (2,)),
|
| 54 |
+
"c": MultiDiscrete([3, 3]),
|
| 55 |
+
"d": Discrete(2),
|
| 56 |
+
}
|
| 57 |
+
),
|
| 58 |
+
"episode_len": 100,
|
| 59 |
+
},
|
| 60 |
+
)
|
| 61 |
+
.env_runners(env_to_module_connector=_env_to_module_pipeline)
|
| 62 |
+
# No history in Env (bandit problem).
|
| 63 |
+
.training(
|
| 64 |
+
gamma=0.0,
|
| 65 |
+
lr=0.0005,
|
| 66 |
+
)
|
| 67 |
+
)
|
| 68 |
+
|
| 69 |
+
# Add a simple multi-agent setup.
|
| 70 |
+
if args.num_agents > 0:
|
| 71 |
+
base_config.multi_agent(
|
| 72 |
+
policies={f"p{i}" for i in range(args.num_agents)},
|
| 73 |
+
policy_mapping_fn=lambda aid, *a, **kw: f"p{aid}",
|
| 74 |
+
)
|
| 75 |
+
|
| 76 |
+
# Fix some PPO-specific settings.
|
| 77 |
+
if args.algo == "PPO":
|
| 78 |
+
base_config.training(
|
| 79 |
+
# We don't want high entropy in this Env.
|
| 80 |
+
entropy_coeff=0.00005,
|
| 81 |
+
num_epochs=4,
|
| 82 |
+
vf_loss_coeff=0.01,
|
| 83 |
+
)
|
| 84 |
+
|
| 85 |
+
# Run everything as configured.
|
| 86 |
+
run_rllib_example_script_experiment(base_config, args)
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__init__.py
ADDED
|
File without changes
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (202 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/count_based_curiosity.cpython-311.pyc
ADDED
|
Binary file (719 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/euclidian_distance_based_curiosity.cpython-311.pyc
ADDED
|
Binary file (745 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/flatten_observations_dict_space.cpython-311.pyc
ADDED
|
Binary file (7.3 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/frame_stacking.cpython-311.pyc
ADDED
|
Binary file (9.67 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/mean_std_filtering.cpython-311.pyc
ADDED
|
Binary file (8.85 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/prev_actions_prev_rewards.cpython-311.pyc
ADDED
|
Binary file (8.54 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/__init__.py
ADDED
|
File without changes
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (210 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/__pycache__/count_based_curiosity.cpython-311.pyc
ADDED
|
Binary file (4.54 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/__pycache__/euclidian_distance_based_curiosity.cpython-311.pyc
ADDED
|
Binary file (5.88 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/__pycache__/protobuf_cartpole_observation_decoder.cpython-311.pyc
ADDED
|
Binary file (3.86 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/count_based_curiosity.py
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from collections import Counter
|
| 2 |
+
from typing import Any, List, Optional
|
| 3 |
+
|
| 4 |
+
import gymnasium as gym
|
| 5 |
+
|
| 6 |
+
from ray.rllib.connectors.connector_v2 import ConnectorV2
|
| 7 |
+
from ray.rllib.core.rl_module.rl_module import RLModule
|
| 8 |
+
from ray.rllib.utils.typing import EpisodeType
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class CountBasedCuriosity(ConnectorV2):
|
| 12 |
+
"""Learner ConnectorV2 piece to compute intrinsic rewards based on obs counts.
|
| 13 |
+
|
| 14 |
+
Add this connector piece to your Learner pipeline, through your algo config:
|
| 15 |
+
```
|
| 16 |
+
config.training(
|
| 17 |
+
learner_connector=lambda obs_sp, act_sp: CountBasedCuriosity()
|
| 18 |
+
)
|
| 19 |
+
```
|
| 20 |
+
|
| 21 |
+
Intrinsic rewards are computed on the Learner side based on naive observation
|
| 22 |
+
counts, which is why this connector should only be used for simple environments
|
| 23 |
+
with a reasonable number of possible observations. The intrinsic reward for a given
|
| 24 |
+
timestep is:
|
| 25 |
+
r(i) = intrinsic_reward_coeff * (1 / C(obs(i)))
|
| 26 |
+
where C is the total (lifetime) count of the obs at timestep i.
|
| 27 |
+
|
| 28 |
+
The intrinsic reward is added to the extrinsic reward and saved back into the
|
| 29 |
+
episode (under the main "rewards" key).
|
| 30 |
+
|
| 31 |
+
Note that the computation and saving back to the episode all happens before the
|
| 32 |
+
actual train batch is generated from the episode data. Thus, the Learner and the
|
| 33 |
+
RLModule used do not take notice of the extra reward added.
|
| 34 |
+
|
| 35 |
+
If you would like to use a more sophisticated mechanism for intrinsic reward
|
| 36 |
+
computations, take a look at the `EuclidianDistanceBasedCuriosity` connector piece
|
| 37 |
+
at `ray.rllib.examples.connectors.classes.euclidian_distance_based_curiosity`
|
| 38 |
+
"""
|
| 39 |
+
|
| 40 |
+
def __init__(
|
| 41 |
+
self,
|
| 42 |
+
input_observation_space: Optional[gym.Space] = None,
|
| 43 |
+
input_action_space: Optional[gym.Space] = None,
|
| 44 |
+
*,
|
| 45 |
+
intrinsic_reward_coeff: float = 1.0,
|
| 46 |
+
**kwargs,
|
| 47 |
+
):
|
| 48 |
+
"""Initializes a CountBasedCuriosity instance.
|
| 49 |
+
|
| 50 |
+
Args:
|
| 51 |
+
intrinsic_reward_coeff: The weight with which to multiply the intrinsic
|
| 52 |
+
reward before adding (and saving) it back to the main (extrinsic)
|
| 53 |
+
reward of the episode at each timestep.
|
| 54 |
+
"""
|
| 55 |
+
super().__init__(input_observation_space, input_action_space)
|
| 56 |
+
|
| 57 |
+
# Naive observation counter.
|
| 58 |
+
self._counts = Counter()
|
| 59 |
+
self.intrinsic_reward_coeff = intrinsic_reward_coeff
|
| 60 |
+
|
| 61 |
+
def __call__(
|
| 62 |
+
self,
|
| 63 |
+
*,
|
| 64 |
+
rl_module: RLModule,
|
| 65 |
+
batch: Any,
|
| 66 |
+
episodes: List[EpisodeType],
|
| 67 |
+
explore: Optional[bool] = None,
|
| 68 |
+
shared_data: Optional[dict] = None,
|
| 69 |
+
**kwargs,
|
| 70 |
+
) -> Any:
|
| 71 |
+
# Loop through all episodes and change the reward to
|
| 72 |
+
# [reward + intrinsic reward]
|
| 73 |
+
for sa_episode in self.single_agent_episode_iterator(
|
| 74 |
+
episodes=episodes, agents_that_stepped_only=False
|
| 75 |
+
):
|
| 76 |
+
# Loop through all obs, except the last one.
|
| 77 |
+
observations = sa_episode.get_observations(slice(None, -1))
|
| 78 |
+
# Get all respective (extrinsic) rewards.
|
| 79 |
+
rewards = sa_episode.get_rewards()
|
| 80 |
+
|
| 81 |
+
for i, (obs, rew) in enumerate(zip(observations, rewards)):
|
| 82 |
+
obs = tuple(obs)
|
| 83 |
+
# Add 1 to obs counter.
|
| 84 |
+
self._counts[obs] += 1
|
| 85 |
+
# Compute our count-based intrinsic reward and add it to the main
|
| 86 |
+
# (extrinsic) reward.
|
| 87 |
+
rew += self.intrinsic_reward_coeff * (1 / self._counts[obs])
|
| 88 |
+
# Store the new reward back to the episode (under the correct
|
| 89 |
+
# timestep/index).
|
| 90 |
+
sa_episode.set_rewards(new_data=rew, at_indices=i)
|
| 91 |
+
|
| 92 |
+
return batch
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/euclidian_distance_based_curiosity.py
ADDED
|
@@ -0,0 +1,122 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from collections import deque
|
| 2 |
+
from typing import Any, List, Optional
|
| 3 |
+
|
| 4 |
+
import gymnasium as gym
|
| 5 |
+
import numpy as np
|
| 6 |
+
|
| 7 |
+
from ray.rllib.connectors.connector_v2 import ConnectorV2
|
| 8 |
+
from ray.rllib.core.rl_module.rl_module import RLModule
|
| 9 |
+
from ray.rllib.utils.typing import EpisodeType
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class EuclidianDistanceBasedCuriosity(ConnectorV2):
|
| 13 |
+
"""Learner ConnectorV2 piece computing intrinsic rewards with euclidian distance.
|
| 14 |
+
|
| 15 |
+
Add this connector piece to your Learner pipeline, through your algo config:
|
| 16 |
+
```
|
| 17 |
+
config.training(
|
| 18 |
+
learner_connector=lambda obs_sp, act_sp: EuclidianDistanceBasedCuriosity()
|
| 19 |
+
)
|
| 20 |
+
```
|
| 21 |
+
|
| 22 |
+
Intrinsic rewards are computed on the Learner side based on comparing the euclidian
|
| 23 |
+
distance of observations vs already seen ones. A configurable number of observations
|
| 24 |
+
will be stored in a FIFO buffer and all incoming observations have their distance
|
| 25 |
+
measured against those.
|
| 26 |
+
|
| 27 |
+
The minimum distance measured is the intrinsic reward for the incoming obs
|
| 28 |
+
(multiplied by a fixed coefficient and added to the "main" extrinsic reward):
|
| 29 |
+
r(i) = intrinsic_reward_coeff * min(ED(o, o(i)) for o in stored_obs)
|
| 30 |
+
where `ED` is the euclidian distance and `stored_obs` is the buffer.
|
| 31 |
+
|
| 32 |
+
The intrinsic reward is then added to the extrinsic reward and saved back into the
|
| 33 |
+
episode (under the main "rewards" key).
|
| 34 |
+
|
| 35 |
+
Note that the computation and saving back to the episode all happens before the
|
| 36 |
+
actual train batch is generated from the episode data. Thus, the Learner and the
|
| 37 |
+
RLModule used do not take notice of the extra reward added.
|
| 38 |
+
|
| 39 |
+
Only one observation per incoming episode will be stored as a new one in the buffer.
|
| 40 |
+
Specifically, we pick the observation with the largest `min(ED)` value over all already
|
| 41 |
+
stored observations to be stored per episode.
|
| 42 |
+
|
| 43 |
+
If you would like to use a simpler, count-based mechanism for intrinsic reward
|
| 44 |
+
computations, take a look at the `CountBasedCuriosity` connector piece
|
| 45 |
+
at `ray.rllib.examples.connectors.classes.count_based_curiosity`
|
| 46 |
+
"""
|
| 47 |
+
|
| 48 |
+
def __init__(
|
| 49 |
+
self,
|
| 50 |
+
input_observation_space: Optional[gym.Space] = None,
|
| 51 |
+
input_action_space: Optional[gym.Space] = None,
|
| 52 |
+
*,
|
| 53 |
+
intrinsic_reward_coeff: float = 1.0,
|
| 54 |
+
max_buffer_size: int = 100,
|
| 55 |
+
**kwargs,
|
| 56 |
+
):
|
| 57 |
+
"""Initializes an EuclidianDistanceBasedCuriosity instance.
|
| 58 |
+
|
| 59 |
+
Args:
|
| 60 |
+
intrinsic_reward_coeff: The weight with which to multiply the intrinsic
|
| 61 |
+
reward before adding (and saving) it back to the main (extrinsic)
|
| 62 |
+
reward of the episode at each timestep.
|
| 63 |
+
"""
|
| 64 |
+
super().__init__(input_observation_space, input_action_space)
|
| 65 |
+
|
| 66 |
+
# Create an observation buffer
|
| 67 |
+
self.obs_buffer = deque(maxlen=max_buffer_size)
|
| 68 |
+
self.intrinsic_reward_coeff = intrinsic_reward_coeff
|
| 69 |
+
|
| 70 |
+
self._test = 0
|
| 71 |
+
|
| 72 |
+
def __call__(
|
| 73 |
+
self,
|
| 74 |
+
*,
|
| 75 |
+
rl_module: RLModule,
|
| 76 |
+
batch: Any,
|
| 77 |
+
episodes: List[EpisodeType],
|
| 78 |
+
explore: Optional[bool] = None,
|
| 79 |
+
shared_data: Optional[dict] = None,
|
| 80 |
+
**kwargs,
|
| 81 |
+
) -> Any:
|
| 82 |
+
if self._test > 10:
|
| 83 |
+
return batch
|
| 84 |
+
self._test += 1
|
| 85 |
+
# Loop through all episodes and change the reward to
|
| 86 |
+
# [reward + intrinsic reward]
|
| 87 |
+
for sa_episode in self.single_agent_episode_iterator(
|
| 88 |
+
episodes=episodes, agents_that_stepped_only=False
|
| 89 |
+
):
|
| 90 |
+
# Loop through all obs, except the last one.
|
| 91 |
+
observations = sa_episode.get_observations(slice(None, -1))
|
| 92 |
+
# Get all respective (extrinsic) rewards.
|
| 93 |
+
rewards = sa_episode.get_rewards()
|
| 94 |
+
|
| 95 |
+
max_dist_obs = None
|
| 96 |
+
max_dist = float("-inf")
|
| 97 |
+
for i, (obs, rew) in enumerate(zip(observations, rewards)):
|
| 98 |
+
# Compare obs to all stored observations and compute euclidian distance.
|
| 99 |
+
min_dist = 0.0
|
| 100 |
+
if self.obs_buffer:
|
| 101 |
+
min_dist = min(
|
| 102 |
+
np.sqrt(np.sum((obs - stored_obs) ** 2))
|
| 103 |
+
for stored_obs in self.obs_buffer
|
| 104 |
+
)
|
| 105 |
+
if min_dist > max_dist:
|
| 106 |
+
max_dist = min_dist
|
| 107 |
+
max_dist_obs = obs
|
| 108 |
+
|
| 109 |
+
# Compute our euclidian distance-based intrinsic reward and add it to
|
| 110 |
+
# the main (extrinsic) reward.
|
| 111 |
+
rew += self.intrinsic_reward_coeff * min_dist
|
| 112 |
+
# Store the new reward back to the episode (under the correct
|
| 113 |
+
# timestep/index).
|
| 114 |
+
sa_episode.set_rewards(new_data=rew, at_indices=i)
|
| 115 |
+
|
| 116 |
+
# Add the one observation of this episode with the largest (min) euclidian
|
| 117 |
+
# dist to all already stored obs to the buffer (maybe throwing out the
|
| 118 |
+
# oldest obs in there).
|
| 119 |
+
if max_dist_obs is not None:
|
| 120 |
+
self.obs_buffer.append(max_dist_obs)
|
| 121 |
+
|
| 122 |
+
return batch
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/protobuf_cartpole_observation_decoder.py
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Any, List, Optional
|
| 2 |
+
|
| 3 |
+
import gymnasium as gym
|
| 4 |
+
import numpy as np
|
| 5 |
+
|
| 6 |
+
from ray.rllib.connectors.connector_v2 import ConnectorV2
|
| 7 |
+
from ray.rllib.core.rl_module.rl_module import RLModule
|
| 8 |
+
from ray.rllib.examples.envs.classes.utils.cartpole_observations_proto import (
|
| 9 |
+
CartPoleObservation,
|
| 10 |
+
)
|
| 11 |
+
from ray.rllib.utils.annotations import override
|
| 12 |
+
from ray.rllib.utils.typing import EpisodeType
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class ProtobufCartPoleObservationDecoder(ConnectorV2):
    """Env-to-module ConnectorV2 piece that decodes protobuf-encoded CartPole obs.

    Every incoming observation is a serialized `CartPoleObservation` protobuf
    message held in a 1D uint8 array (in other words: a binary string). This piece
    parses that message and writes the resulting plain CartPole-v1 observation back
    into the episode.

    To use it, add it to your env-to-module pipeline through your algo config:
    ```
    config.env_runners(
        env_to_module_connector=lambda env: ProtobufCartPoleObservationDecoder()
    )
    ```

    Input observation space: a 1D Box of dtype uint8.
    Output observation space: Box(-inf, inf, (4,), float32), i.e. the normal
    CartPole-v1 space.
    """

    @override(ConnectorV2)
    def recompute_output_observation_space(
        self,
        input_observation_space: gym.Space,
        input_action_space: gym.Space,
    ) -> gym.Space:
        """Validate the incoming (binary string) space and return the decoded one.

        Args:
            input_observation_space: Space of the observations entering this piece.
                Must be a 1D Box of dtype uint8.
            input_action_space: Action space entering this piece (not used here).

        Returns:
            The 4-dimensional, unbounded float32 Box of decoded observations.
        """
        # A protobuf message arrives as a flat sequence of bytes.
        assert isinstance(input_observation_space, gym.spaces.Box)
        assert len(input_observation_space.shape) == 1
        assert input_observation_space.dtype.name == "uint8"
        # Decoded observations live in CartPole-v1's natural observation space.
        return gym.spaces.Box(
            low=float("-inf"),
            high=float("inf"),
            shape=(4,),
            dtype=np.float32,
        )

    def __call__(
        self,
        *,
        rl_module: RLModule,
        batch: Any,
        episodes: List[EpisodeType],
        explore: Optional[bool] = None,
        shared_data: Optional[dict] = None,
        **kwargs,
    ) -> Any:
        """Decode the most recent observation of every single-agent episode.

        The decoded observation replaces the encoded one inside the episode;
        `batch` itself is not touched and is returned as-is.
        """
        for episode in self.single_agent_episode_iterator(episodes=episodes):
            # The last observation is still the encoded one (binary string).
            encoded_obs = episode.get_observations(-1)

            # Parse the raw bytes into a protobuf message.
            message = CartPoleObservation()
            message.ParseFromString(encoded_obs.tobytes())

            # Assemble the natural CartPole-v1 observation (1D float32 tensor)
            # from the individual protobuf fields.
            fields = (
                message.x_pos,
                message.x_veloc,
                message.angle_pos,
                message.angle_veloc,
            )
            decoded_obs = np.array(fields, dtype=np.float32)

            # Overwrite the encoded observation in the episode with the decoded one.
            episode.set_observations(new_data=decoded_obs, at_indices=-1)

        # The batch passes through this piece unchanged.
        return batch
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/count_based_curiosity.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Placeholder for training with count-based curiosity.
|
| 2 |
+
|
| 3 |
+
The actual script can be found at a different location (see code below).
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
if __name__ == "__main__":
    import subprocess
    import sys
    from pathlib import Path

    # Forward to "python ../curiosity/[same script name].py [same options]".
    # The target is resolved relative to this file (not the current working
    # directory), and directory plus file name are joined into ONE path: passing
    # "../curiosity/" and the script name as two separate arguments would make the
    # interpreter try to execute the directory itself.
    this_script = Path(__file__).resolve()
    target_script = this_script.parent.parent / "curiosity" / this_script.name
    command = [sys.executable, str(target_script)] + sys.argv[1:]

    # Run the actual script. Its stdout/stderr are passed through (not captured and
    # dropped), and its exit code becomes the exit code of this placeholder.
    sys.exit(subprocess.run(command).returncode)
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/euclidian_distance_based_curiosity.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Placeholder for training with Euclidean distance-based curiosity.
|
| 2 |
+
|
| 3 |
+
The actual script can be found at a different location (see code below).
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
if __name__ == "__main__":
    import subprocess
    import sys
    from pathlib import Path

    # Forward to "python ../curiosity/[same script name].py [same options]".
    # The target is resolved relative to this file (not the current working
    # directory), and directory plus file name are joined into ONE path: passing
    # "../curiosity/" and the script name as two separate arguments would make the
    # interpreter try to execute the directory itself.
    this_script = Path(__file__).resolve()
    target_script = this_script.parent.parent / "curiosity" / this_script.name
    command = [sys.executable, str(target_script)] + sys.argv[1:]

    # Run the actual script. Its stdout/stderr are passed through (not captured and
    # dropped), and its exit code becomes the exit code of this placeholder.
    sys.exit(subprocess.run(command).returncode)
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/flatten_observations_dict_space.py
ADDED
|
@@ -0,0 +1,154 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Example using a ConnectorV2 to flatten arbitrarily nested dict or tuple observations.
|
| 2 |
+
|
| 3 |
+
An RLlib Algorithm has 3 distinct connector pipelines:
|
| 4 |
+
- An env-to-module pipeline in an EnvRunner accepting a list of episodes and producing
|
| 5 |
+
a batch for an RLModule to compute actions (`forward_inference()` or
|
| 6 |
+
`forward_exploration()`).
|
| 7 |
+
- A module-to-env pipeline in an EnvRunner taking the RLModule's output and converting
|
| 8 |
+
it into an action readable by the environment.
|
| 9 |
+
- A learner connector pipeline on a Learner taking a list of episodes and producing
|
| 10 |
+
a batch for an RLModule to perform the training forward pass (`forward_train()`).
|
| 11 |
+
|
| 12 |
+
Each of these pipelines has a fixed set of default ConnectorV2 pieces that RLlib
|
| 13 |
+
adds/prepends to these pipelines in order to perform the most basic functionalities.
|
| 14 |
+
For example, RLlib adds the `AddObservationsFromEpisodesToBatch` ConnectorV2 into any
|
| 15 |
+
env-to-module pipeline to make sure the batch for computing actions contains - at the
|
| 16 |
+
minimum - the most recent observation.
|
| 17 |
+
|
| 18 |
+
On top of these default ConnectorV2 pieces, users can define their own ConnectorV2
|
| 19 |
+
pieces (or use the ones available already in RLlib) and add them to one of the 3
|
| 20 |
+
different pipelines described above, as required.
|
| 21 |
+
|
| 22 |
+
This example:
|
| 23 |
+
- shows how the `FlattenObservations` ConnectorV2 piece can be added to the
|
| 24 |
+
env-to-module pipeline.
|
| 25 |
+
- demonstrates that by using this connector, any arbitrarily nested dict or tuple
|
| 26 |
+
observations is properly flattened into a simple 1D tensor, for easier RLModule
|
| 27 |
+
processing.
|
| 28 |
+
- shows how - in a multi-agent setup - individual agents can be specified, whose
|
| 29 |
+
observations should be flattened (while other agents' observations will always
|
| 30 |
+
be left as-is).
|
| 31 |
+
- uses a variant of the CartPole-v1 environment, in which the 4 observation items
|
| 32 |
+
(x-pos, x-veloc, angle, and angle-veloc) are taken apart and put into a nested dict
|
| 33 |
+
with the structure:
|
| 34 |
+
{
|
| 35 |
+
"x-pos": [x-pos],
|
| 36 |
+
"angular-pos": {
|
| 37 |
+
"value": [angle],
|
| 38 |
+
"some_random_stuff": [random Discrete(3)], # <- should be ignored by algo
|
| 39 |
+
},
|
| 40 |
+
"velocs": Tuple([x-veloc], [angle-veloc]),
|
| 41 |
+
}
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
How to run this script
|
| 45 |
+
----------------------
|
| 46 |
+
`python [script file name].py --enable-new-api-stack`
|
| 47 |
+
|
| 48 |
+
For debugging, use the following additional command line options
|
| 49 |
+
`--no-tune --num-env-runners=0`
|
| 50 |
+
which should allow you to set breakpoints anywhere in the RLlib code and
|
| 51 |
+
have the execution stop there for inspection and debugging.
|
| 52 |
+
|
| 53 |
+
For logging to your WandB account, use:
|
| 54 |
+
`--wandb-key=[your WandB API key] --wandb-project=[some project name]
|
| 55 |
+
--wandb-run-name=[optional: WandB run name (within the defined project)]`
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
Results to expect
|
| 59 |
+
-----------------
|
| 60 |
+
|
| 61 |
+
+---------------------+------------+----------------+--------+------------------+
|
| 62 |
+
| Trial name | status | loc | iter | total time (s) |
|
| 63 |
+
| | | | | |
|
| 64 |
+
|---------------------+------------+----------------+--------+------------------+
|
| 65 |
+
| PPO_env_a2fd6_00000 | TERMINATED | 127.0.0.1:7409 | 25 | 24.1426 |
|
| 66 |
+
+---------------------+------------+----------------+--------+------------------+
|
| 67 |
+
+------------------------+------------------------+------------------------+
|
| 68 |
+
| num_env_steps_sample | num_env_steps_traine | episode_return_mean |
|
| 69 |
+
| d_lifetime | d_lifetime | |
|
| 70 |
+
+------------------------+------------------------+------------------------|
|
| 71 |
+
| 100000 | 100000 | 421.42 |
|
| 72 |
+
+------------------------+------------------------+------------------------+
|
| 73 |
+
"""
|
| 74 |
+
from ray.tune.registry import register_env
|
| 75 |
+
from ray.rllib.connectors.env_to_module import FlattenObservations
|
| 76 |
+
from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig
|
| 77 |
+
from ray.rllib.examples.envs.classes.cartpole_with_dict_observation_space import (
|
| 78 |
+
CartPoleWithDictObservationSpace,
|
| 79 |
+
)
|
| 80 |
+
from ray.rllib.examples.envs.classes.multi_agent import (
|
| 81 |
+
MultiAgentCartPoleWithDictObservationSpace,
|
| 82 |
+
)
|
| 83 |
+
from ray.rllib.utils.test_utils import (
|
| 84 |
+
add_rllib_example_script_args,
|
| 85 |
+
run_rllib_example_script_experiment,
|
| 86 |
+
)
|
| 87 |
+
from ray.tune.registry import get_trainable_cls
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
# Build the shared example-script CLI and switch the new API stack on by default.
parser = add_rllib_example_script_args(default_reward=400.0, default_timesteps=200000)
parser.set_defaults(enable_new_api_stack=True)


if __name__ == "__main__":
    args = parser.parse_args()
    # Any positive `--num-agents` value selects the multi-agent setup.
    is_multi_agent = args.num_agents > 0

    def _env_to_module_pipeline(env):
        """Return the custom env-to-module piece: the observation flattener."""
        return FlattenObservations(multi_agent=is_multi_agent)

    # Pick the env creator (multi-agent vs. single-agent dict-observation CartPole)
    # and register it with tune under the name "env".
    if is_multi_agent:

        def _make_env(_):
            return MultiAgentCartPoleWithDictObservationSpace(
                config={"num_agents": args.num_agents}
            )

    else:

        def _make_env(_):
            return CartPoleWithDictObservationSpace()

    register_env("env", _make_env)

    # Start from the chosen algorithm's default config, then layer the settings.
    default_config = get_trainable_cls(args.algo).get_default_config()
    model_config = DefaultModelConfig(
        fcnet_hiddens=[32],
        fcnet_activation="linear",
        vf_share_layers=True,
    )
    base_config = (
        default_config.environment("env")
        .env_runners(env_to_module_connector=_env_to_module_pipeline)
        .training(gamma=0.99, lr=0.0003)
        .rl_module(model_config=model_config)
    )

    # Simple multi-agent setup: one policy "p[i]" per agent, mapped by agent ID.
    if is_multi_agent:
        policy_ids = {f"p{i}" for i in range(args.num_agents)}
        base_config.multi_agent(
            policies=policy_ids,
            policy_mapping_fn=lambda aid, *a, **kw: f"p{aid}",
        )

    # Algorithm-specific training settings (for better learning behavior only).
    algo_specific_training = {
        "PPO": {"num_epochs": 6, "vf_loss_coeff": 0.01},
        "IMPALA": {"lr": 0.0005, "vf_loss_coeff": 0.05, "entropy_coeff": 0.0},
    }
    if args.algo in algo_specific_training:
        base_config.training(**algo_specific_training[args.algo])

    # Run everything as configured.
    run_rllib_example_script_experiment(base_config, args)
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/frame_stacking.py
ADDED
|
@@ -0,0 +1,228 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Example using 2 ConnectorV2 for observation frame-stacking in Atari environments.
|
| 2 |
+
|
| 3 |
+
An RLlib Algorithm has 3 distinct connector pipelines:
|
| 4 |
+
- An env-to-module pipeline in an EnvRunner accepting a list of episodes and producing
|
| 5 |
+
a batch for an RLModule to compute actions (`forward_inference()` or
|
| 6 |
+
`forward_exploration()`).
|
| 7 |
+
- A module-to-env pipeline in an EnvRunner taking the RLModule's output and converting
|
| 8 |
+
it into an action readable by the environment.
|
| 9 |
+
- A learner connector pipeline on a Learner taking a list of episodes and producing
|
| 10 |
+
a batch for an RLModule to perform the training forward pass (`forward_train()`).
|
| 11 |
+
|
| 12 |
+
Each of these pipelines has a fixed set of default ConnectorV2 pieces that RLlib
|
| 13 |
+
adds/prepends to these pipelines in order to perform the most basic functionalities.
|
| 14 |
+
For example, RLlib adds the `AddObservationsFromEpisodesToBatch` ConnectorV2 into any
|
| 15 |
+
env-to-module pipeline to make sure the batch for computing actions contains - at the
|
| 16 |
+
minimum - the most recent observation.
|
| 17 |
+
|
| 18 |
+
On top of these default ConnectorV2 pieces, users can define their own ConnectorV2
|
| 19 |
+
pieces (or use the ones available already in RLlib) and add them to one of the 3
|
| 20 |
+
different pipelines described above, as required.
|
| 21 |
+
|
| 22 |
+
This example:
|
| 23 |
+
- shows how the `FrameStackingEnvToModule` ConnectorV2 piece can be added to the
|
| 24 |
+
env-to-module pipeline.
|
| 25 |
+
- shows how the `FrameStackingLearner` ConnectorV2 piece can be added to the
|
| 26 |
+
learner connector pipeline.
|
| 27 |
+
- demonstrates that using these two pieces (rather than performing framestacking
|
| 28 |
+
already inside the environment using a gymnasium wrapper) increases overall
|
| 29 |
+
performance by about 5%.
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
How to run this script
|
| 33 |
+
----------------------
|
| 34 |
+
`python [script file name].py --enable-new-api-stack --num-frames=4 --env=ALE/Pong-v5`
|
| 35 |
+
|
| 36 |
+
Use the `--num-frames` option to define the number of observations to framestack.
|
| 37 |
+
If you don't want to use Connectors to perform the framestacking, set the
|
| 38 |
+
`--use-gym-wrapper-framestacking` flag to perform framestacking already inside a
|
| 39 |
+
gymnasium observation wrapper. In this case though, be aware that the tensors being
|
| 40 |
+
sent through the network are `--num-frames` x larger than if you use the Connector
|
| 41 |
+
setup.
|
| 42 |
+
|
| 43 |
+
For debugging, use the following additional command line options
|
| 44 |
+
`--no-tune --num-env-runners=0`
|
| 45 |
+
which should allow you to set breakpoints anywhere in the RLlib code and
|
| 46 |
+
have the execution stop there for inspection and debugging.
|
| 47 |
+
|
| 48 |
+
For logging to your WandB account, use:
|
| 49 |
+
`--wandb-key=[your WandB API key] --wandb-project=[some project name]
|
| 50 |
+
--wandb-run-name=[optional: WandB run name (within the defined project)]`
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
Results to expect
|
| 54 |
+
-----------------
|
| 55 |
+
|
| 56 |
+
With `--num-frames=4` and using the two extra ConnectorV2 pieces (in the env-to-module
|
| 57 |
+
and learner connector pipelines), you should see something like this using:
|
| 58 |
+
`--env ALE/Pong-v5 --num-learners=4 --num-gpus-per-learner=1 --num-env-runners=95`
|
| 59 |
+
+---------------------------+------------+--------+------------------+...
|
| 60 |
+
| Trial name | status | iter | total time (s) |
|
| 61 |
+
| | | | |
|
| 62 |
+
|---------------------------+------------+--------+------------------+...
|
| 63 |
+
| PPO_atari-env_2fc4a_00000 | TERMINATED | 200 | 335.837 |
|
| 64 |
+
+---------------------------+------------+--------+------------------+...
|
| 65 |
+
|
| 66 |
+
Note that the time to run these 200 iterations is about ~5% faster than when
|
| 67 |
+
performing framestacking already inside the environment (using a
|
| 68 |
+
`gymnasium.wrappers.ObservationWrapper`), due to the additional network traffic
|
| 69 |
+
needed (sending back 4x[obs] batches instead of 1x[obs] to the learners).
|
| 70 |
+
|
| 71 |
+
Thus, with the `--use-gym-wrapper-framestacking` option (all other options being equal),
|
| 72 |
+
the output looks like this:
|
| 73 |
+
+---------------------------+------------+--------+------------------+...
|
| 74 |
+
| Trial name | status | iter | total time (s) |
|
| 75 |
+
| | | | |
|
| 76 |
+
|---------------------------+------------+--------+------------------+...
|
| 77 |
+
| PPO_atari-env_2fc4a_00000 | TERMINATED | 200 | 351.505 |
|
| 78 |
+
+---------------------------+------------+--------+------------------+...
|
| 79 |
+
"""
|
| 80 |
+
import gymnasium as gym
|
| 81 |
+
|
| 82 |
+
from ray.rllib.connectors.env_to_module.frame_stacking import FrameStackingEnvToModule
|
| 83 |
+
from ray.rllib.connectors.learner.frame_stacking import FrameStackingLearner
|
| 84 |
+
from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig
|
| 85 |
+
from ray.rllib.env.wrappers.atari_wrappers import wrap_atari_for_new_api_stack
|
| 86 |
+
from ray.rllib.examples.envs.classes.multi_agent import make_multi_agent
|
| 87 |
+
from ray.rllib.utils.test_utils import (
|
| 88 |
+
add_rllib_example_script_args,
|
| 89 |
+
run_rllib_example_script_experiment,
|
| 90 |
+
)
|
| 91 |
+
from ray.tune.registry import get_trainable_cls
|
| 92 |
+
|
| 93 |
+
# Read in common example script command line arguments.
parser = add_rllib_example_script_args(
    default_timesteps=5000000, default_reward=20.0, default_iters=200
)
# Use Pong (on the new API stack) by default.
parser.set_defaults(
    enable_new_api_stack=True,
    env="ale_py:ALE/Pong-v5",
)
# Number of observation frames that get stacked (by the connectors or the wrapper).
parser.add_argument(
    "--num-frames",
    type=int,
    default=4,
    help="The number of observation frames to stack.",
)
# Switch between connector-based framestacking (default) and framestacking done
# inside the Atari observation wrapper.
parser.add_argument(
    "--use-gym-wrapper-framestacking",
    action="store_true",
    help="Whether to use RLlib's Atari wrapper's framestacking capabilities (as "
    "opposed to doing it via a specific ConnectorV2 pipeline).",
)
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
if __name__ == "__main__":
    from ray import tune

    args = parser.parse_args()
    # Any positive `--num-agents` value selects the multi-agent setup.
    is_multi_agent = args.num_agents > 0
    # True: stack frames inside the env wrapper; False: stack via ConnectorV2 pieces.
    stack_in_env = args.use_gym_wrapper_framestacking

    def _make_env_to_module_connector(env):
        """Return the frame stacking piece for the env-to-module pipeline.

        Only this single piece is returned; RLlib integrates it into a pipeline
        (adding its default pieces to the end) and fixes the input and output
        spaces of the pieces automatically. The frame stacking connector does NOT
        write the stacked frames back into the episode (to save memory and network
        traffic), which is why the same stacking has to happen on the Learner side
        as well (see `_make_learner_connector`).
        """
        return FrameStackingEnvToModule(
            num_frames=args.num_frames,
            multi_agent=is_multi_agent,
        )

    def _make_learner_connector(input_observation_space, input_action_space):
        """Return the frame stacking piece for the learner connector pipeline."""
        return FrameStackingLearner(
            num_frames=args.num_frames,
            multi_agent=is_multi_agent,
        )

    def _env_creator(cfg):
        """Create the wrapped Atari env, w/o RLlib's usual hard-coded framestacking.

        Frames are only stacked inside the observation wrapper if requested on the
        command line; otherwise that job is left to the frame stacking connectors.
        """
        framestack = args.num_frames if stack_in_env else None
        atari_env = gym.make(args.env, **cfg, render_mode="rgb_array")
        return wrap_atari_for_new_api_stack(atari_env, framestack=framestack)

    # Register the (single- or multi-agent) env under the name "atari-env".
    if is_multi_agent:

        def _multi_agent_env_creator(cfg):
            return make_multi_agent(_env_creator)(
                dict(cfg, num_agents=args.num_agents)
            )

        tune.register_env("atari-env", _multi_agent_env_creator)
    else:
        tune.register_env("atari-env", _env_creator)

    # The custom connector pieces are only plugged in when the env wrapper does not
    # already perform the framestacking.
    env_to_module_connector = None if stack_in_env else _make_env_to_module_connector
    learner_connector = None if stack_in_env else _make_learner_connector

    # Make the env analogous to the old v4 + NoFrameskip setup.
    env_config = {
        "frameskip": 1,
        "full_action_space": False,
        "repeat_action_probability": 0.0,
    }
    model_config = DefaultModelConfig(
        vf_share_layers=True,
        conv_filters=[(16, 4, 2), (32, 4, 2), (64, 4, 2), (128, 4, 2)],
        conv_activation="relu",
        head_fcnet_hiddens=[256],
    )

    base_config = (
        get_trainable_cls(args.algo)
        .get_default_config()
        .environment("atari-env", env_config=env_config, clip_rewards=True)
        .env_runners(
            env_to_module_connector=env_to_module_connector,
            num_envs_per_env_runner=1 if is_multi_agent else 2,
        )
        .training(
            learner_connector=learner_connector,
            entropy_coeff=0.01,
            # Linearly adjust learning rate based on number of GPUs.
            lr=0.00015 * (args.num_learners or 1),
            grad_clip=100.0,
            grad_clip_by="global_norm",
        )
        .rl_module(model_config=model_config)
    )

    # PPO specific settings.
    if args.algo == "PPO":
        ppo_training = {
            "num_epochs": 10,
            "minibatch_size": 64,
            "lambda_": 0.95,
            "kl_coeff": 0.5,
            "clip_param": 0.1,
            "vf_clip_param": 10.0,
        }
        base_config.training(**ppo_training)

    # Simple multi-agent setup: one policy "p[i]" per agent, mapped by agent ID.
    if is_multi_agent:
        policy_ids = {f"p{i}" for i in range(args.num_agents)}
        base_config.multi_agent(
            policies=policy_ids,
            policy_mapping_fn=lambda aid, *a, **kw: f"p{aid}",
        )

    # Run everything as configured.
    run_rllib_example_script_experiment(base_config, args)
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/mean_std_filtering.py
ADDED
|
@@ -0,0 +1,198 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Example using a ConnectorV2 for processing observations with a mean/std filter.
|
| 2 |
+
|
| 3 |
+
An RLlib Algorithm has 3 distinct connector pipelines:
|
| 4 |
+
- An env-to-module pipeline in an EnvRunner accepting a list of episodes and producing
|
| 5 |
+
a batch for an RLModule to compute actions (`forward_inference()` or
|
| 6 |
+
`forward_exploration()`).
|
| 7 |
+
- A module-to-env pipeline in an EnvRunner taking the RLModule's output and converting
|
| 8 |
+
it into an action readable by the environment.
|
| 9 |
+
- A learner connector pipeline on a Learner taking a list of episodes and producing
|
| 10 |
+
a batch for an RLModule to perform the training forward pass (`forward_train()`).
|
| 11 |
+
|
| 12 |
+
Each of these pipelines has a fixed set of default ConnectorV2 pieces that RLlib
|
| 13 |
+
adds/prepends to these pipelines in order to perform the most basic functionalities.
|
| 14 |
+
For example, RLlib adds the `AddObservationsFromEpisodesToBatch` ConnectorV2 into any
|
| 15 |
+
env-to-module pipeline to make sure the batch for computing actions contains - at the
|
| 16 |
+
minimum - the most recent observation.
|
| 17 |
+
|
| 18 |
+
On top of these default ConnectorV2 pieces, users can define their own ConnectorV2
|
| 19 |
+
pieces (or use the ones available already in RLlib) and add them to one of the 3
|
| 20 |
+
different pipelines described above, as required.
|
| 21 |
+
|
| 22 |
+
This example:
|
| 23 |
+
- shows how the `MeanStdFilter` ConnectorV2 piece can be added to the env-to-module
|
| 24 |
+
pipeline.
|
| 25 |
+
- demonstrates that using such a filter enhances learning behavior (or even makes
|
| 26 |
+
it possible to learn overall) in some environments, especially those with lopsided
|
| 27 |
+
observation spaces, for example `Box(-3000, -1000, ...)`.
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
How to run this script
|
| 31 |
+
----------------------
|
| 32 |
+
`python [script file name].py --enable-new-api-stack`
|
| 33 |
+
|
| 34 |
+
For debugging, use the following additional command line options
|
| 35 |
+
`--no-tune --num-env-runners=0`
|
| 36 |
+
which should allow you to set breakpoints anywhere in the RLlib code and
|
| 37 |
+
have the execution stop there for inspection and debugging.
|
| 38 |
+
|
| 39 |
+
For logging to your WandB account, use:
|
| 40 |
+
`--wandb-key=[your WandB API key] --wandb-project=[some project name]
|
| 41 |
+
--wandb-run-name=[optional: WandB run name (within the defined project)]`
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
Results to expect
|
| 45 |
+
-----------------
|
| 46 |
+
Running this example with the mean-std filter results in the normally expected Pendulum
|
| 47 |
+
learning behavior:
|
| 48 |
+
+-------------------------------+------------+-----------------+--------+
|
| 49 |
+
| Trial name | status | loc | iter |
|
| 50 |
+
| | | | |
|
| 51 |
+
|-------------------------------+------------+-----------------+--------+
|
| 52 |
+
| PPO_lopsided-pend_f9c96_00000 | TERMINATED | 127.0.0.1:43612 | 77 |
|
| 53 |
+
+-------------------------------+------------+-----------------+--------+
|
| 54 |
+
+------------------+------------------------+-----------------------+
|
| 55 |
+
| total time (s) | num_env_steps_sample | episode_return_mean |
|
| 56 |
+
| | d_lifetime | |
|
| 57 |
+
|------------------+------------------------+-----------------------|
|
| 58 |
+
| 30.7466 | 40040 | -276.3 |
|
| 59 |
+
+------------------+------------------------+-----------------------+
|
| 60 |
+
|
| 61 |
+
If you try using the `--disable-mean-std-filter` (all other things being equal), you
|
| 62 |
+
will either see no learning progress at all (or a very slow one), but more likely some
|
| 63 |
+
numerical instability related error will be thrown:
|
| 64 |
+
|
| 65 |
+
ValueError: Expected parameter loc (Tensor of shape (64, 1)) of distribution
|
| 66 |
+
Normal(loc: torch.Size([64, 1]), scale: torch.Size([64, 1])) to satisfy the
|
| 67 |
+
constraint Real(), but found invalid values:
|
| 68 |
+
tensor([[nan],
|
| 69 |
+
[nan],
|
| 70 |
+
[nan],
|
| 71 |
+
...
|
| 72 |
+
"""
|
| 73 |
+
import gymnasium as gym
|
| 74 |
+
import numpy as np
|
| 75 |
+
|
| 76 |
+
from ray.rllib.connectors.env_to_module.mean_std_filter import MeanStdFilter
|
| 77 |
+
from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig
|
| 78 |
+
from ray.rllib.examples.envs.classes.multi_agent import MultiAgentPendulum
|
| 79 |
+
from ray.rllib.utils.framework import try_import_torch
|
| 80 |
+
from ray.rllib.utils.test_utils import (
|
| 81 |
+
add_rllib_example_script_args,
|
| 82 |
+
run_rllib_example_script_experiment,
|
| 83 |
+
)
|
| 84 |
+
from ray.tune.registry import get_trainable_cls, register_env
|
| 85 |
+
|
| 86 |
+
# Torch handle: first element of the pair returned by RLlib's import helper.
torch, _ = try_import_torch()

# Common example script command line arguments, with this example's defaults.
parser = add_rllib_example_script_args(
    default_reward=-300.0,
    default_timesteps=500000,
    default_iters=500,
)
# Extra flag for running the experiment without the mean/std filter.
parser.add_argument(
    "--disable-mean-std-filter",
    help="Run w/o a mean/std env-to-module connector piece (filter).",
    action="store_true",
)
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
class LopsidedObs(gym.ObservationWrapper):
    """Observation wrapper shifting observations into a far-off, negative range.

    The declared observation space is Box(-4000.0, -1456.0, (3,), float32); an
    input value of -1.0 maps to -4000.0 and an input value of 1.0 maps to -1456.0.
    """

    def __init__(self, env):
        super().__init__(env)
        self.observation_space = gym.spaces.Box(
            low=-4000.0,
            high=-1456.0,
            shape=(3,),
            dtype=np.float32,
        )

    def observation(self, observation):
        """Linearly rescale `observation` from [-1.0, 1.0] to [-4000.0, -1456.0]."""
        # NOTE(review): inputs outside [-1.0, 1.0] end up outside the declared Box;
        # confirm whether all 3 wrapped observation components respect that range.
        # First normalize to [0.0, 1.0] ...
        unit_scaled = (observation + 1.0) / 2.0
        # ... then stretch to the width of the target range and shift to its low end.
        return unit_scaled * (4000.0 - 1456.0) - 4000.0
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
if __name__ == "__main__":
|
| 111 |
+
args = parser.parse_args()
|
| 112 |
+
|
| 113 |
+
assert (
|
| 114 |
+
args.enable_new_api_stack
|
| 115 |
+
), "Must set --enable-new-api-stack when running this script!"
|
| 116 |
+
|
| 117 |
+
# Register our environment with tune.
|
| 118 |
+
if args.num_agents > 0:
|
| 119 |
+
register_env(
|
| 120 |
+
"lopsided-pend",
|
| 121 |
+
lambda _: MultiAgentPendulum(config={"num_agents": args.num_agents}),
|
| 122 |
+
)
|
| 123 |
+
else:
|
| 124 |
+
register_env("lopsided-pend", lambda _: LopsidedObs(gym.make("Pendulum-v1")))
|
| 125 |
+
|
| 126 |
+
base_config = (
|
| 127 |
+
get_trainable_cls(args.algo)
|
| 128 |
+
.get_default_config()
|
| 129 |
+
.environment("lopsided-pend")
|
| 130 |
+
.env_runners(
|
| 131 |
+
# TODO (sven): MAEnvRunner does not support vectorized envs yet
|
| 132 |
+
# due to gym's env checkers and incompatibility with RLlib's
|
| 133 |
+
# MultiAgentEnv API.
|
| 134 |
+
num_envs_per_env_runner=1 if args.num_agents > 0 else 20,
|
| 135 |
+
# Define a single connector piece to be prepended to the env-to-module
|
| 136 |
+
# connector pipeline.
|
| 137 |
+
# Alternatively, return a list of n ConnectorV2 pieces (which will then be
|
| 138 |
+
# included in an automatically generated EnvToModulePipeline) or return an
|
| 139 |
+
# EnvToModulePipeline directly.
|
| 140 |
+
env_to_module_connector=(
|
| 141 |
+
None
|
| 142 |
+
if args.disable_mean_std_filter
|
| 143 |
+
else lambda env: MeanStdFilter(multi_agent=args.num_agents > 0)
|
| 144 |
+
),
|
| 145 |
+
)
|
| 146 |
+
.training(
|
| 147 |
+
train_batch_size_per_learner=512,
|
| 148 |
+
gamma=0.95,
|
| 149 |
+
# Linearly adjust learning rate based on number of Learners.
|
| 150 |
+
lr=0.0003 * (args.num_learners or 1),
|
| 151 |
+
vf_loss_coeff=0.01,
|
| 152 |
+
)
|
| 153 |
+
.rl_module(
|
| 154 |
+
model_config=DefaultModelConfig(
|
| 155 |
+
fcnet_activation="relu",
|
| 156 |
+
fcnet_kernel_initializer=torch.nn.init.xavier_uniform_,
|
| 157 |
+
fcnet_bias_initializer=torch.nn.init.constant_,
|
| 158 |
+
fcnet_bias_initializer_kwargs={"val": 0.0},
|
| 159 |
+
),
|
| 160 |
+
)
|
| 161 |
+
# In case you would like to run with evaluation EnvRunners, make sure your
|
| 162 |
+
# `evaluation_config` key contains the `use_worker_filter_stats=False` setting
|
| 163 |
+
# (see below). This setting makes sure that the mean/std stats collected by the
|
| 164 |
+
# evaluation EnvRunners are NOT used for the training EnvRunners (unless you
|
| 165 |
+
# really want to mix these stats). It's normally a good idea to keep the stats
|
| 166 |
+
# collected during evaluation completely out of the training data (already for
|
| 167 |
+
# better reproducibility alone).
|
| 168 |
+
# .evaluation(
|
| 169 |
+
# evaluation_num_env_runners=1,
|
| 170 |
+
# evaluation_interval=1,
|
| 171 |
+
# evaluation_config={
|
| 172 |
+
# "explore": False,
|
| 173 |
+
# # Do NOT use the eval EnvRunners' ConnectorV2 states. Instead, before
|
| 174 |
+
# # each round of evaluation, broadcast the latest training
|
| 175 |
+
# # EnvRunnerGroup's ConnectorV2 states (merged from all training remote
|
| 176 |
+
# # EnvRunners) to the eval EnvRunnerGroup (and discard the eval
|
| 177 |
+
# # EnvRunners' stats).
|
| 178 |
+
# "use_worker_filter_stats": False,
|
| 179 |
+
# },
|
| 180 |
+
# )
|
| 181 |
+
)
|
| 182 |
+
|
| 183 |
+
# PPO specific settings.
|
| 184 |
+
if args.algo == "PPO":
|
| 185 |
+
base_config.training(
|
| 186 |
+
minibatch_size=64,
|
| 187 |
+
lambda_=0.1,
|
| 188 |
+
vf_clip_param=10.0,
|
| 189 |
+
)
|
| 190 |
+
|
| 191 |
+
# Add a simple multi-agent setup.
|
| 192 |
+
if args.num_agents > 0:
|
| 193 |
+
base_config.multi_agent(
|
| 194 |
+
policies={f"p{i}" for i in range(args.num_agents)},
|
| 195 |
+
policy_mapping_fn=lambda aid, *a, **kw: f"p{aid}",
|
| 196 |
+
)
|
| 197 |
+
|
| 198 |
+
run_rllib_example_script_experiment(base_config, args)
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/prev_actions_prev_rewards.py
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Example using a ConnectorV2 to add previous rewards/actions to an RLModule's input.
|
| 2 |
+
|
| 3 |
+
An RLlib Algorithm has 3 distinct connector pipelines:
|
| 4 |
+
- An env-to-module pipeline in an EnvRunner accepting a list of episodes and producing
|
| 5 |
+
a batch for an RLModule to compute actions (`forward_inference()` or
|
| 6 |
+
`forward_exploration()`).
|
| 7 |
+
- A module-to-env pipeline in an EnvRunner taking the RLModule's output and converting
|
| 8 |
+
it into an action readable by the environment.
|
| 9 |
+
- A learner connector pipeline on a Learner taking a list of episodes and producing
|
| 10 |
+
a batch for an RLModule to perform the training forward pass (`forward_train()`).
|
| 11 |
+
|
| 12 |
+
Each of these pipelines has a fixed set of default ConnectorV2 pieces that RLlib
|
| 13 |
+
adds/prepends to these pipelines in order to perform the most basic functionalities.
|
| 14 |
+
For example, RLlib adds the `AddObservationsFromEpisodesToBatch` ConnectorV2 into any
|
| 15 |
+
env-to-module pipeline to make sure the batch for computing actions contains - at the
|
| 16 |
+
minimum - the most recent observation.
|
| 17 |
+
|
| 18 |
+
On top of these default ConnectorV2 pieces, users can define their own ConnectorV2
|
| 19 |
+
pieces (or use the ones available already in RLlib) and add them to one of the 3
|
| 20 |
+
different pipelines described above, as required.
|
| 21 |
+
|
| 22 |
+
This example:
|
| 23 |
+
- shows how the `PrevActionsPrevRewards` ConnectorV2 piece can be added to the
|
| 24 |
+
env-to-module pipeline to extract previous rewards and/or actions from the ongoing
|
| 25 |
+
episodes.
|
| 26 |
+
- shows how this connector creates and wraps this new information (rewards and
|
| 27 |
+
actions) together with the original observations into the RLModule's input dict
|
| 28 |
+
under a new `gym.spaces.Dict` structure (for example, if your observation space
|
| 29 |
+
is `O=Box(shape=(3,))` and you add the most recent 1 reward, the new observation
|
| 30 |
+
space will be `Dict({"_original_obs": O, "prev_n_rewards": Box(shape=())})`).
|
| 31 |
+
- demonstrates how to use RLlib's `FlattenObservations` right after the
|
| 32 |
+
`PrevActionsPrevRewards` to flatten that new dict observation structure again into
|
| 33 |
+
a single 1D tensor.
|
| 34 |
+
- uses the StatelessCartPole environment, a CartPole-v1 derivative that's missing
|
| 35 |
+
both x-veloc and angle-veloc observation components and is therefore non-Markovian
|
| 36 |
+
(only partially observable). An LSTM default model is used for training. Adding
|
| 37 |
+
the additional context to the observations (for example, prev. actions) helps the
|
| 38 |
+
LSTM to more quickly learn in this environment.
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
How to run this script
|
| 42 |
+
----------------------
|
| 43 |
+
`python [script file name].py --enable-new-api-stack --n-prev-rewards=1 --n-prev-actions=1`
|
| 44 |
+
|
| 45 |
+
Use the `--num-frames` option to define the number of observations to framestack.
|
| 46 |
+
If you don't want to use Connectors to perform the framestacking, set the
|
| 47 |
+
`--use-gym-wrapper-framestacking` flag to perform framestacking already inside a
|
| 48 |
+
gymnasium observation wrapper. In this case though, be aware that the tensors being
|
| 49 |
+
sent through the network are `--num-frames` x larger than if you use the Connector
|
| 50 |
+
setup.
|
| 51 |
+
|
| 52 |
+
For debugging, use the following additional command line options
|
| 53 |
+
`--no-tune --num-env-runners=0`
|
| 54 |
+
which should allow you to set breakpoints anywhere in the RLlib code and
|
| 55 |
+
have the execution stop there for inspection and debugging.
|
| 56 |
+
|
| 57 |
+
For logging to your WandB account, use:
|
| 58 |
+
`--wandb-key=[your WandB API key] --wandb-project=[some project name]
|
| 59 |
+
--wandb-run-name=[optional: WandB run name (within the defined project)]`
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
Results to expect
|
| 63 |
+
-----------------
|
| 64 |
+
|
| 65 |
+
You should see something similar to this in your terminal output when running
|
| 66 |
+
this script as described above:
|
| 67 |
+
|
| 68 |
+
+---------------------+------------+-----------------+--------+------------------+
|
| 69 |
+
| Trial name | status | loc | iter | total time (s) |
|
| 70 |
+
| | | | | |
|
| 71 |
+
|---------------------+------------+-----------------+--------+------------------+
|
| 72 |
+
| PPO_env_0edd2_00000 | TERMINATED | 127.0.0.1:12632 | 17 | 42.6898 |
|
| 73 |
+
+---------------------+------------+-----------------+--------+------------------+
|
| 74 |
+
+------------------------+------------------------+------------------------+
|
| 75 |
+
| num_env_steps_sample | num_env_steps_traine | episode_return_mean |
|
| 76 |
+
| d_lifetime | d_lifetime | |
|
| 77 |
+
|------------------------+------------------------+------------------------|
|
| 78 |
+
| 68000 | 68000 | 205.22 |
|
| 79 |
+
+------------------------+------------------------+------------------------+
|
| 80 |
+
"""
|
| 81 |
+
from ray.rllib.algorithms.ppo import PPOConfig
|
| 82 |
+
from ray.rllib.connectors.env_to_module import (
|
| 83 |
+
FlattenObservations,
|
| 84 |
+
PrevActionsPrevRewards,
|
| 85 |
+
)
|
| 86 |
+
from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig
|
| 87 |
+
from ray.rllib.examples.envs.classes.stateless_cartpole import StatelessCartPole
|
| 88 |
+
from ray.rllib.examples.envs.classes.multi_agent import MultiAgentStatelessCartPole
|
| 89 |
+
from ray.rllib.utils.framework import try_import_torch
|
| 90 |
+
from ray.rllib.utils.test_utils import (
|
| 91 |
+
add_rllib_example_script_args,
|
| 92 |
+
run_rllib_example_script_experiment,
|
| 93 |
+
)
|
| 94 |
+
from ray.tune import register_env
|
| 95 |
+
|
| 96 |
+
torch, nn = try_import_torch()
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
parser = add_rllib_example_script_args(
|
| 100 |
+
default_reward=200.0, default_timesteps=1000000, default_iters=2000
|
| 101 |
+
)
|
| 102 |
+
parser.set_defaults(enable_new_api_stack=True)
|
| 103 |
+
parser.add_argument("--n-prev-rewards", type=int, default=1)
|
| 104 |
+
parser.add_argument("--n-prev-actions", type=int, default=1)
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
if __name__ == "__main__":
|
| 108 |
+
args = parser.parse_args()
|
| 109 |
+
|
| 110 |
+
# Define our custom connector pipelines.
|
| 111 |
+
def _env_to_module(env):
|
| 112 |
+
# Create the env-to-module connector pipeline.
|
| 113 |
+
return [
|
| 114 |
+
PrevActionsPrevRewards(
|
| 115 |
+
multi_agent=args.num_agents > 0,
|
| 116 |
+
n_prev_rewards=args.n_prev_rewards,
|
| 117 |
+
n_prev_actions=args.n_prev_actions,
|
| 118 |
+
),
|
| 119 |
+
FlattenObservations(multi_agent=args.num_agents > 0),
|
| 120 |
+
]
|
| 121 |
+
|
| 122 |
+
# Register our environment with tune.
|
| 123 |
+
if args.num_agents > 0:
|
| 124 |
+
register_env(
|
| 125 |
+
"env",
|
| 126 |
+
lambda _: MultiAgentStatelessCartPole(
|
| 127 |
+
config={"num_agents": args.num_agents}
|
| 128 |
+
),
|
| 129 |
+
)
|
| 130 |
+
else:
|
| 131 |
+
register_env("env", lambda _: StatelessCartPole())
|
| 132 |
+
|
| 133 |
+
config = (
|
| 134 |
+
PPOConfig()
|
| 135 |
+
.environment("env")
|
| 136 |
+
.env_runners(env_to_module_connector=_env_to_module)
|
| 137 |
+
.training(
|
| 138 |
+
num_epochs=6,
|
| 139 |
+
lr=0.0003,
|
| 140 |
+
train_batch_size=4000,
|
| 141 |
+
vf_loss_coeff=0.01,
|
| 142 |
+
)
|
| 143 |
+
.rl_module(
|
| 144 |
+
model_config=DefaultModelConfig(
|
| 145 |
+
use_lstm=True,
|
| 146 |
+
max_seq_len=20,
|
| 147 |
+
fcnet_hiddens=[32],
|
| 148 |
+
fcnet_activation="linear",
|
| 149 |
+
fcnet_kernel_initializer=nn.init.xavier_uniform_,
|
| 150 |
+
fcnet_bias_initializer=nn.init.constant_,
|
| 151 |
+
fcnet_bias_initializer_kwargs={"val": 0.0},
|
| 152 |
+
vf_share_layers=True,
|
| 153 |
+
),
|
| 154 |
+
)
|
| 155 |
+
)
|
| 156 |
+
|
| 157 |
+
# Add a simple multi-agent setup.
|
| 158 |
+
if args.num_agents > 0:
|
| 159 |
+
config = config.multi_agent(
|
| 160 |
+
policies={f"p{i}" for i in range(args.num_agents)},
|
| 161 |
+
policy_mapping_fn=lambda aid, *a, **kw: f"p{aid}",
|
| 162 |
+
)
|
| 163 |
+
|
| 164 |
+
run_rllib_example_script_experiment(config, args)
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/evaluation/custom_evaluation.py
ADDED
|
@@ -0,0 +1,230 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Example of customizing the evaluation procedure for an RLlib Algorithm.
|
| 2 |
+
|
| 3 |
+
Note that you should only provide a custom eval function in case the already
|
| 4 |
+
built-in eval options are not sufficient. Normally, though, RLlib's eval utilities
|
| 5 |
+
that come with each Algorithm are enough to properly evaluate the learning progress
|
| 6 |
+
of your Algorithm.
|
| 7 |
+
|
| 8 |
+
This script uses the SimpleCorridor environment, a simple 1D gridworld, in which
|
| 9 |
+
the agent can only walk left (action=0) or right (action=1). The goal state is located
|
| 10 |
+
at the end of the (1D) corridor. The env exposes an API to change the length of the
|
| 11 |
+
corridor on-the-fly. We use this API here to extend the size of the corridor for the
|
| 12 |
+
evaluation runs.
|
| 13 |
+
|
| 14 |
+
For demonstration purposes only, we define a simple custom evaluation method that does
|
| 15 |
+
the following:
|
| 16 |
+
- It changes the corridor length of all environments used on the evaluation EnvRunners.
|
| 17 |
+
- It runs a defined number of episodes for evaluation purposes.
|
| 18 |
+
- It collects the metrics from those runs, summarizes these metrics and returns them.
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
How to run this script
|
| 22 |
+
----------------------
|
| 23 |
+
`python [script file name].py --enable-new-api-stack`
|
| 24 |
+
|
| 25 |
+
You can switch off custom evaluation (and use RLlib's default evaluation procedure)
|
| 26 |
+
with the `--no-custom-eval` flag.
|
| 27 |
+
|
| 28 |
+
You can switch on parallel evaluation to training using the
|
| 29 |
+
`--evaluation-parallel-to-training` flag. See this example script here:
|
| 30 |
+
https://github.com/ray-project/ray/blob/master/rllib/examples/evaluation/evaluation_parallel_to_training.py # noqa
|
| 31 |
+
for more details on running evaluation parallel to training.
|
| 32 |
+
|
| 33 |
+
For debugging, use the following additional command line options
|
| 34 |
+
`--no-tune --num-env-runners=0`
|
| 35 |
+
which should allow you to set breakpoints anywhere in the RLlib code and
|
| 36 |
+
have the execution stop there for inspection and debugging.
|
| 37 |
+
|
| 38 |
+
For logging to your WandB account, use:
|
| 39 |
+
`--wandb-key=[your WandB API key] --wandb-project=[some project name]
|
| 40 |
+
--wandb-run-name=[optional: WandB run name (within the defined project)]`
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
Results to expect
|
| 44 |
+
-----------------
|
| 45 |
+
You should see the following (or very similar) console output when running this script.
|
| 46 |
+
Note that for each iteration, due to the definition of our custom evaluation function,
|
| 47 |
+
we run 3 evaluation rounds per single training round.
|
| 48 |
+
|
| 49 |
+
...
|
| 50 |
+
Training iteration 1 -> evaluation round 0
|
| 51 |
+
Training iteration 1 -> evaluation round 1
|
| 52 |
+
Training iteration 1 -> evaluation round 2
|
| 53 |
+
...
|
| 54 |
+
...
|
| 55 |
+
+--------------------------------+------------+-----------------+--------+
|
| 56 |
+
| Trial name | status | loc | iter |
|
| 57 |
+
|--------------------------------+------------+-----------------+--------+
|
| 58 |
+
| PPO_SimpleCorridor_06582_00000 | TERMINATED | 127.0.0.1:69905 | 4 |
|
| 59 |
+
+--------------------------------+------------+-----------------+--------+
|
| 60 |
+
+------------------+-------+----------+--------------------+
|
| 61 |
+
| total time (s) | ts | reward | episode_len_mean |
|
| 62 |
+
|------------------+-------+----------+--------------------|
|
| 63 |
+
| 26.1973 | 16000 | 0.872034 | 13.7966 |
|
| 64 |
+
+------------------+-------+----------+--------------------+
|
| 65 |
+
"""
|
| 66 |
+
from typing import Tuple
|
| 67 |
+
|
| 68 |
+
from ray.air.constants import TRAINING_ITERATION
|
| 69 |
+
from ray.rllib.algorithms.algorithm import Algorithm
|
| 70 |
+
from ray.rllib.algorithms.algorithm_config import AlgorithmConfig
|
| 71 |
+
from ray.rllib.env.env_runner_group import EnvRunnerGroup
|
| 72 |
+
from ray.rllib.examples.envs.classes.simple_corridor import SimpleCorridor
|
| 73 |
+
from ray.rllib.utils.metrics import (
|
| 74 |
+
ENV_RUNNER_RESULTS,
|
| 75 |
+
EVALUATION_RESULTS,
|
| 76 |
+
EPISODE_RETURN_MEAN,
|
| 77 |
+
NUM_ENV_STEPS_SAMPLED_LIFETIME,
|
| 78 |
+
)
|
| 79 |
+
from ray.rllib.utils.test_utils import (
|
| 80 |
+
add_rllib_example_script_args,
|
| 81 |
+
run_rllib_example_script_experiment,
|
| 82 |
+
)
|
| 83 |
+
from ray.rllib.utils.typing import ResultDict
|
| 84 |
+
from ray.tune.registry import get_trainable_cls
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
parser = add_rllib_example_script_args(
|
| 88 |
+
default_iters=50, default_reward=0.7, default_timesteps=50000
|
| 89 |
+
)
|
| 90 |
+
parser.add_argument("--no-custom-eval", action="store_true")
|
| 91 |
+
parser.add_argument("--corridor-length-training", type=int, default=10)
|
| 92 |
+
parser.add_argument("--corridor-length-eval-worker-1", type=int, default=20)
|
| 93 |
+
parser.add_argument("--corridor-length-eval-worker-2", type=int, default=30)
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
def custom_eval_function(
|
| 97 |
+
algorithm: Algorithm,
|
| 98 |
+
eval_workers: EnvRunnerGroup,
|
| 99 |
+
) -> Tuple[ResultDict, int, int]:
|
| 100 |
+
"""Example of a custom evaluation function.
|
| 101 |
+
|
| 102 |
+
Args:
|
| 103 |
+
algorithm: Algorithm instance to evaluate.
|
| 104 |
+
eval_workers: Evaluation EnvRunnerGroup.
|
| 105 |
+
|
| 106 |
+
Returns:
|
| 107 |
+
Tuple of: evaluation metrics dict, env steps sampled, agent steps sampled.
|
| 108 |
+
"""
|
| 109 |
+
# Set different env settings for each (eval) EnvRunner. Here we use the EnvRunner's
|
| 110 |
+
# `worker_index` property to figure out the actual length.
|
| 111 |
+
# Loop through all workers and all sub-envs (gym.Env) on each worker and call the
|
| 112 |
+
# `set_corridor_length` method on these.
|
| 113 |
+
eval_workers.foreach_env_runner(
|
| 114 |
+
func=lambda worker: (
|
| 115 |
+
env.unwrapped.set_corridor_length(
|
| 116 |
+
args.corridor_length_eval_worker_1
|
| 117 |
+
if worker.worker_index == 1
|
| 118 |
+
else args.corridor_length_eval_worker_2
|
| 119 |
+
)
|
| 120 |
+
for env in worker.env.unwrapped.envs
|
| 121 |
+
)
|
| 122 |
+
)
|
| 123 |
+
|
| 124 |
+
# Collect metrics results collected by eval workers in this list for later
|
| 125 |
+
# processing.
|
| 126 |
+
env_runner_metrics = []
|
| 127 |
+
sampled_episodes = []
|
| 128 |
+
# For demonstration purposes, run through some number of evaluation
|
| 129 |
+
# rounds within this one call. Note that this function is called once per
|
| 130 |
+
# training iteration (`Algorithm.train()` call) OR once per `Algorithm.evaluate()`
|
| 131 |
+
# (which can be called manually by the user).
|
| 132 |
+
for i in range(3):
|
| 133 |
+
print(f"Training iteration {algorithm.iteration} -> evaluation round {i}")
|
| 134 |
+
# Sample episodes from the EnvRunners AND have them return the episodes
|
| 135 |
+
# together with the thus collected metrics.
|
| 136 |
+
episodes_and_metrics_all_env_runners = eval_workers.foreach_env_runner(
|
| 137 |
+
# Return both the sampled episodes (needed below to count env and agent
|
| 138 |
+
# steps) and the collected metrics.
|
| 139 |
+
func=lambda worker: (worker.sample(), worker.get_metrics()),
|
| 140 |
+
local_env_runner=False,
|
| 141 |
+
)
|
| 142 |
+
sampled_episodes.extend(
|
| 143 |
+
eps
|
| 144 |
+
for eps_and_mtrcs in episodes_and_metrics_all_env_runners
|
| 145 |
+
for eps in eps_and_mtrcs[0]
|
| 146 |
+
)
|
| 147 |
+
env_runner_metrics.extend(
|
| 148 |
+
eps_and_mtrcs[1] for eps_and_mtrcs in episodes_and_metrics_all_env_runners
|
| 149 |
+
)
|
| 150 |
+
|
| 151 |
+
# You can compute metrics from the episodes manually, or use the Algorithm's
|
| 152 |
+
# convenient MetricsLogger to store all evaluation metrics inside the main
|
| 153 |
+
# algo.
|
| 154 |
+
algorithm.metrics.merge_and_log_n_dicts(
|
| 155 |
+
env_runner_metrics, key=(EVALUATION_RESULTS, ENV_RUNNER_RESULTS)
|
| 156 |
+
)
|
| 157 |
+
eval_results = algorithm.metrics.reduce(
|
| 158 |
+
key=(EVALUATION_RESULTS, ENV_RUNNER_RESULTS)
|
| 159 |
+
)
|
| 160 |
+
# Alternatively, you could manually reduce over the n returned `env_runner_metrics`
|
| 161 |
+
# dicts, but this would be much harder as you might not know, which metrics
|
| 162 |
+
# to sum up, which ones to average over, etc..
|
| 163 |
+
|
| 164 |
+
# Compute env and agent steps from sampled episodes.
|
| 165 |
+
env_steps = sum(eps.env_steps() for eps in sampled_episodes)
|
| 166 |
+
agent_steps = sum(eps.agent_steps() for eps in sampled_episodes)
|
| 167 |
+
|
| 168 |
+
return eval_results, env_steps, agent_steps
|
| 169 |
+
|
| 170 |
+
|
| 171 |
+
if __name__ == "__main__":
|
| 172 |
+
args = parser.parse_args()
|
| 173 |
+
args.local_mode = True
|
| 174 |
+
base_config = (
|
| 175 |
+
get_trainable_cls(args.algo)
|
| 176 |
+
.get_default_config()
|
| 177 |
+
# For training, we use a corridor length of n. For evaluation, we use different
|
| 178 |
+
# values, depending on the eval worker index (1 or 2).
|
| 179 |
+
.environment(
|
| 180 |
+
SimpleCorridor,
|
| 181 |
+
env_config={"corridor_length": args.corridor_length_training},
|
| 182 |
+
)
|
| 183 |
+
.evaluation(
|
| 184 |
+
# Do we use the custom eval function defined above?
|
| 185 |
+
custom_evaluation_function=(
|
| 186 |
+
None if args.no_custom_eval else custom_eval_function
|
| 187 |
+
),
|
| 188 |
+
# Number of eval EnvRunners to use.
|
| 189 |
+
evaluation_num_env_runners=2,
|
| 190 |
+
# Enable evaluation, once per training iteration.
|
| 191 |
+
evaluation_interval=1,
|
| 192 |
+
# Run 10 episodes each time evaluation runs (OR "auto" if parallel to
|
| 193 |
+
# training).
|
| 194 |
+
evaluation_duration="auto" if args.evaluation_parallel_to_training else 10,
|
| 195 |
+
# Evaluate in parallel to training?
|
| 196 |
+
evaluation_parallel_to_training=args.evaluation_parallel_to_training,
|
| 197 |
+
# Override the env settings for the eval workers.
|
| 198 |
+
# Note, though, that this setting here is only used in case --no-custom-eval
|
| 199 |
+
# is set, b/c in case the custom eval function IS used, we override the
|
| 200 |
+
# length of the eval environments in that custom function, so this setting
|
| 201 |
+
# here is simply ignored.
|
| 202 |
+
evaluation_config=AlgorithmConfig.overrides(
|
| 203 |
+
env_config={"corridor_length": args.corridor_length_training * 2},
|
| 204 |
+
# TODO (sven): Add support for window=float(inf) and reduce=mean for
|
| 205 |
+
# evaluation episode_return_mean reductions (identical to old stack
|
| 206 |
+
# behavior, which does NOT use a window (100 by default) to reduce
|
| 207 |
+
# eval episode returns.
|
| 208 |
+
metrics_num_episodes_for_smoothing=5,
|
| 209 |
+
),
|
| 210 |
+
)
|
| 211 |
+
)
|
| 212 |
+
|
| 213 |
+
stop = {
|
| 214 |
+
TRAINING_ITERATION: args.stop_iters,
|
| 215 |
+
f"{EVALUATION_RESULTS}/{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": (
|
| 216 |
+
args.stop_reward
|
| 217 |
+
),
|
| 218 |
+
NUM_ENV_STEPS_SAMPLED_LIFETIME: args.stop_timesteps,
|
| 219 |
+
}
|
| 220 |
+
|
| 221 |
+
run_rllib_example_script_experiment(
|
| 222 |
+
base_config,
|
| 223 |
+
args,
|
| 224 |
+
stop=stop,
|
| 225 |
+
success_metric={
|
| 226 |
+
f"{EVALUATION_RESULTS}/{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": (
|
| 227 |
+
args.stop_reward
|
| 228 |
+
),
|
| 229 |
+
},
|
| 230 |
+
)
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/__init__.py
ADDED
|
File without changes
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (196 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/__pycache__/float16_training_and_inference.cpython-311.pyc
ADDED
|
Binary file (12.6 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/__pycache__/fractional_gpus_per_learner.cpython-311.pyc
ADDED
|
Binary file (5.57 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/__pycache__/gpus_on_env_runners.cpython-311.pyc
ADDED
|
Binary file (3.66 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/__pycache__/mixed_precision_training_float16_inference.cpython-311.pyc
ADDED
|
Binary file (8.71 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/float16_training_and_inference.py
ADDED
|
@@ -0,0 +1,250 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Example of using float16 precision for training and inference.
|
| 2 |
+
|
| 3 |
+
This example:
|
| 4 |
+
- shows how to write a custom callback for RLlib to convert all RLModules
|
| 5 |
+
(on the EnvRunners and Learners) to float16 precision.
|
| 6 |
+
- shows how to write a custom env-to-module ConnectorV2 piece to convert all
|
| 7 |
+
observations and rewards in the collected trajectories to float16 (numpy) arrays.
|
| 8 |
+
- shows how to write a custom grad scaler for torch that is necessary to stabilize
|
| 9 |
+
learning with float16 weight matrices and gradients. This custom scaler behaves
|
| 10 |
+
exactly like the torch built-in `torch.amp.GradScaler` but also works for float16
|
| 11 |
+
gradients (which the torch built-in one doesn't).
|
| 12 |
+
- shows how to write a custom TorchLearner to change the epsilon setting (to the
|
| 13 |
+
much larger 1e-4 to stabilize learning) on the default optimizer (Adam) registered
|
| 14 |
+
for each RLModule.
|
| 15 |
+
- demonstrates how to plug in all the above custom components into an
|
| 16 |
+
`AlgorithmConfig` instance and start training (and inference) with float16
|
| 17 |
+
precision.
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
How to run this script
|
| 21 |
+
----------------------
|
| 22 |
+
`python [script file name].py --enable-new-api-stack`
|
| 23 |
+
|
| 24 |
+
For debugging, use the following additional command line options
|
| 25 |
+
`--no-tune --num-env-runners=0`
|
| 26 |
+
which should allow you to set breakpoints anywhere in the RLlib code and
|
| 27 |
+
have the execution stop there for inspection and debugging.
|
| 28 |
+
|
| 29 |
+
For logging to your WandB account, use:
|
| 30 |
+
`--wandb-key=[your WandB API key] --wandb-project=[some project name]
|
| 31 |
+
--wandb-run-name=[optional: WandB run name (within the defined project)]`
|
| 32 |
+
|
| 33 |
+
You can visualize experiment results in ~/ray_results using TensorBoard.
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
Results to expect
|
| 37 |
+
-----------------
|
| 38 |
+
You should see something similar to the following on your terminal, when running this
|
| 39 |
+
script with the above recommended options:
|
| 40 |
+
|
| 41 |
+
+-----------------------------+------------+-----------------+--------+
|
| 42 |
+
| Trial name | status | loc | iter |
|
| 43 |
+
| | | | |
|
| 44 |
+
|-----------------------------+------------+-----------------+--------+
|
| 45 |
+
| PPO_CartPole-v1_437ee_00000 | TERMINATED | 127.0.0.1:81045 | 6 |
|
| 46 |
+
+-----------------------------+------------+-----------------+--------+
|
| 47 |
+
+------------------+------------------------+------------------------+
|
| 48 |
+
| total time (s) | episode_return_mean | num_episodes_lifetime |
|
| 49 |
+
| | | |
|
| 50 |
+
|------------------+------------------------+------------------------+
|
| 51 |
+
| 71.3123 | 153.79 | 358 |
|
| 52 |
+
+------------------+------------------------+------------------------+
|
| 53 |
+
"""
|
| 54 |
+
import gymnasium as gym
|
| 55 |
+
import numpy as np
|
| 56 |
+
import torch
|
| 57 |
+
|
| 58 |
+
from ray.rllib.algorithms.algorithm import Algorithm
|
| 59 |
+
from ray.rllib.algorithms.ppo.torch.ppo_torch_learner import PPOTorchLearner
|
| 60 |
+
from ray.rllib.connectors.connector_v2 import ConnectorV2
|
| 61 |
+
from ray.rllib.core.learner.torch.torch_learner import TorchLearner
|
| 62 |
+
from ray.rllib.utils.annotations import override
|
| 63 |
+
from ray.rllib.utils.test_utils import (
|
| 64 |
+
add_rllib_example_script_args,
|
| 65 |
+
run_rllib_example_script_experiment,
|
| 66 |
+
)
|
| 67 |
+
from ray.tune.registry import get_trainable_cls
|
| 68 |
+
|
| 69 |
+
# Command line parser with the common RLlib example-script options; the
# stopping criteria default to 50 iterations, a return of 150.0, or 100k
# timesteps.
parser = add_rllib_example_script_args(
    default_iters=50,
    default_reward=150.0,
    default_timesteps=100000,
)
# Default this script to the new API stack.
parser.set_defaults(enable_new_api_stack=True)
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def on_algorithm_init(
    algorithm: Algorithm,
    **kwargs,
) -> None:
    """Callback converting every RLModule of `algorithm` to float16.

    Calls `half()` on all sub-modules held by the Learners and on the
    RLModule of each EnvRunner (training and, if present, evaluation).

    Args:
        algorithm: The freshly initialized Algorithm whose modules get
            converted.
        **kwargs: Ignored; accepted for callback-signature compatibility.
    """

    def _half_learner_modules(learner):
        # Convert each sub-module held by this Learner.
        return learner.module.foreach_module(lambda mid, mod: mod.half())

    def _half_env_runner_module(env_runner):
        # Assumes the EnvRunner holds a single RLModule.
        return env_runner.module.half()

    # Learner side.
    algorithm.learner_group.foreach_learner(_half_learner_modules)

    # EnvRunner side (sampling).
    algorithm.env_runner_group.foreach_env_runner(_half_env_runner_module)

    # EnvRunner side (evaluation), only if such a group exists.
    eval_group = algorithm.eval_env_runner_group
    if eval_group:
        eval_group.foreach_env_runner(_half_env_runner_module)
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
class WriteObsAndRewardsAsFloat16(ConnectorV2):
    """ConnectorV2 piece that rewrites observations and rewards as float16.

    The most recent observation (and, once the episode has at least one
    step, the most recent reward) of each single-agent episode is cast to
    float16 and written back into the episode.

    The same effect could be achieved with a gymnasium.Wrapper around the
    environment.
    """

    def recompute_output_observation_space(
        self,
        input_observation_space,
        input_action_space,
    ):
        """Returns a float16 Box with the input space's bounds and shape."""
        low = input_observation_space.low.astype(np.float16)
        high = input_observation_space.high.astype(np.float16)
        return gym.spaces.Box(
            low,
            high,
            input_observation_space.shape,
            np.float16,
        )

    def __call__(self, *, rl_module, batch, episodes, **kwargs):
        """Casts the latest obs/reward of every episode in place.

        Returns:
            The unchanged `batch`; only the episodes are modified.
        """
        for episode in self.single_agent_episode_iterator(episodes):
            # Overwrite the latest observation with its float16 version.
            latest_obs = episode.get_observations(-1)
            episode.set_observations(
                new_data=latest_obs.astype(np.float16), at_indices=-1
            )
            # A reward only exists after at least one env step.
            if len(episode) > 0:
                latest_reward = episode.get_rewards(-1).astype(np.float16)
                episode.set_rewards(new_data=latest_reward, at_indices=-1)
        return batch
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
class Float16GradScaler:
    """Custom grad scaler for `TorchLearner`.

    This class is utilizing the experimental support of the `TorchLearner`
    for loss/gradient scaling (analogous to how a `torch.amp.GradScaler`
    would work).

    TorchLearner performs the following steps using this class (`scaler`):
    - loss_per_module = TorchLearner.compute_losses()
    - for L in loss_per_module: L = scaler.scale(L)
    - grads = TorchLearner.compute_gradients()  # L.backward() on scaled loss
    - TorchLearner.apply_gradients(grads):
        for optim in optimizers:
            scaler.step(optim)  # <- grads should get unscaled
            scaler.update()  # <- update scaling factor
    """

    def __init__(
        self,
        init_scale=1000.0,
        growth_factor=2.0,
        backoff_factor=0.5,
        growth_interval=2000,
    ):
        """Initializes a Float16GradScaler.

        Args:
            init_scale: Initial factor the loss is multiplied with.
            growth_factor: Factor applied to the scale after
                `growth_interval` consecutive clean `update()` calls.
            backoff_factor: Factor applied to the scale after an inf/NaN
                gradient has been found.
            growth_interval: Number of consecutive clean `update()` calls
                required before the scale is grown.
        """
        self._scale = init_scale
        self.growth_factor = growth_factor
        self.backoff_factor = backoff_factor
        self.growth_interval = growth_interval
        self._found_inf_or_nan = False
        self.steps_since_growth = 0

    def scale(self, loss):
        """Returns `loss` multiplied by the current scale."""
        return loss * self._scale

    def get_scale(self):
        """Returns the current scale factor."""
        return self._scale

    def step(self, optimizer):
        """Unscales all gradients of `optimizer` and steps it if they are finite.

        Every available gradient is divided (in place) by the current scale
        before the finiteness check, so gradients are never left partially
        unscaled. The optimizer step is skipped if an inf/NaN gradient was
        found now or earlier in the same cycle (the flag is only reset by
        `update()`).

        Args:
            optimizer: The torch optimizer whose parameters' gradients were
                computed from a loss returned by `scale()`.
        """
        # Unscale the gradients of all parameters that have one.
        grads = []
        for group in optimizer.param_groups:
            for param in group["params"]:
                if param.grad is not None:
                    param.grad.div_(self._scale)
                    grads.append(param.grad)

        # `isfinite` is False for both inf and NaN entries.
        if not all(bool(torch.isfinite(grad).all()) for grad in grads):
            self._found_inf_or_nan = True

        # Only step if no inf/NaN grad found.
        if not self._found_inf_or_nan:
            optimizer.step()

    def update(self):
        """Adapts the scale to the outcome of the preceding `step()` call(s)."""
        if self._found_inf_or_nan:
            # Gradients overflowed: reduce the scale and restart counting.
            self._scale *= self.backoff_factor
            self.steps_since_growth = 0
        else:
            # Increase the scale after a set number of steps without inf/NaN.
            self.steps_since_growth += 1
            if self.steps_since_growth >= self.growth_interval:
                self._scale *= self.growth_factor
                self.steps_since_growth = 0
        # Reset inf/NaN flag.
        self._found_inf_or_nan = False
|
| 192 |
+
|
| 193 |
+
|
| 194 |
+
class LargeEpsAdamTorchLearner(PPOTorchLearner):
    """PPO TorchLearner that swaps the default optimizer for Adam with eps=1e-4."""

    @override(TorchLearner)
    def configure_optimizers_for_module(self, module_id, config):
        """Registers an Adam optimizer with a large epsilon for `module_id`.

        Args:
            module_id: ID of the module whose parameters get optimized.
            config: Config object providing the learning rate (`config.lr`).
        """
        params = list(self._module[module_id].parameters())

        # A larger-than-default eps improves float16 stability.
        adam = torch.optim.Adam(params, eps=1e-4)

        # No optimizer name is passed, so the optimizer is registered under
        # the default optimizer name for this `module_id`. The learning rate
        # (or schedule) is handed to RLlib via `lr_or_lr_schedule`; if that
        # were left at None, a fixed learning rate should be passed to the
        # Adam constructor instead.
        self.register_optimizer(
            module_id=module_id,
            optimizer=adam,
            params=params,
            lr_or_lr_schedule=config.lr,
        )
|
| 215 |
+
|
| 216 |
+
|
| 217 |
+
if __name__ == "__main__":
    args = parser.parse_args()

    # Default config of the chosen algorithm, running on CartPole-v1.
    base_config = get_trainable_cls(args.algo).get_default_config()
    base_config = base_config.environment("CartPole-v1")

    # Custom callback (on_algorithm_init) turning all RLModules into float16
    # models.
    base_config = base_config.callbacks(on_algorithm_init=on_algorithm_init)

    # Custom loss scaler class to stabilize gradient computations (the loss is
    # scaled, then the gradients are unscaled before being applied). This uses
    # the built-in, experimental feature of TorchLearner.
    base_config = base_config.experimental(
        _torch_grad_scaler_class=Float16GradScaler
    )

    # Custom env-to-module ConnectorV2 piece converting all observations and
    # rewards in the episodes (permanently) to float16.
    base_config = base_config.env_runners(
        env_to_module_connector=lambda env: WriteObsAndRewardsAsFloat16()
    )

    base_config = base_config.training(
        # Custom TorchLearner using a much larger, stabilizing epsilon on the
        # Adam optimizer.
        learner_class=LargeEpsAdamTorchLearner,
        # No grad clipping: the custom grad scaler already detects inf/nan
        # gradients (see the `step` method of `Float16GradScaler`).
        grad_clip=None,
        # Typical CartPole-v1 hyperparams known to work well:
        gamma=0.99,
        lr=0.0003,
        num_epochs=6,
        vf_loss_coeff=0.01,
        use_kl_loss=True,
    )

    run_rllib_example_script_experiment(base_config, args)
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/fractional_gpus_per_learner.py
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Example of using fractional GPUs (< 1.0) per Learner worker.
|
| 2 |
+
|
| 3 |
+
The number of GPUs required, just for learning (excluding those maybe needed on your
|
| 4 |
+
EnvRunners, if applicable) can be computed by:
|
| 5 |
+
`num_gpus = config.num_learners * config.num_gpus_per_learner`
|
| 6 |
+
|
| 7 |
+
This example:
|
| 8 |
+
- shows how to set up an Algorithm that uses one or more Learner workers ...
|
| 9 |
+
- ... and how to assign a fractional (< 1.0) number of GPUs to each of these Learners.
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
How to run this script
|
| 13 |
+
----------------------
|
| 14 |
+
`python [script file name].py --enable-new-api-stack --num-learners=
|
| 15 |
+
[number of Learners, e.g. 1] --num-gpus-per-learner [some fraction <1.0]`
|
| 16 |
+
|
| 17 |
+
The following command line combinations have been tested on a 4 NVIDIA T4 GPUs (16 vCPU)
|
| 18 |
+
machine.
|
| 19 |
+
Note that for each run, 4 tune trials will be setup; see tune.grid_search over 4
|
| 20 |
+
learning rates in the `base_config` below:
|
| 21 |
+
1) --num-learners=1 --num-gpus-per-learner=0.5 (2.0 GPUs used).
|
| 22 |
+
2) --num-learners=1 --num-gpus-per-learner=0.3 (1.2 GPUs used).
|
| 23 |
+
3) --num-learners=1 --num-gpus-per-learner=0.25 (1.0 GPU used).
|
| 24 |
+
4) --num-learners=2 --num-gpus-per-learner=1 (8 GPUs used).
|
| 25 |
+
5) non-sensical setting: --num-learners=2 --num-gpus-per-learner=0.5 (expect an
|
| 26 |
+
NCCL-related error due to the fact that torch will try to perform DDP sharding,
|
| 27 |
+
but notices that the shards sit on the same GPU).
|
| 28 |
+
|
| 29 |
+
For debugging, use the following additional command line options
|
| 30 |
+
`--no-tune --num-env-runners=0`
|
| 31 |
+
which should allow you to set breakpoints anywhere in the RLlib code and
|
| 32 |
+
have the execution stop there for inspection and debugging.
|
| 33 |
+
|
| 34 |
+
Note that the shown GPU settings in this script also work in case you are not
|
| 35 |
+
running via tune, but instead are using the `--no-tune` command line option.
|
| 36 |
+
|
| 37 |
+
For logging to your WandB account, use:
|
| 38 |
+
`--wandb-key=[your WandB API key] --wandb-project=[some project name]
|
| 39 |
+
--wandb-run-name=[optional: WandB run name (within the defined project)]`
|
| 40 |
+
|
| 41 |
+
You can visualize experiment results in ~/ray_results using TensorBoard.
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
Results to expect
|
| 45 |
+
-----------------
|
| 46 |
+
In the console output, you can see that only fractional GPUs are being used by RLlib:
|
| 47 |
+
|
| 48 |
+
== Status ==
|
| 49 |
+
...
|
| 50 |
+
Logical resource usage: 12.0/16 CPUs, 1.0/4 GPUs (...)
|
| 51 |
+
...
|
| 52 |
+
Number of trials: 4/4 (4 RUNNING)
|
| 53 |
+
|
| 54 |
+
The final output should look something like this:
|
| 55 |
+
+-----------------------------+------------+-----------------+--------+--------+
|
| 56 |
+
| Trial name | status | loc | lr | iter |
|
| 57 |
+
| | | | | |
|
| 58 |
+
|-----------------------------+------------+-----------------+--------+--------+
|
| 59 |
+
| PPO_CartPole-v1_7104b_00000 | TERMINATED | 10.0.0.39:31197 | 0.005 | 10 |
|
| 60 |
+
| PPO_CartPole-v1_7104b_00001 | TERMINATED | 10.0.0.39:31202 | 0.003 | 11 |
|
| 61 |
+
| PPO_CartPole-v1_7104b_00002 | TERMINATED | 10.0.0.39:31203 | 0.001 | 10 |
|
| 62 |
+
| PPO_CartPole-v1_7104b_00003 | TERMINATED | 10.0.0.39:31204 | 0.0001 | 11 |
|
| 63 |
+
+-----------------------------+------------+-----------------+--------+--------+
|
| 64 |
+
|
| 65 |
+
+----------------+----------------------+----------------------+----------------------+
|
| 66 |
+
| total time (s) | num_env_steps_sample | num_env_steps_traine | num_episodes_lifetim |
|
| 67 |
+
| | d_lifetime | d_lifetime | e |
|
| 68 |
+
|----------------+----------------------+----------------------+----------------------|
|
| 69 |
+
| 101.002 | 40000 | 40000 | 346 |
|
| 70 |
+
| 110.03 | 44000 | 44000 | 395 |
|
| 71 |
+
| 101.171 | 40000 | 40000 | 328 |
|
| 72 |
+
| 110.091 | 44000 | 44000 | 478 |
|
| 73 |
+
+----------------+----------------------+----------------------+----------------------+
|
| 74 |
+
"""
|
| 75 |
+
from ray import tune
|
| 76 |
+
from ray.rllib.utils.test_utils import (
|
| 77 |
+
add_rllib_example_script_args,
|
| 78 |
+
run_rllib_example_script_experiment,
|
| 79 |
+
)
|
| 80 |
+
from ray.tune.registry import get_trainable_cls
|
| 81 |
+
|
| 82 |
+
# Command line parser with the common RLlib example-script options; the
# stopping criteria default to 50 iterations, a return of 180, or 100k
# timesteps.
parser = add_rllib_example_script_args(
    default_iters=50,
    default_reward=180,
    default_timesteps=100000,
)
# Default to the new API stack and two EnvRunners.
parser.set_defaults(enable_new_api_stack=True, num_env_runners=2)
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
if __name__ == "__main__":
    args = parser.parse_args()

    base_config = get_trainable_cls(args.algo).get_default_config()

    # This script only works on the new API stack.
    base_config = base_config.api_stack(
        enable_rl_module_and_learner=True,
        enable_env_runner_and_connector_v2=True,
    )
    base_config = base_config.environment("CartPole-v1")

    # EnvRunner scaling.
    base_config = base_config.env_runners(num_env_runners=args.num_env_runners)

    # Learner scaling.
    base_config = base_config.learners(
        # Number of Learner workers. With more than 1 GPU, set this to the
        # number of GPUs available.
        num_learners=args.num_learners,
        # GPUs per Learner. With more than 1 GPU or only one Learner, set
        # this to 1; otherwise, set it to some fraction.
        num_gpus_per_learner=args.num_gpus_per_learner,
    )

    # Grid search over 4 learning rates -> 4 tune trials altogether.
    learning_rates = [0.005, 0.003, 0.001, 0.0001]
    base_config = base_config.training(lr=tune.grid_search(learning_rates))

    run_rllib_example_script_experiment(base_config, args, keep_config=True)
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/gpus_on_env_runners.py
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Example of using GPUs on the EnvRunners (b/c Env and/or RLModule require these).
|
| 2 |
+
|
| 3 |
+
The number of GPUs required, just for your EnvRunners (excluding those needed for
|
| 4 |
+
training your RLModule) can be computed by:
|
| 5 |
+
`num_gpus = config.num_env_runners * config.num_gpus_per_env_runner`
|
| 6 |
+
|
| 7 |
+
This example:
|
| 8 |
+
- shows how to write an Env that uses the GPU.
|
| 9 |
+
- shows how to configure your algorithm such that it allocates any number of GPUs
|
| 10 |
+
(including fractional < 1.0) to each (remote) EnvRunner worker.
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
How to run this script
|
| 14 |
+
----------------------
|
| 15 |
+
`python [script file name].py --enable-new-api-stack --num-env-runners=
|
| 16 |
+
[number of EnvRunners, e.g. 2] --num-gpus-per-env-runner [int or some fraction <1.0]`
|
| 17 |
+
|
| 18 |
+
The following command line combinations have been tested on a 4 NVIDIA T4 GPUs (16 vCPU)
|
| 19 |
+
machine.
|
| 20 |
+
TODO (sven): Fix these
|
| 21 |
+
Note that for each run, 4 tune trials will be setup; see tune.grid_search over 4
|
| 22 |
+
learning rates in the `base_config` below:
|
| 23 |
+
1) --num-learners=1 --num-gpus-per-learner=0.5 (2.0 GPUs used).
|
| 24 |
+
2) --num-learners=1 --num-gpus-per-learner=0.3 (1.2 GPUs used).
|
| 25 |
+
3) --num-learners=1 --num-gpus-per-learner=0.25 (1.0 GPU used).
|
| 26 |
+
4) non-sensical setting: --num-learners=2 --num-gpus-per-learner=0.5 (expect an
|
| 27 |
+
NCCL-related error due to the fact that torch will try to perform DDP sharding,
|
| 28 |
+
but notices that the shards sit on the same GPU).
|
| 29 |
+
|
| 30 |
+
For debugging, use the following additional command line options
|
| 31 |
+
`--no-tune --num-env-runners=0`
|
| 32 |
+
which should allow you to set breakpoints anywhere in the RLlib code and
|
| 33 |
+
have the execution stop there for inspection and debugging.
|
| 34 |
+
|
| 35 |
+
Note that the shown GPU settings in this script also work in case you are not
|
| 36 |
+
running via tune, but instead are using the `--no-tune` command line option.
|
| 37 |
+
|
| 38 |
+
For logging to your WandB account, use:
|
| 39 |
+
`--wandb-key=[your WandB API key] --wandb-project=[some project name]
|
| 40 |
+
--wandb-run-name=[optional: WandB run name (within the defined project)]`
|
| 41 |
+
|
| 42 |
+
You can visualize experiment results in ~/ray_results using TensorBoard.
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
Results to expect
|
| 46 |
+
-----------------
|
| 47 |
+
In the console output, you can see that only fractional GPUs are being used by RLlib:
|
| 48 |
+
|
| 49 |
+
"""
|
| 50 |
+
from ray.rllib.examples.envs.classes.gpu_requiring_env import GPURequiringEnv
|
| 51 |
+
from ray.rllib.utils.test_utils import (
|
| 52 |
+
add_rllib_example_script_args,
|
| 53 |
+
run_rllib_example_script_experiment,
|
| 54 |
+
)
|
| 55 |
+
from ray.tune.registry import get_trainable_cls
|
| 56 |
+
|
| 57 |
+
# Command line parser with the common RLlib example-script options; the
# stopping criteria default to 50 iterations, a return of 0.9, or 100k
# timesteps.
parser = add_rllib_example_script_args(
    default_iters=50,
    default_reward=0.9,
    default_timesteps=100000,
)
# Default to the new API stack and two EnvRunners.
parser.set_defaults(enable_new_api_stack=True, num_env_runners=2)
# Script-specific option: (possibly fractional) GPUs per EnvRunner.
parser.add_argument("--num-gpus-per-env-runner", type=float, default=0.5)
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
if __name__ == "__main__":
    args = parser.parse_args()

    base_config = get_trainable_cls(args.algo).get_default_config()
    base_config = base_config.environment(GPURequiringEnv)

    # EnvRunner scaling.
    base_config = base_config.env_runners(
        # Number of EnvRunner workers.
        num_env_runners=args.num_env_runners,
        # GPUs per EnvRunner. The memory on the (possibly fractional) GPU
        # must be enough to accommodate the RLModule AND, if applicable, the
        # Env's GPU needs.
        num_gpus_per_env_runner=args.num_gpus_per_env_runner,
    )

    run_rllib_example_script_experiment(base_config, args)
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/mixed_precision_training_float16_inference.py
ADDED
|
@@ -0,0 +1,170 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Example of using automatic mixed precision training on a torch RLModule.
|
| 2 |
+
|
| 3 |
+
This example:
|
| 4 |
+
- shows how to write a custom callback for RLlib to convert those RLModules
|
| 5 |
+
only(!) on the EnvRunners to float16 precision.
|
| 6 |
+
- shows how to write a custom env-to-module ConnectorV2 piece to add float16
|
| 7 |
+
observations to the action computing forward batch on the EnvRunners, but NOT
|
| 8 |
+
permanently write these changes into the episodes, such that on the
|
| 9 |
+
Learner side, the original float32 observations will be used (for the mixed
|
| 10 |
+
precision `forward_train` and `loss` computations).
|
| 11 |
+
- shows how to plug in torch's built-in `GradScaler` class to be used by the
|
| 12 |
+
TorchLearner to scale losses and unscale gradients in order to gain more stability
|
| 13 |
+
when training with mixed precision.
|
| 14 |
+
- shows how to write a custom TorchLearner to run the update step (overrides
|
| 15 |
+
`_update()`) within a `torch.amp.autocast()` context, so that the update step runs with automatic mixed precision.
|
| 16 |
+
- demonstrates how to plug in all the above custom components into an
|
| 17 |
+
`AlgorithmConfig` instance and start training with mixed-precision while
|
| 18 |
+
performing the inference on the EnvRunners with float16 precision.
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
How to run this script
|
| 22 |
+
----------------------
|
| 23 |
+
`python [script file name].py --enable-new-api-stack`
|
| 24 |
+
|
| 25 |
+
For debugging, use the following additional command line options
|
| 26 |
+
`--no-tune --num-env-runners=0`
|
| 27 |
+
which should allow you to set breakpoints anywhere in the RLlib code and
|
| 28 |
+
have the execution stop there for inspection and debugging.
|
| 29 |
+
|
| 30 |
+
Note that the shown GPU settings in this script also work in case you are not
|
| 31 |
+
running via tune, but instead are using the `--no-tune` command line option.
|
| 32 |
+
|
| 33 |
+
For logging to your WandB account, use:
|
| 34 |
+
`--wandb-key=[your WandB API key] --wandb-project=[some project name]
|
| 35 |
+
--wandb-run-name=[optional: WandB run name (within the defined project)]`
|
| 36 |
+
|
| 37 |
+
You can visualize experiment results in ~/ray_results using TensorBoard.
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
Results to expect
|
| 41 |
+
-----------------
|
| 42 |
+
In the console output, you should see something like this:
|
| 43 |
+
|
| 44 |
+
+-----------------------------+------------+-----------------+--------+
|
| 45 |
+
| Trial name | status | loc | iter |
|
| 46 |
+
| | | | |
|
| 47 |
+
|-----------------------------+------------+-----------------+--------+
|
| 48 |
+
| PPO_CartPole-v1_485af_00000 | TERMINATED | 127.0.0.1:81045 | 22 |
|
| 49 |
+
+-----------------------------+------------+-----------------+--------+
|
| 50 |
+
+------------------+------------------------+------------------------+
|
| 51 |
+
| total time (s) | episode_return_mean | num_episodes_lifetime |
|
| 52 |
+
| | | |
|
| 53 |
+
|------------------+------------------------+------------------------+
|
| 54 |
+
| 281.3231 | 455.81 | 1426 |
|
| 55 |
+
+------------------+------------------------+------------------------+
|
| 56 |
+
"""
|
| 57 |
+
import gymnasium as gym
|
| 58 |
+
import numpy as np
|
| 59 |
+
import torch
|
| 60 |
+
|
| 61 |
+
from ray.rllib.algorithms.algorithm import Algorithm
|
| 62 |
+
from ray.rllib.algorithms.ppo import PPOConfig
|
| 63 |
+
from ray.rllib.algorithms.ppo.torch.ppo_torch_learner import PPOTorchLearner
|
| 64 |
+
from ray.rllib.connectors.connector_v2 import ConnectorV2
|
| 65 |
+
from ray.rllib.utils.test_utils import (
|
| 66 |
+
add_rllib_example_script_args,
|
| 67 |
+
run_rllib_example_script_experiment,
|
| 68 |
+
)
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
# Command line parser with the common RLlib example-script options; the
# stopping criteria default to 200 iterations, a return of 450.0, or 200k
# timesteps.
parser = add_rllib_example_script_args(
    default_iters=200,
    default_reward=450.0,
    default_timesteps=200000,
)
# Default to PPO on the new API stack.
parser.set_defaults(algo="PPO", enable_new_api_stack=True)
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
def on_algorithm_init(
    algorithm: Algorithm,
    **kwargs,
) -> None:
    """Callback converting the RLModules on all EnvRunners to float16.

    Only the EnvRunner modules (training and, if present, evaluation) are
    `half()`'ed here; Learner modules are not touched by this callback.

    Args:
        algorithm: The freshly initialized Algorithm whose EnvRunner modules
            get converted.
        **kwargs: Ignored; accepted for callback-signature compatibility.
    """

    def _half_env_runner_module(env_runner):
        # Assumes the EnvRunner holds a single RLModule.
        return env_runner.module.half()

    # EnvRunners used for sampling.
    algorithm.env_runner_group.foreach_env_runner(_half_env_runner_module)

    # EnvRunners used for evaluation, only if such a group exists.
    eval_group = algorithm.eval_env_runner_group
    if eval_group:
        eval_group.foreach_env_runner(_half_env_runner_module)
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
class Float16Connector(ConnectorV2):
    """ConnectorV2 piece adding float16 observations to the forward batch.

    The most recent observation of each single-agent episode is cast to
    float16 and added to the batch under the "obs" column. The episodes
    themselves are not written to by this piece.

    The same effect could be achieved with a gymnasium.Wrapper around the
    environment.
    """

    def recompute_output_observation_space(
        self,
        input_observation_space,
        input_action_space,
    ):
        """Returns a float16 Box with the input space's bounds and shape."""
        low = input_observation_space.low.astype(np.float16)
        high = input_observation_space.high.astype(np.float16)
        return gym.spaces.Box(
            low,
            high,
            input_observation_space.shape,
            np.float16,
        )

    def __call__(self, *, rl_module, batch, episodes, **kwargs):
        """Adds the float16 version of each episode's latest obs to `batch`.

        Returns:
            The `batch` with one "obs" item added per single-agent episode.
        """
        for episode in self.single_agent_episode_iterator(episodes):
            latest_obs = episode.get_observations(-1)
            self.add_batch_item(
                batch,
                column="obs",
                item_to_add=latest_obs.astype(np.float16),
                single_agent_episode=episode,
            )
        return batch
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
class PPOTorchMixedPrecisionLearner(PPOTorchLearner):
    """PPO TorchLearner running its update step inside an autocast context."""

    def _update(self, *args, **kwargs):
        """Runs the parent's `_update()` under `torch.cuda.amp.autocast()`.

        All arguments are passed through unchanged and the parent's result
        is returned as is.
        """
        with torch.cuda.amp.autocast():
            return super()._update(*args, **kwargs)
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
if __name__ == "__main__":
    args = parser.parse_args()

    # This example is restricted to PPO on the new API stack.
    assert (
        args.enable_new_api_stack
    ), "Must set --enable-new-api-stack when running this script!"
    assert args.algo == "PPO", "Must set --algo=PPO when running this script!"

    base_config = PPOConfig().environment("CartPole-v1")

    # Custom env-to-module ConnectorV2 piece feeding float16 observations to
    # the EnvRunners' RLModules.
    base_config = base_config.env_runners(
        env_to_module_connector=lambda env: Float16Connector()
    )

    # Custom callback (on_algorithm_init) turning the EnvRunner RLModules into
    # float16 models.
    base_config = base_config.callbacks(on_algorithm_init=on_algorithm_init)

    # Torch's built-in loss scaler class to stabilize gradient computations
    # (the loss is scaled, then the gradients are unscaled before being
    # applied). This uses the built-in, experimental feature of TorchLearner.
    base_config = base_config.experimental(
        _torch_grad_scaler_class=torch.cuda.amp.GradScaler
    )

    base_config = base_config.training(
        # Custom Learner class activating mixed-precision training for the
        # torch RLModule (runs the update inside an autocast context).
        learner_class=PPOTorchMixedPrecisionLearner,
        # No grad clipping: a grad scaler is plugged in above to handle
        # loss scaling / gradient unscaling.
        grad_clip=None,
        # Typical CartPole-v1 hyperparams known to work well:
        gamma=0.99,
        lr=0.0003,
        num_epochs=6,
        vf_loss_coeff=0.01,
        use_kl_loss=True,
    )

    run_rllib_example_script_experiment(base_config, args)
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/__init__.py
ADDED
|
File without changes
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/__pycache__/ppo_load_rl_modules.cpython-311.pyc
ADDED
|
Binary file (3.21 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/classes/__init__.py
ADDED
|
File without changes
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/classes/__pycache__/custom_ppo_loss_fn_learner.cpython-311.pyc
ADDED
|
Binary file (3.08 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/classes/__pycache__/separate_vf_lr_and_optimizer_learner.cpython-311.pyc
ADDED
|
Binary file (4.06 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/classes/__pycache__/vpg_torch_learner.cpython-311.pyc
ADDED
|
Binary file (4.63 kB). View file
|
|
|