koichi12 committed on Feb 12, 2025

Commit

c84597e

verified ·

1 Parent(s): fb8b131

Add files using upload-large-folder tool

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.venv/lib/python3.11/site-packages/ray/rllib/examples/__pycache__/__init__.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/ray/rllib/examples/__pycache__/centralized_critic.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/ray/rllib/examples/__pycache__/compute_adapted_gae_on_postprocess_trajectory.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/ray/rllib/examples/__pycache__/quadx_waypoints.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/ray/rllib/examples/__pycache__/replay_buffer_api.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/policy/__pycache__/cliff_walking_wall_policy.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/ray/rllib/examples/actions/__init__.py +0 -0
.venv/lib/python3.11/site-packages/ray/rllib/examples/actions/__pycache__/__init__.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/ray/rllib/examples/actions/__pycache__/autoregressive_actions.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/ray/rllib/examples/actions/autoregressive_actions.py +109 -0
.venv/lib/python3.11/site-packages/ray/rllib/examples/actions/nested_action_spaces.py +86 -0
.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__init__.py +0 -0
.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/__init__.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/count_based_curiosity.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/euclidian_distance_based_curiosity.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/flatten_observations_dict_space.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/frame_stacking.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/mean_std_filtering.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/prev_actions_prev_rewards.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/__init__.py +0 -0
.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/__pycache__/__init__.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/__pycache__/count_based_curiosity.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/__pycache__/euclidian_distance_based_curiosity.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/__pycache__/protobuf_cartpole_observation_decoder.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/count_based_curiosity.py +92 -0
.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/euclidian_distance_based_curiosity.py +122 -0
.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/protobuf_cartpole_observation_decoder.py +80 -0
.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/count_based_curiosity.py +14 -0
.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/euclidian_distance_based_curiosity.py +14 -0
.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/flatten_observations_dict_space.py +154 -0
.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/frame_stacking.py +228 -0
.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/mean_std_filtering.py +198 -0
.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/prev_actions_prev_rewards.py +164 -0
.venv/lib/python3.11/site-packages/ray/rllib/examples/evaluation/custom_evaluation.py +230 -0
.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/__init__.py +0 -0
.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/__pycache__/__init__.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/__pycache__/float16_training_and_inference.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/__pycache__/fractional_gpus_per_learner.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/__pycache__/gpus_on_env_runners.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/__pycache__/mixed_precision_training_float16_inference.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/float16_training_and_inference.py +250 -0
.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/fractional_gpus_per_learner.py +119 -0
.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/gpus_on_env_runners.py +85 -0
.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/mixed_precision_training_float16_inference.py +170 -0
.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/__init__.py +0 -0
.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/__pycache__/ppo_load_rl_modules.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/classes/__init__.py +0 -0
.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/classes/__pycache__/custom_ppo_loss_fn_learner.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/classes/__pycache__/separate_vf_lr_and_optimizer_learner.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/classes/__pycache__/vpg_torch_learner.cpython-311.pyc +0 -0

.venv/lib/python3.11/site-packages/ray/rllib/examples/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (191 Bytes). View file

.venv/lib/python3.11/site-packages/ray/rllib/examples/__pycache__/centralized_critic.cpython-311.pyc ADDED Viewed

Binary file (14.7 kB). View file

.venv/lib/python3.11/site-packages/ray/rllib/examples/__pycache__/compute_adapted_gae_on_postprocess_trajectory.cpython-311.pyc ADDED Viewed

Binary file (6.98 kB). View file

.venv/lib/python3.11/site-packages/ray/rllib/examples/__pycache__/quadx_waypoints.cpython-311.pyc ADDED Viewed

Binary file (5.44 kB). View file

.venv/lib/python3.11/site-packages/ray/rllib/examples/__pycache__/replay_buffer_api.cpython-311.pyc ADDED Viewed

Binary file (3.21 kB). View file

.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/policy/__pycache__/cliff_walking_wall_policy.cpython-311.pyc ADDED Viewed

Binary file (6.6 kB). View file

.venv/lib/python3.11/site-packages/ray/rllib/examples/actions/__init__.py ADDED Viewed

File without changes

.venv/lib/python3.11/site-packages/ray/rllib/examples/actions/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (199 Bytes). View file

.venv/lib/python3.11/site-packages/ray/rllib/examples/actions/__pycache__/autoregressive_actions.cpython-311.pyc ADDED Viewed

Binary file (4.83 kB). View file

.venv/lib/python3.11/site-packages/ray/rllib/examples/actions/autoregressive_actions.py ADDED Viewed

	@@ -0,0 +1,109 @@

+"""Example on how to define and run with an RLModule with a dependent action space.
+This example:
+    - Shows how to write a custom RLModule outputting autoregressive actions.
+    The RLModule class used here implements a prior distribution for the first couple
+    of actions and then uses the sampled actions to compute the parameters for and
+    sample from a posterior distribution.
+    - Shows how to configure a PPO algorithm to use the custom RLModule.
+    - Stops the training after 2M timesteps (or 1000 iterations) or when the mean
+    episode return exceeds -0.45 (the script's default stopping criteria), i.e. if
+    the agent has learned to synchronize its actions.
+For details on the environment used, take a look at the `CorrelatedActionsEnv`
+class. To receive an episode return over 100, the agent must learn how to synchronize
+its actions.
+How to run this script
+----------------------
+`python [script file name].py --enable-new-api-stack --num-env-runners 2`
+Control the number of `EnvRunner`s with the `--num-env-runners` flag. This
+will increase the sampling speed.
+For debugging, use the following additional command line options
+`--no-tune --num-env-runners=0`
+which should allow you to set breakpoints anywhere in the RLlib code and
+have the execution stop there for inspection and debugging.
+For logging to your WandB account, use:
+`--wandb-key=[your WandB API key] --wandb-project=[some project name]
+--wandb-run-name=[optional: WandB run name (within the defined project)]`
+Results to expect
+-----------------
+You should reach an episode return of better than -0.5 quickly through a simple PPO
+policy. The logic behind beating the env is roughly:
+OBS:  optimal a1:   r1:  optimal a2:   r2:
+-1      2            0      -1.0        0
+-0.5    1/2       -0.5   -0.5/-1.5      0
+0       1            0      -1.0        0
+0.5     0/1       -0.5   -0.5/-1.5      0
+1       0            0      -1.0        0
+Meaning, most of the time, you would receive a reward better than -0.5, but worse than
+0.0.
++--------------------------------------+------------+--------+------------------+
+| Trial name                           | status     |   iter |   total time (s) |
+|                                      |            |        |                  |
+|--------------------------------------+------------+--------+------------------+
+| PPO_CorrelatedActionsEnv_6660d_00000 | TERMINATED |     76 |          132.438 |
++--------------------------------------+------------+--------+------------------+
++------------------------+------------------------+------------------------+
+|    episode_return_mean |   num_env_steps_sample |   ...env_steps_sampled |
+|                        |             d_lifetime |   _lifetime_throughput |
+|------------------------+------------------------+------------------------|
+|                  -0.43 |                 152000 |                1283.48 |
++------------------------+------------------------+------------------------+
+"""
+from ray.rllib.algorithms.ppo import PPOConfig
+from ray.rllib.core.rl_module.rl_module import RLModuleSpec
+from ray.rllib.examples.envs.classes.correlated_actions_env import CorrelatedActionsEnv
+from ray.rllib.examples.rl_modules.classes.autoregressive_actions_rlm import (
+    AutoregressiveActionsRLM,
+)
+from ray.rllib.utils.test_utils import (
+    add_rllib_example_script_args,
+    run_rllib_example_script_experiment,
+)
+# Build the shared RLlib example-script argument parser. The three values below are
+# this script's defaults for the stop criteria (iterations, timesteps, reward).
+parser = add_rllib_example_script_args(
+    default_iters=1000,
+    default_timesteps=2000000,
+    default_reward=-0.45,
+)
+# Turn on the new API stack by default for this example.
+parser.set_defaults(enable_new_api_stack=True)
+if __name__ == "__main__":
+    args = parser.parse_args()
+    # The config below is built from `PPOConfig`, so refuse any other algorithm
+    # up front instead of silently ignoring the `--algo` choice.
+    if args.algo != "PPO":
+        raise ValueError(
+            "This example script only runs with PPO! Set --algo=PPO on the command "
+            "line."
+        )
+    # PPO on the `CorrelatedActionsEnv` with explicit training hyperparameters.
+    base_config = (
+        PPOConfig()
+        .environment(CorrelatedActionsEnv)
+        .training(
+            train_batch_size_per_learner=2000,
+            num_epochs=12,
+            minibatch_size=256,
+            entropy_coeff=0.005,
+            lr=0.0003,
+        )
+        # Specify the RLModule class to be used.
+        .rl_module(
+            rl_module_spec=RLModuleSpec(module_class=AutoregressiveActionsRLM),
+        )
+    )
+    # Hand the config and the parsed command line args to the shared example runner.
+    run_rllib_example_script_experiment(base_config, args)

.venv/lib/python3.11/site-packages/ray/rllib/examples/actions/nested_action_spaces.py ADDED Viewed

	@@ -0,0 +1,86 @@

+from gymnasium.spaces import Dict, Tuple, Box, Discrete, MultiDiscrete
+from ray.tune.registry import register_env
+from ray.rllib.connectors.env_to_module import FlattenObservations
+from ray.rllib.examples.envs.classes.multi_agent import (
+    MultiAgentNestedSpaceRepeatAfterMeEnv,
+)
+from ray.rllib.examples.envs.classes.nested_space_repeat_after_me_env import (
+    NestedSpaceRepeatAfterMeEnv,
+)
+from ray.rllib.utils.test_utils import (
+    add_rllib_example_script_args,
+    run_rllib_example_script_experiment,
+)
+from ray.tune.registry import get_trainable_cls
+# Read in common example script command line arguments.
+parser = add_rllib_example_script_args(default_timesteps=200000, default_reward=-500.0)
+# Turn on the new API stack by default for this example.
+parser.set_defaults(enable_new_api_stack=True)
+if __name__ == "__main__":
+    args = parser.parse_args()
+    # Define env-to-module-connector pipeline for the new stack.
+    # The `env` argument is not used; the connector is switched to multi-agent mode
+    # whenever `--num-agents` is greater than 0.
+    def _env_to_module_pipeline(env):
+        return FlattenObservations(multi_agent=args.num_agents > 0)
+    # Register our environment with tune.
+    if args.num_agents > 0:
+        # Multi-agent variant: copy the incoming env config `c` and add (or override)
+        # its "num_agents" entry with the command line value.
+        register_env(
+            "env",
+            lambda c: MultiAgentNestedSpaceRepeatAfterMeEnv(
+                config=dict(c, **{"num_agents": args.num_agents})
+            ),
+        )
+    else:
+        # Single-agent variant: pass the env config through unchanged.
+        register_env("env", lambda c: NestedSpaceRepeatAfterMeEnv(c))
+    # Define the AlgorithmConfig used.
+    base_config = (
+        get_trainable_cls(args.algo)
+        .get_default_config()
+        .environment(
+            "env",
+            env_config={
+                # Nested space handed to the env: a Dict holding a Tuple-of-Dict, a
+                # Box, a MultiDiscrete, and a Discrete sub-space.
+                "space": Dict(
+                    {
+                        "a": Tuple(
+                            [Dict({"d": Box(-15.0, 3.0, ()), "e": Discrete(3)})]
+                        ),
+                        "b": Box(-10.0, 10.0, (2,)),
+                        "c": MultiDiscrete([3, 3]),
+                        "d": Discrete(2),
+                    }
+                ),
+                "episode_len": 100,
+            },
+        )
+        .env_runners(env_to_module_connector=_env_to_module_pipeline)
+        # No history in Env (bandit problem).
+        .training(
+            gamma=0.0,
+            lr=0.0005,
+        )
+    )
+    # Add a simple multi-agent setup.
+    if args.num_agents > 0:
+        # One policy "p{i}" per agent index; the mapping function routes agent ID
+        # `aid` to the policy named "p{aid}".
+        base_config.multi_agent(
+            policies={f"p{i}" for i in range(args.num_agents)},
+            policy_mapping_fn=lambda aid, *a, **kw: f"p{aid}",
+        )
+    # Fix some PPO-specific settings.
+    if args.algo == "PPO":
+        base_config.training(
+            # We don't want high entropy in this Env.
+            entropy_coeff=0.00005,
+            num_epochs=4,
+            vf_loss_coeff=0.01,
+        )
+    # Run everything as configured.
+    run_rllib_example_script_experiment(base_config, args)

.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__init__.py ADDED Viewed

File without changes

.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (202 Bytes). View file

.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/count_based_curiosity.cpython-311.pyc ADDED Viewed

Binary file (719 Bytes). View file

.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/euclidian_distance_based_curiosity.cpython-311.pyc ADDED Viewed

Binary file (745 Bytes). View file

.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/flatten_observations_dict_space.cpython-311.pyc ADDED Viewed

Binary file (7.3 kB). View file

.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/frame_stacking.cpython-311.pyc ADDED Viewed

Binary file (9.67 kB). View file

.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/mean_std_filtering.cpython-311.pyc ADDED Viewed

Binary file (8.85 kB). View file

.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/__pycache__/prev_actions_prev_rewards.cpython-311.pyc ADDED Viewed

Binary file (8.54 kB). View file

.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/__init__.py ADDED Viewed

File without changes

.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (210 Bytes). View file

.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/__pycache__/count_based_curiosity.cpython-311.pyc ADDED Viewed

Binary file (4.54 kB). View file

.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/__pycache__/euclidian_distance_based_curiosity.cpython-311.pyc ADDED Viewed

Binary file (5.88 kB). View file

.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/__pycache__/protobuf_cartpole_observation_decoder.cpython-311.pyc ADDED Viewed

Binary file (3.86 kB). View file

.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/count_based_curiosity.py ADDED Viewed

	@@ -0,0 +1,92 @@

+from collections import Counter
+from typing import Any, List, Optional
+import gymnasium as gym
+from ray.rllib.connectors.connector_v2 import ConnectorV2
+from ray.rllib.core.rl_module.rl_module import RLModule
+from ray.rllib.utils.typing import EpisodeType
+class CountBasedCuriosity(ConnectorV2):
+    """Learner ConnectorV2 piece to compute intrinsic rewards based on obs counts.
+    Add this connector piece to your Learner pipeline, through your algo config:
+    ```
+    config.training(
+        learner_connector=lambda obs_sp, act_sp: CountBasedCuriosity()
+    )
+    ```
+    Intrinsic rewards are computed on the Learner side based on naive observation
+    counts, which is why this connector should only be used for simple environments
+    with a reasonable number of possible observations. The intrinsic reward for a given
+    timestep is:
+    r(i) = intrinsic_reward_coeff * (1 / C(obs(i)))
+    where C is the total (lifetime) count of the obs at timestep i.
+    The intrinsic reward is added to the extrinsic reward and saved back into the
+    episode (under the main "rewards" key).
+    Note that the computation and saving back to the episode all happens before the
+    actual train batch is generated from the episode data. Thus, the Learner and the
+    RLModule used do not take notice of the extra reward added.
+    If you would like to use a more sophisticated mechanism for intrinsic reward
+    computations, take a look at the `EuclidianDistanceBasedCuriosity` connector piece
+    at `ray.rllib.examples.connectors.classes.euclidian_distance_based_curiosity`
+    """
+    def __init__(
+        self,
+        input_observation_space: Optional[gym.Space] = None,
+        input_action_space: Optional[gym.Space] = None,
+        *,
+        intrinsic_reward_coeff: float = 1.0,
+        **kwargs,
+    ):
+        """Initializes a CountBasedCuriosity instance.
+        Args:
+            input_observation_space: Passed on unchanged to `ConnectorV2.__init__`.
+            input_action_space: Passed on unchanged to `ConnectorV2.__init__`.
+            intrinsic_reward_coeff: The weight with which to multiply the intrinsic
+                reward before adding (and saving) it back to the main (extrinsic)
+                reward of the episode at each timestep.
+            **kwargs: Accepted, but neither used here nor forwarded to the parent
+                constructor.
+        """
+        super().__init__(input_observation_space, input_action_space)
+        # Naive observation counter.
+        # It is never reset, so counts accumulate over the lifetime of this instance.
+        self._counts = Counter()
+        self.intrinsic_reward_coeff = intrinsic_reward_coeff
+    def __call__(
+        self,
+        *,
+        rl_module: RLModule,
+        batch: Any,
+        episodes: List[EpisodeType],
+        explore: Optional[bool] = None,
+        shared_data: Optional[dict] = None,
+        **kwargs,
+    ) -> Any:
+        """Adds a count-based intrinsic reward to the rewards stored in `episodes`.
+        Args:
+            rl_module: Not used by this connector piece.
+            batch: Not inspected; returned as-is.
+            episodes: The episodes whose rewards are overwritten (through
+                `set_rewards`) with `reward + intrinsic reward`.
+            explore: Not used by this connector piece.
+            shared_data: Not used by this connector piece.
+            **kwargs: Not used by this connector piece.
+        Returns:
+            The incoming `batch`, unchanged.
+        """
+        # Loop through all episodes and change the reward to
+        # [reward + intrinsic reward]
+        for sa_episode in self.single_agent_episode_iterator(
+            episodes=episodes, agents_that_stepped_only=False
+        ):
+            # Loop through all obs, except the last one.
+            observations = sa_episode.get_observations(slice(None, -1))
+            # Get all respective (extrinsic) rewards.
+            rewards = sa_episode.get_rewards()
+            for i, (obs, rew) in enumerate(zip(observations, rewards)):
+                # Convert to a tuple so the observation can be used as a `Counter`
+                # key (assumes a flat sequence of hashable values - TODO confirm).
+                obs = tuple(obs)
+                # Add 1 to obs counter.
+                self._counts[obs] += 1
+                # Compute our count-based intrinsic reward and add it to the main
+                # (extrinsic) reward.
+                rew += self.intrinsic_reward_coeff * (1 / self._counts[obs])
+                # Store the new reward back to the episode (under the correct
+                # timestep/index).
+                sa_episode.set_rewards(new_data=rew, at_indices=i)
+        return batch

.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/euclidian_distance_based_curiosity.py ADDED Viewed

	@@ -0,0 +1,122 @@

+from collections import deque
+from typing import Any, List, Optional
+import gymnasium as gym
+import numpy as np
+from ray.rllib.connectors.connector_v2 import ConnectorV2
+from ray.rllib.core.rl_module.rl_module import RLModule
+from ray.rllib.utils.typing import EpisodeType
+class EuclidianDistanceBasedCuriosity(ConnectorV2):
+    """Learner ConnectorV2 piece computing intrinsic rewards with euclidian distance.
+    Add this connector piece to your Learner pipeline, through your algo config:
+    ```
+    config.training(
+        learner_connector=lambda obs_sp, act_sp: EuclidianDistanceBasedCuriosity()
+    )
+    ```
+    Intrinsic rewards are computed on the Learner side based on comparing the euclidian
+    distance of observations vs already seen ones. A configurable number of observations
+    will be stored in a FIFO buffer and all incoming observations have their distance
+    measured against those.
+    The minimum distance measured is the intrinsic reward for the incoming obs
+    (multiplied by a fixed coefficient and added to the "main" extrinsic reward):
+    r(i) = intrinsic_reward_coeff * min(ED(o, o(i)) for o in stored_obs)
+    where `ED` is the euclidian distance and `stored_obs` is the buffer.
+    The intrinsic reward is then added to the extrinsic reward and saved back into the
+    episode (under the main "rewards" key).
+    Note that the computation and saving back to the episode all happens before the
+    actual train batch is generated from the episode data. Thus, the Learner and the
+    RLModule used do not take notice of the extra reward added.
+    Only one observation per incoming episode will be stored as a new one in the buffer.
+    Thereby, we pick the observation with the largest `min(ED)` value over all already
+    stored observations to be stored per episode.
+    If you would like to use a simpler, count-based mechanism for intrinsic reward
+    computations, take a look at the `CountBasedCuriosity` connector piece
+    at `ray.rllib.examples.connectors.classes.count_based_curiosity`
+    """
+    def __init__(
+        self,
+        input_observation_space: Optional[gym.Space] = None,
+        input_action_space: Optional[gym.Space] = None,
+        *,
+        intrinsic_reward_coeff: float = 1.0,
+        max_buffer_size: int = 100,
+        **kwargs,
+    ):
+        """Initializes a EuclidianDistanceBasedCuriosity instance.
+        Args:
+            input_observation_space: Passed on unchanged to `ConnectorV2.__init__`.
+            input_action_space: Passed on unchanged to `ConnectorV2.__init__`.
+            intrinsic_reward_coeff: The weight with which to multiply the intrinsic
+                reward before adding (and saving) it back to the main (extrinsic)
+                reward of the episode at each timestep.
+            max_buffer_size: The maximum number of observations held in the FIFO
+                buffer (used as the `maxlen` of the underlying `deque`).
+            **kwargs: Accepted, but neither used here nor forwarded to the parent
+                constructor.
+        """
+        super().__init__(input_observation_space, input_action_space)
+        # Create an observation buffer
+        # (a bounded deque: appending to a full buffer drops the oldest entry).
+        self.obs_buffer = deque(maxlen=max_buffer_size)
+        self.intrinsic_reward_coeff = intrinsic_reward_coeff
+        # Counts how many times `__call__` has done its work (see the guard there).
+        self._test = 0
+    def __call__(
+        self,
+        *,
+        rl_module: RLModule,
+        batch: Any,
+        episodes: List[EpisodeType],
+        explore: Optional[bool] = None,
+        shared_data: Optional[dict] = None,
+        **kwargs,
+    ) -> Any:
+        """Adds a distance-based intrinsic reward to the rewards stored in `episodes`.
+        Args:
+            rl_module: Not used by this connector piece.
+            batch: Not inspected; returned as-is.
+            episodes: The episodes whose rewards are overwritten (through
+                `set_rewards`) with `reward + intrinsic reward`.
+            explore: Not used by this connector piece.
+            shared_data: Not used by this connector piece.
+            **kwargs: Not used by this connector piece.
+        Returns:
+            The incoming `batch`, unchanged.
+        """
+        # NOTE(review): once `_test` exceeds 10, i.e. from the 12th call on, this
+        # connector returns immediately and no longer adds intrinsic rewards or
+        # updates the buffer. The class docstring does not mention this cutoff; it
+        # looks like a leftover debugging switch - confirm whether it is intended.
+        if self._test > 10:
+            return batch
+        self._test += 1
+        # Loop through all episodes and change the reward to
+        # [reward + intrinsic reward]
+        for sa_episode in self.single_agent_episode_iterator(
+            episodes=episodes, agents_that_stepped_only=False
+        ):
+            # Loop through all obs, except the last one.
+            observations = sa_episode.get_observations(slice(None, -1))
+            # Get all respective (extrinsic) rewards.
+            rewards = sa_episode.get_rewards()
+            # Track the observation of this episode that is farthest away from the
+            # buffer. Starting at -inf means the first observation always qualifies.
+            max_dist_obs = None
+            max_dist = float("-inf")
+            for i, (obs, rew) in enumerate(zip(observations, rewards)):
+                # Compare obs to all stored observations and compute euclidian distance.
+                # With an empty buffer, the distance (and thus the intrinsic reward)
+                # stays at 0.0.
+                min_dist = 0.0
+                if self.obs_buffer:
+                    min_dist = min(
+                        np.sqrt(np.sum((obs - stored_obs) ** 2))
+                        for stored_obs in self.obs_buffer
+                    )
+                if min_dist > max_dist:
+                    max_dist = min_dist
+                    max_dist_obs = obs
+                # Compute our euclidian distance-based intrinsic reward and add it to
+                # the main (extrinsic) reward.
+                rew += self.intrinsic_reward_coeff * min_dist
+                # Store the new reward back to the episode (under the correct
+                # timestep/index).
+                sa_episode.set_rewards(new_data=rew, at_indices=i)
+            # Add the one observation of this episode with the largest (min) euclidian
+            # dist to all already stored obs to the buffer (maybe throwing out the
+            # oldest obs in there).
+            if max_dist_obs is not None:
+                self.obs_buffer.append(max_dist_obs)
+        return batch

.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/classes/protobuf_cartpole_observation_decoder.py ADDED Viewed

	@@ -0,0 +1,80 @@

+from typing import Any, List, Optional
+import gymnasium as gym
+import numpy as np
+from ray.rllib.connectors.connector_v2 import ConnectorV2
+from ray.rllib.core.rl_module.rl_module import RLModule
+from ray.rllib.examples.envs.classes.utils.cartpole_observations_proto import (
+    CartPoleObservation,
+)
+from ray.rllib.utils.annotations import override
+from ray.rllib.utils.typing import EpisodeType
+class ProtobufCartPoleObservationDecoder(ConnectorV2):
+    """Env-to-module ConnectorV2 piece decoding protobuf obs into CartPole-v1 obs.
+    Add this connector piece to your env-to-module pipeline, through your algo config:
+    ```
+    config.env_runners(
+        env_to_module_connector=lambda env: ProtobufCartPoleObservationDecoder()
+    )
+    ```
+    The incoming observation space must be a 1D Box of dtype uint8
+    (which is the same as a binary string). The outgoing observation space is the
+    normal CartPole-v1 1D space: Box(-inf, inf, (4,), float32).
+    """
+    @override(ConnectorV2)
+    def recompute_output_observation_space(
+        self,
+        input_observation_space: gym.Space,
+        input_action_space: gym.Space,
+    ) -> gym.Space:
+        """Validates the input observation space and returns the decoded obs space.
+        Args:
+            input_observation_space: The incoming space; asserted to be a 1D `Box`
+                of dtype uint8.
+            input_action_space: Not used by this method.
+        Returns:
+            The space `Box(-inf, inf, (4,), float32)`.
+        """
+        # Make sure the incoming observation space is a protobuf (binary string).
+        assert (
+            isinstance(input_observation_space, gym.spaces.Box)
+            and len(input_observation_space.shape) == 1
+            and input_observation_space.dtype.name == "uint8"
+        )
+        # Return CartPole-v1's natural observation space.
+        return gym.spaces.Box(float("-inf"), float("inf"), (4,), np.float32)
+    def __call__(
+        self,
+        *,
+        rl_module: RLModule,
+        batch: Any,
+        episodes: List[EpisodeType],
+        explore: Optional[bool] = None,
+        shared_data: Optional[dict] = None,
+        **kwargs,
+    ) -> Any:
+        """Replaces the last (protobuf-encoded) observation of each episode.
+        Args:
+            rl_module: Not used by this connector piece.
+            batch: Not inspected; returned as-is.
+            episodes: The episodes whose last observation is decoded and written
+                back (through `set_observations`).
+            explore: Not used by this connector piece.
+            shared_data: Not used by this connector piece.
+            **kwargs: Not used by this connector piece.
+        Returns:
+            The incoming `batch`, unchanged.
+        """
+        # Loop through all episodes and change the observation from a binary string
+        # to an actual 1D np.ndarray (normal CartPole-v1 obs).
+        for sa_episode in self.single_agent_episode_iterator(episodes=episodes):
+            # Get last obs (binary string).
+            obs = sa_episode.get_observations(-1)
+            # Turn the array back into raw bytes and parse them as a protobuf message.
+            obs_bytes = obs.tobytes()
+            obs_protobuf = CartPoleObservation()
+            obs_protobuf.ParseFromString(obs_bytes)
+            # Set up the natural CartPole-v1 observation tensor from the protobuf
+            # values.
+            new_obs = np.array(
+                [
+                    obs_protobuf.x_pos,
+                    obs_protobuf.x_veloc,
+                    obs_protobuf.angle_pos,
+                    obs_protobuf.angle_veloc,
+                ],
+                np.float32,
+            )
+            # Write the new observation (1D tensor) back into the Episode.
+            sa_episode.set_observations(new_data=new_obs, at_indices=-1)
+        # Return `batch` as-is.
+        return batch

.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/count_based_curiosity.py ADDED Viewed

	@@ -0,0 +1,14 @@

+"""Placeholder for training with count-based curiosity.
+The actual script can be found at a different location (see code below).
+"""
+if __name__ == "__main__":
+    import subprocess
+    import sys
+    # Forward to "python ../curiosity/[same script name].py [same options]"
+    command = [sys.executable, "../curiosity/", sys.argv[0]] + sys.argv[1:]
+    # Run the script.
+    subprocess.run(command, capture_output=True)

.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/euclidian_distance_based_curiosity.py ADDED Viewed

	@@ -0,0 +1,14 @@

+"""Placeholder for training with euclidian distance-based curiosity.
+The actual script can be found at a different location (see code below).
+"""
+if __name__ == "__main__":
+    import subprocess
+    import sys
+    # Forward to "python ../curiosity/[same script name].py [same options]"
+    command = [sys.executable, "../curiosity/", sys.argv[0]] + sys.argv[1:]
+    # Run the script.
+    subprocess.run(command, capture_output=True)

.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/flatten_observations_dict_space.py ADDED Viewed

	@@ -0,0 +1,154 @@

+"""Example using a ConnectorV2 to flatten arbitrarily nested dict or tuple observations.
+An RLlib Algorithm has 3 distinct connector pipelines:
+- An env-to-module pipeline in an EnvRunner accepting a list of episodes and producing
+a batch for an RLModule to compute actions (`forward_inference()` or
+`forward_exploration()`).
+- A module-to-env pipeline in an EnvRunner taking the RLModule's output and converting
+it into an action readable by the environment.
+- A learner connector pipeline on a Learner taking a list of episodes and producing
+a batch for an RLModule to perform the training forward pass (`forward_train()`).
+Each of these pipelines has a fixed set of default ConnectorV2 pieces that RLlib
+adds/prepends to these pipelines in order to perform the most basic functionalities.
+For example, RLlib adds the `AddObservationsFromEpisodesToBatch` ConnectorV2 into any
+env-to-module pipeline to make sure the batch for computing actions contains - at the
+minimum - the most recent observation.
+On top of these default ConnectorV2 pieces, users can define their own ConnectorV2
+pieces (or use the ones available already in RLlib) and add them to one of the 3
+different pipelines described above, as required.
+This example:
+    - shows how the `FlattenObservations` ConnectorV2 piece can be added to the
+    env-to-module pipeline.
+    - demonstrates that by using this connector, any arbitrarily nested dict or tuple
+    observation is properly flattened into a simple 1D tensor, for easier RLModule
+    processing.
+    - shows how - in a multi-agent setup - individual agents can be specified, whose
+    observations should be flattened (while other agents' observations will always
+    be left as-is).
+    - uses a variant of the CartPole-v1 environment, in which the 4 observation items
+    (x-pos, x-veloc, angle, and angle-veloc) are taken apart and put into a nested dict
+    with the structure:
+    {
+        "x-pos": [x-pos],
+        "angular-pos": {
+            "value": [angle],
+            "some_random_stuff": [random Discrete(3)],  # <- should be ignored by algo
+        },
+        "velocs": Tuple([x-veloc], [angle-veloc]),
+    }
+How to run this script
+----------------------
+`python [script file name].py --enable-new-api-stack`
+For debugging, use the following additional command line options
+`--no-tune --num-env-runners=0`
+which should allow you to set breakpoints anywhere in the RLlib code and
+have the execution stop there for inspection and debugging.
+For logging to your WandB account, use:
+`--wandb-key=[your WandB API key] --wandb-project=[some project name]
+--wandb-run-name=[optional: WandB run name (within the defined project)]`
+Results to expect
+-----------------
++---------------------+------------+----------------+--------+------------------+
+| Trial name          | status     | loc            |   iter |   total time (s) |
+|                     |            |                |        |                  |
+|---------------------+------------+----------------+--------+------------------+
+| PPO_env_a2fd6_00000 | TERMINATED | 127.0.0.1:7409 |     25 |          24.1426 |
++---------------------+------------+----------------+--------+------------------+
++------------------------+------------------------+------------------------+
+|   num_env_steps_sample |   num_env_steps_traine |   episode_return_mean  |
+|             d_lifetime |             d_lifetime |                        |
++------------------------+------------------------+------------------------|
+|                 100000 |                 100000 |                 421.42 |
++------------------------+------------------------+------------------------+
+"""
+from ray.tune.registry import register_env
+from ray.rllib.connectors.env_to_module import FlattenObservations
+from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig
+from ray.rllib.examples.envs.classes.cartpole_with_dict_observation_space import (
+    CartPoleWithDictObservationSpace,
+)
+from ray.rllib.examples.envs.classes.multi_agent import (
+    MultiAgentCartPoleWithDictObservationSpace,
+)
+from ray.rllib.utils.test_utils import (
+    add_rllib_example_script_args,
+    run_rllib_example_script_experiment,
+)
+from ray.tune.registry import get_trainable_cls
+# Build the shared example-script CLI parser (overriding timestep/reward defaults).
+parser = add_rllib_example_script_args(default_timesteps=200000, default_reward=400.0)
+parser.set_defaults(enable_new_api_stack=True)  # new API stack is on by default
+if __name__ == "__main__":
+    args = parser.parse_args()
+    # Factory for the env-to-module connector piece; `env` is accepted but unused.
+    def _env_to_module_pipeline(env):
+        return FlattenObservations(multi_agent=args.num_agents > 0)
+    # Register the single- or multi-agent dict-observation CartPole under "env".
+    if args.num_agents > 0:
+        register_env(
+            "env",
+            lambda _: MultiAgentCartPoleWithDictObservationSpace(
+                config={"num_agents": args.num_agents}
+            ),
+        )
+    else:
+        register_env("env", lambda _: CartPoleWithDictObservationSpace())
+    # Start from the default config of the algorithm selected via `args.algo`.
+    base_config = (
+        get_trainable_cls(args.algo)
+        .get_default_config()
+        .environment("env")
+        .env_runners(env_to_module_connector=_env_to_module_pipeline)
+        .training(
+            gamma=0.99,
+            lr=0.0003,
+        )
+        .rl_module(
+            model_config=DefaultModelConfig(
+                fcnet_hiddens=[32],
+                fcnet_activation="linear",
+                vf_share_layers=True,
+            ),
+        )
+    )
+    # Multi-agent only: one policy "p{i}" per agent; agent ID `aid` maps to "p{aid}".
+    if args.num_agents > 0:
+        base_config.multi_agent(
+            policies={f"p{i}" for i in range(args.num_agents)},
+            policy_mapping_fn=lambda aid, *a, **kw: f"p{aid}",
+        )
+    # PPO-specific settings (for better learning behavior only).
+    if args.algo == "PPO":
+        base_config.training(
+            num_epochs=6,
+            vf_loss_coeff=0.01,
+        )
+    # IMPALA-specific settings (for better learning behavior only).
+    elif args.algo == "IMPALA":
+        base_config.training(
+            lr=0.0005,  # overrides the lr=0.0003 set on the base config above
+            vf_loss_coeff=0.05,
+            entropy_coeff=0.0,
+        )
+    # Hand the finished config and the parsed args to RLlib's example runner.
+    run_rllib_example_script_experiment(base_config, args)

.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/frame_stacking.py ADDED Viewed

	@@ -0,0 +1,228 @@

+"""Example using 2 ConnectorV2 for observation frame-stacking in Atari environments.
+An RLlib Algorithm has 3 distinct connector pipelines:
+- An env-to-module pipeline in an EnvRunner accepting a list of episodes and producing
+a batch for an RLModule to compute actions (`forward_inference()` or
+`forward_exploration()`).
+- A module-to-env pipeline in an EnvRunner taking the RLModule's output and converting
+it into an action readable by the environment.
+- A learner connector pipeline on a Learner taking a list of episodes and producing
+a batch for an RLModule to perform the training forward pass (`forward_train()`).
+Each of these pipelines has a fixed set of default ConnectorV2 pieces that RLlib
+adds/prepends to these pipelines in order to perform the most basic functionalities.
+For example, RLlib adds the `AddObservationsFromEpisodesToBatch` ConnectorV2 into any
+env-to-module pipeline to make sure the batch for computing actions contains - at the
+minimum - the most recent observation.
+On top of these default ConnectorV2 pieces, users can define their own ConnectorV2
+pieces (or use the ones available already in RLlib) and add them to one of the 3
+different pipelines described above, as required.
+This example:
+    - shows how the `FrameStackingEnvToModule` ConnectorV2 piece can be added to the
+    env-to-module pipeline.
+    - shows how the `FrameStackingLearner` ConnectorV2 piece can be added to the
+    learner connector pipeline.
+    - demonstrates that using these two pieces (rather than performing framestacking
+    already inside the environment using a gymnasium wrapper) increases overall
+    performance by about 5%.
+How to run this script
+----------------------
+`python [script file name].py --enable-new-api-stack --num-frames=4 --env=ALE/Pong-v5`
+Use the `--num-frames` option to define the number of observations to framestack.
+If you don't want to use Connectors to perform the framestacking, set the
+`--use-gym-wrapper-framestacking` flag to perform framestacking already inside a
+gymnasium observation wrapper. In this case though, be aware that the tensors being
+sent through the network are `--num-frames` x larger than if you use the Connector
+setup.
+For debugging, use the following additional command line options
+`--no-tune --num-env-runners=0`
+which should allow you to set breakpoints anywhere in the RLlib code and
+have the execution stop there for inspection and debugging.
+For logging to your WandB account, use:
+`--wandb-key=[your WandB API key] --wandb-project=[some project name]
+--wandb-run-name=[optional: WandB run name (within the defined project)]`
+Results to expect
+-----------------
+With `--num-frames=4` and using the two extra ConnectorV2 pieces (in the env-to-module
+and learner connector pipelines), you should see something like this using:
+`--env ALE/Pong-v5 --num-learners=4 --num-gpus-per-learner=1 --num-env-runners=95`
++---------------------------+------------+--------+------------------+...
+| Trial name                | status     |   iter |   total time (s) |
+|                           |            |        |                  |
+|---------------------------+------------+--------+------------------+...
+| PPO_atari-env_2fc4a_00000 | TERMINATED |    200 |          335.837 |
++---------------------------+------------+--------+------------------+...
+Note that running these 200 iterations is about 5% faster than when
+performing framestacking already inside the environment (using a
+`gymnasium.wrappers.ObservationWrapper`), because the wrapper approach needs additional
+network traffic (sending back 4x[obs] batches instead of 1x[obs] to the learners).
+Thus, with the `--use-gym-wrapper-framestacking` option (all other options being equal),
+the output looks like this:
++---------------------------+------------+--------+------------------+...
+| Trial name                | status     |   iter |   total time (s) |
+|                           |            |        |                  |
+|---------------------------+------------+--------+------------------+...
+| PPO_atari-env_2fc4a_00000 | TERMINATED |    200 |          351.505 |
++---------------------------+------------+--------+------------------+...
+"""
+import gymnasium as gym
+from ray.rllib.connectors.env_to_module.frame_stacking import FrameStackingEnvToModule
+from ray.rllib.connectors.learner.frame_stacking import FrameStackingLearner
+from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig
+from ray.rllib.env.wrappers.atari_wrappers import wrap_atari_for_new_api_stack
+from ray.rllib.examples.envs.classes.multi_agent import make_multi_agent
+from ray.rllib.utils.test_utils import (
+    add_rllib_example_script_args,
+    run_rllib_example_script_experiment,
+)
+from ray.tune.registry import get_trainable_cls
+# Read in common example script command line arguments.
+parser = add_rllib_example_script_args(
+    default_timesteps=5000000, default_reward=20.0, default_iters=200
+)
+# Use Pong by default.
+parser.set_defaults(
+    enable_new_api_stack=True,
+    env="ale_py:ALE/Pong-v5",
+)
+parser.add_argument(
+    "--num-frames",
+    type=int,
+    default=4,
+    help="The number of observation frames to stack.",
+)
+parser.add_argument(
+    "--use-gym-wrapper-framestacking",
+    action="store_true",
+    help="Whether to use RLlib's Atari wrapper's framestacking capabilities (as "
+    "opposed to doing it via a specific ConenctorV2 pipeline).",
+)
+if __name__ == "__main__":
+    from ray import tune  # used below for `tune.register_env`
+    args = parser.parse_args()
+    # Define our custom connector factories (each returns one ConnectorV2 piece).
+    def _make_env_to_module_connector(env):
+        # Create the env-to-module connector. We return an individual connector piece
+        # here, which RLlib automatically integrates into a pipeline (and
+        # adds its default connector piece to the end of that pipeline).
+        # The default pipeline automatically fixes the input- and output spaces of the
+        # individual connector pieces in it.
+        # Note that since the frame stacking connector does NOT write information
+        # back to the episode (in order to save memory and network traffic), we
+        # also need to perform the same procedure on the Learner end (see below
+        # where we set up the Learner pipeline).
+        return FrameStackingEnvToModule(
+            num_frames=args.num_frames,
+            multi_agent=args.num_agents > 0,
+        )
+    def _make_learner_connector(input_observation_space, input_action_space):
+        # Create the learner connector (the two space arguments are not used here).
+        return FrameStackingLearner(
+            num_frames=args.num_frames,
+            multi_agent=args.num_agents > 0,
+        )
+    # Create a custom Atari setup (w/o the usual RLlib-hard-coded framestacking in it).
+    # We would like our frame stacking connector to do this job.
+    def _env_creator(cfg):
+        return wrap_atari_for_new_api_stack(
+            gym.make(args.env, **cfg, **{"render_mode": "rgb_array"}),
+            # Perform framestacking either through ConnectorV2 or right here through
+            # the observation wrapper.
+            framestack=(
+                args.num_frames if args.use_gym_wrapper_framestacking else None
+            ),
+        )
+    if args.num_agents > 0:
+        tune.register_env(
+            "atari-env",
+            lambda cfg: make_multi_agent(_env_creator)(
+                dict(cfg, **{"num_agents": args.num_agents})
+            ),
+        )
+    else:
+        tune.register_env("atari-env", _env_creator)
+    base_config = (
+        get_trainable_cls(args.algo)
+        .get_default_config()
+        .environment(
+            "atari-env",
+            env_config={
+                # Make analogous to old v4 + NoFrameskip.
+                "frameskip": 1,
+                "full_action_space": False,
+                "repeat_action_probability": 0.0,
+            },
+            clip_rewards=True,
+        )
+        .env_runners(
+            # Frame stacking env-to-module connector (None if the gym wrapper stacks).
+            env_to_module_connector=(
+                None
+                if args.use_gym_wrapper_framestacking
+                else _make_env_to_module_connector
+            ),
+            num_envs_per_env_runner=1 if args.num_agents > 0 else 2,
+        )
+        .training(
+            # Use our frame stacking learner connector (None if the gym wrapper stacks).
+            learner_connector=(
+                None if args.use_gym_wrapper_framestacking else _make_learner_connector
+            ),
+            entropy_coeff=0.01,
+            # Linearly scale the learning rate with the number of Learners.
+            lr=0.00015 * (args.num_learners or 1),
+            grad_clip=100.0,
+            grad_clip_by="global_norm",
+        )
+        .rl_module(
+            model_config=DefaultModelConfig(
+                vf_share_layers=True,
+                conv_filters=[(16, 4, 2), (32, 4, 2), (64, 4, 2), (128, 4, 2)],
+                conv_activation="relu",
+                head_fcnet_hiddens=[256],
+            ),
+        )
+    )
+    # PPO-specific settings (only applied when `--algo=PPO`).
+    if args.algo == "PPO":
+        base_config.training(
+            num_epochs=10,
+            minibatch_size=64,
+            lambda_=0.95,
+            kl_coeff=0.5,
+            clip_param=0.1,
+            vf_clip_param=10.0,
+        )
+    # Multi-agent only: one policy "p{i}" per agent; agent ID `aid` maps to "p{aid}".
+    if args.num_agents > 0:
+        base_config.multi_agent(
+            policies={f"p{i}" for i in range(args.num_agents)},
+            policy_mapping_fn=lambda aid, *a, **kw: f"p{aid}",
+        )
+    # Hand the finished config and the parsed args to RLlib's example runner.
+    run_rllib_example_script_experiment(base_config, args)

.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/mean_std_filtering.py ADDED Viewed

	@@ -0,0 +1,198 @@

+"""Example using a ConnectorV2 for processing observations with a mean/std filter.
+An RLlib Algorithm has 3 distinct connector pipelines:
+- An env-to-module pipeline in an EnvRunner accepting a list of episodes and producing
+a batch for an RLModule to compute actions (`forward_inference()` or
+`forward_exploration()`).
+- A module-to-env pipeline in an EnvRunner taking the RLModule's output and converting
+it into an action readable by the environment.
+- A learner connector pipeline on a Learner taking a list of episodes and producing
+a batch for an RLModule to perform the training forward pass (`forward_train()`).
+Each of these pipelines has a fixed set of default ConnectorV2 pieces that RLlib
+adds/prepends to these pipelines in order to perform the most basic functionalities.
+For example, RLlib adds the `AddObservationsFromEpisodesToBatch` ConnectorV2 into any
+env-to-module pipeline to make sure the batch for computing actions contains - at the
+minimum - the most recent observation.
+On top of these default ConnectorV2 pieces, users can define their own ConnectorV2
+pieces (or use the ones available already in RLlib) and add them to one of the 3
+different pipelines described above, as required.
+This example:
+    - shows how the `MeanStdFilter` ConnectorV2 piece can be added to the env-to-module
+    pipeline.
+    - demonstrates that using such a filter enhances learning behavior (or even makes
+    it possible to learn at all) in some environments, especially those with lopsided
+    observation spaces, for example `Box(-3000, -1000, ...)`.
+How to run this script
+----------------------
+`python [script file name].py --enable-new-api-stack`
+For debugging, use the following additional command line options
+`--no-tune --num-env-runners=0`
+which should allow you to set breakpoints anywhere in the RLlib code and
+have the execution stop there for inspection and debugging.
+For logging to your WandB account, use:
+`--wandb-key=[your WandB API key] --wandb-project=[some project name]
+--wandb-run-name=[optional: WandB run name (within the defined project)]`
+Results to expect
+-----------------
+Running this example with the mean-std filter results in the normally expected Pendulum
+learning behavior:
++-------------------------------+------------+-----------------+--------+
+| Trial name                    | status     | loc             |   iter |
+|                               |            |                 |        |
+|-------------------------------+------------+-----------------+--------+
+| PPO_lopsided-pend_f9c96_00000 | TERMINATED | 127.0.0.1:43612 |     77 |
++-------------------------------+------------+-----------------+--------+
++------------------+------------------------+-----------------------+
+|   total time (s) |   num_env_steps_sample |   episode_return_mean |
+|                  |             d_lifetime |                       |
+|------------------+------------------------+-----------------------|
+|          30.7466 |                  40040 |                -276.3 |
++------------------+------------------------+-----------------------+
+If you use the `--disable-mean-std-filter` flag (all other things being equal), you
+will see either no learning progress at all or a very slow one, but more likely some
+numerical-instability-related error will be thrown:
+ValueError: Expected parameter loc (Tensor of shape (64, 1)) of distribution
+            Normal(loc: torch.Size([64, 1]), scale: torch.Size([64, 1])) to satisfy the
+            constraint Real(), but found invalid values:
+tensor([[nan],
+        [nan],
+        [nan],
+        ...
+"""
+import gymnasium as gym
+import numpy as np
+from ray.rllib.connectors.env_to_module.mean_std_filter import MeanStdFilter
+from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig
+from ray.rllib.examples.envs.classes.multi_agent import MultiAgentPendulum
+from ray.rllib.utils.framework import try_import_torch
+from ray.rllib.utils.test_utils import (
+    add_rllib_example_script_args,
+    run_rllib_example_script_experiment,
+)
+from ray.tune.registry import get_trainable_cls, register_env
+torch, _ = try_import_torch()
+parser = add_rllib_example_script_args(
+    default_iters=500,
+    default_timesteps=500000,
+    default_reward=-300.0,
+)
+parser.add_argument(
+    "--disable-mean-std-filter",
+    action="store_true",
+    help="Run w/o a mean/std env-to-module connector piece (filter).",
+)
+class LopsidedObs(gym.ObservationWrapper):
+    def __init__(self, env):
+        super().__init__(env)
+        self.observation_space = gym.spaces.Box(-4000.0, -1456.0, (3,), np.float32)
+    def observation(self, observation):
+        # Lopside [-1.0, 1.0] Pendulum observations
+        return ((observation + 1.0) / 2.0) * (4000.0 - 1456.0) - 4000.0
+if __name__ == "__main__":
+    args = parser.parse_args()
+    assert (
+        args.enable_new_api_stack
+    ), "Must set --enable-new-api-stack when running this script!"
+    # Register our environment with tune.
+    if args.num_agents > 0:
+        register_env(
+            "lopsided-pend",
+            lambda _: MultiAgentPendulum(config={"num_agents": args.num_agents}),
+        )
+    else:
+        register_env("lopsided-pend", lambda _: LopsidedObs(gym.make("Pendulum-v1")))
+    base_config = (
+        get_trainable_cls(args.algo)
+        .get_default_config()
+        .environment("lopsided-pend")
+        .env_runners(
+            # TODO (sven): MAEnvRunner does not support vectorized envs yet
+            #  due to gym's env checkers and non-compatability with RLlib's
+            #  MultiAgentEnv API.
+            num_envs_per_env_runner=1 if args.num_agents > 0 else 20,
+            # Define a single connector piece to be prepended to the env-to-module
+            # connector pipeline.
+            # Alternatively, return a list of n ConnectorV2 pieces (which will then be
+            # included in an automatically generated EnvToModulePipeline or return a
+            # EnvToModulePipeline directly.
+            env_to_module_connector=(
+                None
+                if args.disable_mean_std_filter
+                else lambda env: MeanStdFilter(multi_agent=args.num_agents > 0)
+            ),
+        )
+        .training(
+            train_batch_size_per_learner=512,
+            gamma=0.95,
+            # Linearly adjust learning rate based on number of GPUs.
+            lr=0.0003 * (args.num_learners or 1),
+            vf_loss_coeff=0.01,
+        )
+        .rl_module(
+            model_config=DefaultModelConfig(
+                fcnet_activation="relu",
+                fcnet_kernel_initializer=torch.nn.init.xavier_uniform_,
+                fcnet_bias_initializer=torch.nn.init.constant_,
+                fcnet_bias_initializer_kwargs={"val": 0.0},
+            ),
+        )
+        # In case you would like to run with a evaluation EnvRunners, make sure your
+        # `evaluation_config` key contains the `use_worker_filter_stats=False` setting
+        # (see below). This setting makes sure that the mean/std stats collected by the
+        # evaluation EnvRunners are NOT used for the training EnvRunners (unless you
+        # really want to mix these stats). It's normally a good idea to keep the stats
+        # collected during evaluation completely out of the training data (already for
+        # better reproducibility alone).
+        # .evaluation(
+        #    evaluation_num_env_runners=1,
+        #    evaluation_interval=1,
+        #    evaluation_config={
+        #        "explore": False,
+        #        # Do NOT use the eval EnvRunners' ConnectorV2 states. Instead, before
+        #        # each round of evaluation, broadcast the latest training
+        #        # EnvRunnerGroup's ConnectorV2 states (merged from all training remote
+        #        # EnvRunners) to the eval EnvRunnerGroup (and discard the eval
+        #        # EnvRunners' stats).
+        #        "use_worker_filter_stats": False,
+        #    },
+        # )
+    )
+    # PPO specific settings.
+    if args.algo == "PPO":
+        base_config.training(
+            minibatch_size=64,
+            lambda_=0.1,
+            vf_clip_param=10.0,
+        )
+    # Add a simple multi-agent setup.
+    if args.num_agents > 0:
+        base_config.multi_agent(
+            policies={f"p{i}" for i in range(args.num_agents)},
+            policy_mapping_fn=lambda aid, *a, **kw: f"p{aid}",
+        )
+    run_rllib_example_script_experiment(base_config, args)

.venv/lib/python3.11/site-packages/ray/rllib/examples/connectors/prev_actions_prev_rewards.py ADDED Viewed

	@@ -0,0 +1,164 @@

+"""Example using a ConnectorV2 to add previous rewards/actions to an RLModule's input.
+An RLlib Algorithm has 3 distinct connector pipelines:
+- An env-to-module pipeline in an EnvRunner accepting a list of episodes and producing
+a batch for an RLModule to compute actions (`forward_inference()` or
+`forward_exploration()`).
+- A module-to-env pipeline in an EnvRunner taking the RLModule's output and converting
+it into an action readable by the environment.
+- A learner connector pipeline on a Learner taking a list of episodes and producing
+a batch for an RLModule to perform the training forward pass (`forward_train()`).
+Each of these pipelines has a fixed set of default ConnectorV2 pieces that RLlib
+adds/prepends to these pipelines in order to perform the most basic functionalities.
+For example, RLlib adds the `AddObservationsFromEpisodesToBatch` ConnectorV2 into any
+env-to-module pipeline to make sure the batch for computing actions contains - at the
+minimum - the most recent observation.
+On top of these default ConnectorV2 pieces, users can define their own ConnectorV2
+pieces (or use the ones available already in RLlib) and add them to one of the 3
+different pipelines described above, as required.
+This example:
+    - shows how the `PrevActionsPrevRewards` ConnectorV2 piece can be added to the
+    env-to-module pipeline to extract previous rewards and/or actions from the ongoing
+    episodes.
+    - shows how this connector creates and wraps this new information (rewards and
+    actions) together with the original observations into the RLModule's input dict
+    under a new `gym.spaces.Dict` structure (for example, if your observation space
+    is `O=Box(shape=(3,))` and you add the most recent 1 reward, the new observation
+    space will be `Dict({"_original_obs": O, "prev_n_rewards": Box(shape=())})`).
+    - demonstrates how to use RLlib's `FlattenObservations` right after the
+    `PrevActionsPrevRewards` to flatten that new dict observation structure again into
+    a single 1D tensor.
+    - uses the StatelessCartPole environment, a CartPole-v1 derivative that's missing
+    both x-veloc and angle-veloc observation components and is therefore non-Markovian
+    (only partially observable). An LSTM default model is used for training. Adding
+    the additional context to the observations (for example, prev. actions) helps the
+    LSTM to more quickly learn in this environment.
+How to run this script
+----------------------
+`python [script file name].py --enable-new-api-stack --n-prev-rewards=1
+--n-prev-actions=1`
+Use the `--n-prev-rewards` and `--n-prev-actions` options to define how many of the
+most recent rewards and actions, respectively, the `PrevActionsPrevRewards` connector
+adds to the observations.
+For debugging, use the following additional command line options
+`--no-tune --num-env-runners=0`
+which should allow you to set breakpoints anywhere in the RLlib code and
+have the execution stop there for inspection and debugging.
+For logging to your WandB account, use:
+`--wandb-key=[your WandB API key] --wandb-project=[some project name]
+--wandb-run-name=[optional: WandB run name (within the defined project)]`
+Results to expect
+-----------------
+You should see something similar to this in your terminal output when running
+this script as described above:
++---------------------+------------+-----------------+--------+------------------+
+| Trial name          | status     | loc             |   iter |   total time (s) |
+|                     |            |                 |        |                  |
+|---------------------+------------+-----------------+--------+------------------+
+| PPO_env_0edd2_00000 | TERMINATED | 127.0.0.1:12632 |     17 |          42.6898 |
++---------------------+------------+-----------------+--------+------------------+
++------------------------+------------------------+------------------------+
+|   num_env_steps_sample |   num_env_steps_traine |   episode_return_mean  |
+|             d_lifetime |             d_lifetime |                        |
+|------------------------+------------------------+------------------------|
+|                  68000 |                  68000 |                 205.22 |
++------------------------+------------------------+------------------------+
+"""
+from ray.rllib.algorithms.ppo import PPOConfig
+from ray.rllib.connectors.env_to_module import (
+    FlattenObservations,
+    PrevActionsPrevRewards,
+)
+from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig
+from ray.rllib.examples.envs.classes.stateless_cartpole import StatelessCartPole
+from ray.rllib.examples.envs.classes.multi_agent import MultiAgentStatelessCartPole
+from ray.rllib.utils.framework import try_import_torch
+from ray.rllib.utils.test_utils import (
+    add_rllib_example_script_args,
+    run_rllib_example_script_experiment,
+)
+from ray.tune import register_env
+torch, nn = try_import_torch()  # `nn` supplies the initializers used below
+parser = add_rllib_example_script_args(
+    default_reward=200.0, default_timesteps=1000000, default_iters=2000
+)
+parser.set_defaults(enable_new_api_stack=True)  # new API stack is on by default
+parser.add_argument("--n-prev-rewards", type=int, default=1)  # -> `n_prev_rewards`
+parser.add_argument("--n-prev-actions", type=int, default=1)  # -> `n_prev_actions`
+if __name__ == "__main__":
+    args = parser.parse_args()
+    # Factory for the env-to-module connector pieces; `env` is accepted but unused.
+    def _env_to_module(env):
+        # Return the pieces in order: add prev. actions/rewards first, then flatten.
+        return [
+            PrevActionsPrevRewards(
+                multi_agent=args.num_agents > 0,
+                n_prev_rewards=args.n_prev_rewards,
+                n_prev_actions=args.n_prev_actions,
+            ),
+            FlattenObservations(multi_agent=args.num_agents > 0),
+        ]
+    # Register the single- or multi-agent StatelessCartPole under the name "env".
+    if args.num_agents > 0:
+        register_env(
+            "env",
+            lambda _: MultiAgentStatelessCartPole(
+                config={"num_agents": args.num_agents}
+            ),
+        )
+    else:
+        register_env("env", lambda _: StatelessCartPole())
+    config = (
+        PPOConfig()  # this script always uses PPO (`args.algo` is not consulted here)
+        .environment("env")
+        .env_runners(env_to_module_connector=_env_to_module)
+        .training(
+            num_epochs=6,
+            lr=0.0003,
+            train_batch_size=4000,
+            vf_loss_coeff=0.01,
+        )
+        .rl_module(
+            model_config=DefaultModelConfig(
+                use_lstm=True,
+                max_seq_len=20,
+                fcnet_hiddens=[32],
+                fcnet_activation="linear",
+                fcnet_kernel_initializer=nn.init.xavier_uniform_,
+                fcnet_bias_initializer=nn.init.constant_,
+                fcnet_bias_initializer_kwargs={"val": 0.0},
+                vf_share_layers=True,
+            ),
+        )
+    )
+    # Multi-agent only: one policy "p{i}" per agent; agent ID `aid` maps to "p{aid}".
+    if args.num_agents > 0:
+        config = config.multi_agent(
+            policies={f"p{i}" for i in range(args.num_agents)},
+            policy_mapping_fn=lambda aid, *a, **kw: f"p{aid}",
+        )
+    run_rllib_example_script_experiment(config, args)  # run as configured

.venv/lib/python3.11/site-packages/ray/rllib/examples/evaluation/custom_evaluation.py ADDED Viewed

	@@ -0,0 +1,230 @@

+"""Example of customizing the evaluation procedure for an RLlib Algorithm.
+Note that you should only provide a custom eval function if the built-in eval options
+are not sufficient. Normally, though, RLlib's eval utilities
+that come with each Algorithm are enough to properly evaluate the learning progress
+of your Algorithm.
+This script uses the SimpleCorridor environment, a simple 1D gridworld, in which
+the agent can only walk left (action=0) or right (action=1). The goal state is located
+at the end of the (1D) corridor. The env exposes an API to change the length of the
+corridor on-the-fly. We use this API here to extend the size of the corridor for the
+evaluation runs.
+For demonstration purposes only, we define a simple custom evaluation method that does
+the following:
+- It changes the corridor length of all environments used on the evaluation EnvRunners.
+- It runs a defined number of episodes for evaluation purposes.
+- It collects the metrics from those runs, summarizes these metrics and returns them.
+How to run this script
+----------------------
+`python [script file name].py --enable-new-api-stack`
+You can switch off custom evaluation (and use RLlib's default evaluation procedure)
+with the `--no-custom-eval` flag.
+You can switch on parallel evaluation to training using the
+`--evaluation-parallel-to-training` flag. See this example script here:
+https://github.com/ray-project/ray/blob/master/rllib/examples/evaluation/evaluation_parallel_to_training.py  # noqa
+for more details on running evaluation parallel to training.
+For debugging, use the following additional command line options
+`--no-tune --num-env-runners=0`
+which should allow you to set breakpoints anywhere in the RLlib code and
+have the execution stop there for inspection and debugging.
+For logging to your WandB account, use:
+`--wandb-key=[your WandB API key] --wandb-project=[some project name]
+--wandb-run-name=[optional: WandB run name (within the defined project)]`
+Results to expect
+-----------------
+You should see the following (or very similar) console output when running this script.
+Note that for each iteration, due to the definition of our custom evaluation function,
+we run 3 evaluation rounds per single training round.
+...
+Training iteration 1 -> evaluation round 0
+Training iteration 1 -> evaluation round 1
+Training iteration 1 -> evaluation round 2
+...
+...
++--------------------------------+------------+-----------------+--------+
+| Trial name                     | status     | loc             |   iter |
+|--------------------------------+------------+-----------------+--------+
+| PPO_SimpleCorridor_06582_00000 | TERMINATED | 127.0.0.1:69905 |      4 |
++--------------------------------+------------+-----------------+--------+
++------------------+-------+----------+--------------------+
+|   total time (s) |    ts |   reward |   episode_len_mean |
+|------------------+-------+----------+--------------------|
+|          26.1973 | 16000 | 0.872034 |            13.7966 |
++------------------+-------+----------+--------------------+
+"""
+from typing import Tuple
+from ray.air.constants import TRAINING_ITERATION
+from ray.rllib.algorithms.algorithm import Algorithm
+from ray.rllib.algorithms.algorithm_config import AlgorithmConfig
+from ray.rllib.env.env_runner_group import EnvRunnerGroup
+from ray.rllib.examples.envs.classes.simple_corridor import SimpleCorridor
+from ray.rllib.utils.metrics import (
+    ENV_RUNNER_RESULTS,
+    EVALUATION_RESULTS,
+    EPISODE_RETURN_MEAN,
+    NUM_ENV_STEPS_SAMPLED_LIFETIME,
+)
+from ray.rllib.utils.test_utils import (
+    add_rllib_example_script_args,
+    run_rllib_example_script_experiment,
+)
+from ray.rllib.utils.typing import ResultDict
+from ray.tune.registry import get_trainable_cls
+parser = add_rllib_example_script_args(
+    default_iters=50, default_reward=0.7, default_timesteps=50000
+)
+parser.add_argument("--no-custom-eval", action="store_true")
+parser.add_argument("--corridor-length-training", type=int, default=10)
+parser.add_argument("--corridor-length-eval-worker-1", type=int, default=20)
+parser.add_argument("--corridor-length-eval-worker-2", type=int, default=30)
+def custom_eval_function(
+    algorithm: Algorithm,
+    eval_workers: EnvRunnerGroup,
+) -> Tuple[ResultDict, int, int]:
+    """Example of a custom evaluation function.
+    Args:
+        algorithm: Algorithm class to evaluate.
+        eval_workers: Evaluation EnvRunnerGroup.
+    Returns:
+        metrics: Evaluation metrics dict.
+    """
+    # Set different env settings for each (eval) EnvRunner. Here we use the EnvRunner's
+    # `worker_index` property to figure out the actual length.
+    # Loop through all workers and all sub-envs (gym.Env) on each worker and call the
+    # `set_corridor_length` method on these.
+    eval_workers.foreach_env_runner(
+        func=lambda worker: (
+            env.unwrapped.set_corridor_length(
+                args.corridor_length_eval_worker_1
+                if worker.worker_index == 1
+                else args.corridor_length_eval_worker_2
+            )
+            for env in worker.env.unwrapped.envs
+        )
+    )
+    # Collect metrics results collected by eval workers in this list for later
+    # processing.
+    env_runner_metrics = []
+    sampled_episodes = []
+    # For demonstration purposes, run through some number of evaluation
+    # rounds within this one call. Note that this function is called once per
+    # training iteration (`Algorithm.train()` call) OR once per `Algorithm.evaluate()`
+    # (which can be called manually by the user).
+    for i in range(3):
+        print(f"Training iteration {algorithm.iteration} -> evaluation round {i}")
+        # Sample episodes from the EnvRunners AND have them return only the thus
+        # collected metrics.
+        episodes_and_metrics_all_env_runners = eval_workers.foreach_env_runner(
+            # Return only the metrics, NOT the sampled episodes (we don't need them
+            # anymore).
+            func=lambda worker: (worker.sample(), worker.get_metrics()),
+            local_env_runner=False,
+        )
+        sampled_episodes.extend(
+            eps
+            for eps_and_mtrcs in episodes_and_metrics_all_env_runners
+            for eps in eps_and_mtrcs[0]
+        )
+        env_runner_metrics.extend(
+            eps_and_mtrcs[1] for eps_and_mtrcs in episodes_and_metrics_all_env_runners
+        )
+    # You can compute metrics from the episodes manually, or use the Algorithm's
+    # convenient MetricsLogger to store all evaluation metrics inside the main
+    # algo.
+    algorithm.metrics.merge_and_log_n_dicts(
+        env_runner_metrics, key=(EVALUATION_RESULTS, ENV_RUNNER_RESULTS)
+    )
+    eval_results = algorithm.metrics.reduce(
+        key=(EVALUATION_RESULTS, ENV_RUNNER_RESULTS)
+    )
+    # Alternatively, you could manually reduce over the n returned `env_runner_metrics`
+    # dicts, but this would be much harder as you might not know, which metrics
+    # to sum up, which ones to average over, etc..
+    # Compute env and agent steps from sampled episodes.
+    env_steps = sum(eps.env_steps() for eps in sampled_episodes)
+    agent_steps = sum(eps.agent_steps() for eps in sampled_episodes)
+    return eval_results, env_steps, agent_steps
+if __name__ == "__main__":
+    args = parser.parse_args()
+    args.local_mode = True
+    base_config = (
+        get_trainable_cls(args.algo)
+        .get_default_config()
+        # For training, we use a corridor length of n. For evaluation, we use different
+        # values, depending on the eval worker index (1 or 2).
+        .environment(
+            SimpleCorridor,
+            env_config={"corridor_length": args.corridor_length_training},
+        )
+        .evaluation(
+            # Do we use the custom eval function defined above?
+            custom_evaluation_function=(
+                None if args.no_custom_eval else custom_eval_function
+            ),
+            # Number of eval EnvRunners to use.
+            evaluation_num_env_runners=2,
+            # Enable evaluation, once per training iteration.
+            evaluation_interval=1,
+            # Run 10 episodes each time evaluation runs (OR "auto" if parallel to
+            # training).
+            evaluation_duration="auto" if args.evaluation_parallel_to_training else 10,
+            # Evaluate parallelly to training?
+            evaluation_parallel_to_training=args.evaluation_parallel_to_training,
+            # Override the env settings for the eval workers.
+            # Note, though, that this setting here is only used in case --no-custom-eval
+            # is set, b/c in case the custom eval function IS used, we override the
+            # length of the eval environments in that custom function, so this setting
+            # here is simply ignored.
+            evaluation_config=AlgorithmConfig.overrides(
+                env_config={"corridor_length": args.corridor_length_training * 2},
+                # TODO (sven): Add support for window=float(inf) and reduce=mean for
+                #  evaluation episode_return_mean reductions (identical to old stack
+                #  behavior, which does NOT use a window (100 by default) to reduce
+                #  eval episode returns.
+                metrics_num_episodes_for_smoothing=5,
+            ),
+        )
+    )
+    stop = {
+        TRAINING_ITERATION: args.stop_iters,
+        f"{EVALUATION_RESULTS}/{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": (
+            args.stop_reward
+        ),
+        NUM_ENV_STEPS_SAMPLED_LIFETIME: args.stop_timesteps,
+    }
+    run_rllib_example_script_experiment(
+        base_config,
+        args,
+        stop=stop,
+        success_metric={
+            f"{EVALUATION_RESULTS}/{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": (
+                args.stop_reward
+            ),
+        },
+    )

.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/__init__.py ADDED Viewed

File without changes

.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (196 Bytes). View file

.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/__pycache__/float16_training_and_inference.cpython-311.pyc ADDED Viewed

Binary file (12.6 kB). View file

.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/__pycache__/fractional_gpus_per_learner.cpython-311.pyc ADDED Viewed

Binary file (5.57 kB). View file

.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/__pycache__/gpus_on_env_runners.cpython-311.pyc ADDED Viewed

Binary file (3.66 kB). View file

.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/__pycache__/mixed_precision_training_float16_inference.cpython-311.pyc ADDED Viewed

Binary file (8.71 kB). View file

.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/float16_training_and_inference.py ADDED Viewed

	@@ -0,0 +1,250 @@

+"""Example of using float16 precision for training and inference.
+This example:
+    - shows how to write a custom callback for RLlib to convert all RLModules
+    (on the EnvRunners and Learners) to float16 precision.
+    - shows how to write a custom env-to-module ConnectorV2 piece to convert all
+    observations and rewards in the collected trajectories to float16 (numpy) arrays.
+    - shows how to write a custom grad scaler for torch that is necessary to stabilize
+    learning with float16 weight matrices and gradients. This custom scaler behaves
+    exactly like the torch built-in `torch.amp.GradScaler` but also works for float16
+    gradients (which the torch built-in one doesn't).
+    - shows how to write a custom TorchLearner to change the epsilon setting (to the
+    much larger 1e-4 to stabilize learning) on the default optimizer (Adam) registered
+    for each RLModule.
+    - demonstrates how to plug in all the above custom components into an
+    `AlgorithmConfig` instance and start training (and inference) with float16
+    precision.
+How to run this script
+----------------------
+`python [script file name].py --enable-new-api-stack`
+For debugging, use the following additional command line options
+`--no-tune --num-env-runners=0`
+which should allow you to set breakpoints anywhere in the RLlib code and
+have the execution stop there for inspection and debugging.
+For logging to your WandB account, use:
+`--wandb-key=[your WandB API key] --wandb-project=[some project name]
+--wandb-run-name=[optional: WandB run name (within the defined project)]`
+You can visualize experiment results in ~/ray_results using TensorBoard.
+Results to expect
+-----------------
+You should see something similar to the following on your terminal, when running this
+script with the above recommended options:
++-----------------------------+------------+-----------------+--------+
+| Trial name                  | status     | loc             |   iter |
+|                             |            |                 |        |
+|-----------------------------+------------+-----------------+--------+
+| PPO_CartPole-v1_437ee_00000 | TERMINATED | 127.0.0.1:81045 |      6 |
++-----------------------------+------------+-----------------+--------+
++------------------+------------------------+------------------------+
+|   total time (s) |    episode_return_mean |  num_episodes_lifetime |
+|                  |                        |                        |
+|------------------+------------------------+------------------------+
+|          71.3123 |                 153.79 |                    358 |
++------------------+------------------------+------------------------+
+"""
+import gymnasium as gym
+import numpy as np
+import torch
+from ray.rllib.algorithms.algorithm import Algorithm
+from ray.rllib.algorithms.ppo.torch.ppo_torch_learner import PPOTorchLearner
+from ray.rllib.connectors.connector_v2 import ConnectorV2
+from ray.rllib.core.learner.torch.torch_learner import TorchLearner
+from ray.rllib.utils.annotations import override
+from ray.rllib.utils.test_utils import (
+    add_rllib_example_script_args,
+    run_rllib_example_script_experiment,
+)
+from ray.tune.registry import get_trainable_cls
+parser = add_rllib_example_script_args(
+    default_iters=50, default_reward=150.0, default_timesteps=100000
+)
+parser.set_defaults(
+    enable_new_api_stack=True,
+)
+def on_algorithm_init(
+    algorithm: Algorithm,
+    **kwargs,
+) -> None:
+    """Callback making sure that all RLModules in the algo are `half()`'ed."""
+    # Switch all Learner RLModules to float16.
+    algorithm.learner_group.foreach_learner(
+        lambda learner: learner.module.foreach_module(lambda mid, mod: mod.half())
+    )
+    # Switch all EnvRunner RLModules (assuming single RLModules) to float16.
+    algorithm.env_runner_group.foreach_env_runner(
+        lambda env_runner: env_runner.module.half()
+    )
+    if algorithm.eval_env_runner_group:
+        algorithm.eval_env_runner_group.foreach_env_runner(
+            lambda env_runner: env_runner.module.half()
+        )
+class WriteObsAndRewardsAsFloat16(ConnectorV2):
+    """ConnectorV2 piece preprocessing observations and rewards to be float16.
+    Note that users can also write a gymnasium.Wrapper for observations and rewards
+    to achieve the same thing.
+    """
+    def recompute_output_observation_space(
+        self,
+        input_observation_space,
+        input_action_space,
+    ):
+        return gym.spaces.Box(
+            input_observation_space.low.astype(np.float16),
+            input_observation_space.high.astype(np.float16),
+            input_observation_space.shape,
+            np.float16,
+        )
+    def __call__(self, *, rl_module, batch, episodes, **kwargs):
+        for sa_episode in self.single_agent_episode_iterator(episodes):
+            obs = sa_episode.get_observations(-1)
+            float16_obs = obs.astype(np.float16)
+            sa_episode.set_observations(new_data=float16_obs, at_indices=-1)
+            if len(sa_episode) > 0:
+                rew = sa_episode.get_rewards(-1).astype(np.float16)
+                sa_episode.set_rewards(new_data=rew, at_indices=-1)
+        return batch
+class Float16GradScaler:
+    """Custom grad scaler for `TorchLearner`.
+    This class is utilizing the experimental support for the `TorchLearner`'s support
+    for loss/gradient scaling (analogous to how a `torch.amp.GradScaler` would work).
+    TorchLearner performs the following steps using this class (`scaler`):
+    - loss_per_module = TorchLearner.compute_losses()
+    - for L in loss_per_module: L = scaler.scale(L)
+    - grads = TorchLearner.compute_gradients()  # L.backward() on scaled loss
+    - TorchLearner.apply_gradients(grads):
+        for optim in optimizers:
+            scaler.step(optim)  # <- grads should get unscaled
+            scaler.update()  # <- update scaling factor
+    """
+    def __init__(
+        self,
+        init_scale=1000.0,
+        growth_factor=2.0,
+        backoff_factor=0.5,
+        growth_interval=2000,
+    ):
+        self._scale = init_scale
+        self.growth_factor = growth_factor
+        self.backoff_factor = backoff_factor
+        self.growth_interval = growth_interval
+        self._found_inf_or_nan = False
+        self.steps_since_growth = 0
+    def scale(self, loss):
+        # Scale the loss by `self._scale`.
+        return loss * self._scale
+    def get_scale(self):
+        return self._scale
+    def step(self, optimizer):
+        # Unscale the gradients for all model parameters and apply.
+        for group in optimizer.param_groups:
+            for param in group["params"]:
+                if param.grad is not None:
+                    param.grad.data.div_(self._scale)
+                    if torch.isinf(param.grad).any() or torch.isnan(param.grad).any():
+                        self._found_inf_or_nan = True
+                        break
+            if self._found_inf_or_nan:
+                break
+        # Only step if no inf/NaN grad found.
+        if not self._found_inf_or_nan:
+            optimizer.step()
+    def update(self):
+        # If gradients are found to be inf/NaN, reduce the scale.
+        if self._found_inf_or_nan:
+            self._scale *= self.backoff_factor
+            self.steps_since_growth = 0
+        # Increase the scale after a set number of steps without inf/NaN.
+        else:
+            self.steps_since_growth += 1
+            if self.steps_since_growth >= self.growth_interval:
+                self._scale *= self.growth_factor
+                self.steps_since_growth = 0
+        # Reset inf/NaN flag.
+        self._found_inf_or_nan = False
+class LargeEpsAdamTorchLearner(PPOTorchLearner):
+    """A TorchLearner overriding the default optimizer (Adam) to use non-default eps."""
+    @override(TorchLearner)
+    def configure_optimizers_for_module(self, module_id, config):
+        """Registers an Adam optimizer with a larg epsilon under the given module_id."""
+        params = list(self._module[module_id].parameters())
+        # Register one Adam optimizer (under the default optimizer name:
+        # DEFAULT_OPTIMIZER) for the `module_id`.
+        self.register_optimizer(
+            module_id=module_id,
+            # Create an Adam optimizer with a different eps for better float16
+            # stability.
+            optimizer=torch.optim.Adam(params, eps=1e-4),
+            params=params,
+            # Let RLlib handle the learning rate/learning rate schedule.
+            # You can leave `lr_or_lr_schedule` at None, but then you should
+            # pass a fixed learning rate into the Adam constructor above.
+            lr_or_lr_schedule=config.lr,
+        )
+if __name__ == "__main__":
+    args = parser.parse_args()
+    base_config = (
+        get_trainable_cls(args.algo)
+        .get_default_config()
+        .environment("CartPole-v1")
+        # Plug in our custom callback (on_algorithm_init) to make all RLModules
+        # float16 models.
+        .callbacks(on_algorithm_init=on_algorithm_init)
+        # Plug in our custom loss scaler class to stabilize gradient computations
+        # (by scaling the loss, then unscaling the gradients before applying them).
+        # This is using the built-in, experimental feature of TorchLearner.
+        .experimental(_torch_grad_scaler_class=Float16GradScaler)
+        # Plug in our custom env-to-module ConnectorV2 piece to convert all observations
+        # and reward in the episodes (permanently) to float16.
+        .env_runners(env_to_module_connector=lambda env: WriteObsAndRewardsAsFloat16())
+        .training(
+            # Plug in our custom TorchLearner (using a much larger, stabilizing epsilon
+            # on the Adam optimizer).
+            learner_class=LargeEpsAdamTorchLearner,
+            # Switch off grad clipping entirely b/c we use our custom grad scaler with
+            # built-in inf/nan detection (see `step` method of `Float16GradScaler`).
+            grad_clip=None,
+            # Typical CartPole-v1 hyperparams known to work well:
+            gamma=0.99,
+            lr=0.0003,
+            num_epochs=6,
+            vf_loss_coeff=0.01,
+            use_kl_loss=True,
+        )
+    )
+    run_rllib_example_script_experiment(base_config, args)

.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/fractional_gpus_per_learner.py ADDED Viewed

	@@ -0,0 +1,119 @@

+"""Example of using fractional GPUs (< 1.0) per Learner worker.
+The number of GPUs required, just for learning (excluding those maybe needed on your
+EnvRunners, if applicable) can be computed by:
+`num_gpus = config.num_learners * config.num_gpus_per_learner`
+This example:
+  - shows how to set up an Algorithm that uses one or more Learner workers ...
+  - ... and how to assign a fractional (< 1.0) number of GPUs to each of these Learners.
+How to run this script
+----------------------
+`python [script file name].py --enable-new-api-stack --num-learners=
+[number of Learners, e.g. 1] --num-gpus-per-learner [some fraction <1.0]`
+The following command line combinations have been tested on a 4 NVIDIA T4 GPUs (16 vCPU)
+machine.
+Note that for each run, 4 tune trials will be setup; see tune.grid_search over 4
+learning rates in the `base_config` below:
+1) --num-learners=1 --num-gpus-per-learner=0.5 (2.0 GPUs used).
+2) --num-learners=1 --num-gpus-per-learner=0.3 (1.2 GPUs used).
+3) --num-learners=1 --num-gpus-per-learner=0.25 (1.0 GPU used).
+4) --num-learners=2 --num-gpus-per-learner=1 (8 GPUs used).
+5) non-sensical setting: --num-learners=2 --num-gpus-per-learner=0.5 (expect an
+NCCL-related error due to the fact that torch will try to perform DDP sharding,
+but notices that the shards sit on the same GPU).
+For debugging, use the following additional command line options
+`--no-tune --num-env-runners=0`
+which should allow you to set breakpoints anywhere in the RLlib code and
+have the execution stop there for inspection and debugging.
+Note that the shown GPU settings in this script also work in case you are not
+running via tune, but instead are using the `--no-tune` command line option.
+For logging to your WandB account, use:
+`--wandb-key=[your WandB API key] --wandb-project=[some project name]
+--wandb-run-name=[optional: WandB run name (within the defined project)]`
+You can visualize experiment results in ~/ray_results using TensorBoard.
+Results to expect
+-----------------
+In the console output, you can see that only fractional GPUs are being used by RLlib:
+== Status ==
+...
+Logical resource usage: 12.0/16 CPUs, 1.0/4 GPUs (...)
+...
+Number of trials: 4/4 (4 RUNNING)
+The final output should look something like this:
++-----------------------------+------------+-----------------+--------+--------+
+| Trial name                  | status     | loc             |     lr |   iter |
+|                             |            |                 |        |        |
+|-----------------------------+------------+-----------------+--------+--------+
+| PPO_CartPole-v1_7104b_00000 | TERMINATED | 10.0.0.39:31197 | 0.005  |     10 |
+| PPO_CartPole-v1_7104b_00001 | TERMINATED | 10.0.0.39:31202 | 0.003  |     11 |
+| PPO_CartPole-v1_7104b_00002 | TERMINATED | 10.0.0.39:31203 | 0.001  |     10 |
+| PPO_CartPole-v1_7104b_00003 | TERMINATED | 10.0.0.39:31204 | 0.0001 |     11 |
++-----------------------------+------------+-----------------+--------+--------+
++----------------+----------------------+----------------------+----------------------+
+| total time (s) | num_env_steps_sample | num_env_steps_traine | num_episodes_lifetim |
+|                |           d_lifetime |           d_lifetime |                    e |
+|----------------+----------------------+----------------------+----------------------|
+|        101.002 |                40000 |                40000 |                  346 |
+|        110.03  |                44000 |                44000 |                  395 |
+|        101.171 |                40000 |                40000 |                  328 |
+|        110.091 |                44000 |                44000 |                  478 |
++----------------+----------------------+----------------------+----------------------+
+"""
+from ray import tune
+from ray.rllib.utils.test_utils import (
+    add_rllib_example_script_args,
+    run_rllib_example_script_experiment,
+)
+from ray.tune.registry import get_trainable_cls
+parser = add_rllib_example_script_args(
+    default_iters=50, default_reward=180, default_timesteps=100000
+)
+parser.set_defaults(
+    enable_new_api_stack=True,
+    num_env_runners=2,
+)
+if __name__ == "__main__":
+    args = parser.parse_args()
+    base_config = (
+        get_trainable_cls(args.algo)
+        .get_default_config()
+        # This script only works on the new API stack.
+        .api_stack(
+            enable_rl_module_and_learner=True,
+            enable_env_runner_and_connector_v2=True,
+        )
+        .environment("CartPole-v1")
+        # Define EnvRunner scaling.
+        .env_runners(num_env_runners=args.num_env_runners)
+        # Define Learner scaling.
+        .learners(
+            # How many Learner workers do we need? If you have more than 1 GPU,
+            # set this parameter to the number of GPUs available.
+            num_learners=args.num_learners,
+            # How many GPUs does each Learner need? If you have more than 1 GPU or only
+            # one Learner, you should set this to 1, otherwise, set this to some
+            # fraction.
+            num_gpus_per_learner=args.num_gpus_per_learner,
+        )
+        # 4 tune trials altogether.
+        .training(lr=tune.grid_search([0.005, 0.003, 0.001, 0.0001]))
+    )
+    run_rllib_example_script_experiment(base_config, args, keep_config=True)

.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/gpus_on_env_runners.py ADDED Viewed

	@@ -0,0 +1,85 @@

+"""Example of using GPUs on the EnvRunners (b/c Env and/or RLModule require these).
+The number of GPUs required, just for your EnvRunners (excluding those needed for
+training your RLModule) can be computed by:
+`num_gpus = config.num_env_runners * config.num_gpus_per_env_runner`
+This example:
+  - shows how to write an Env that uses the GPU.
+  - shows how to configure your algorithm such that it allocates any number of GPUs
+  (including fractional < 1.0) to each (remote) EnvRunner worker.
+How to run this script
+----------------------
+`python [script file name].py --enable-new-api-stack --num-env-runners=
+[number of EnvRunners, e.g. 2] --num-gpus-per-env-runner [int or some fraction <1.0]`
+The following command line combinations have been tested on a 4 NVIDIA T4 GPUs (16 vCPU)
+machine.
+TODO (sven): Fix these
+Note that for each run, 4 tune trials will be setup; see tune.grid_search over 4
+learning rates in the `base_config` below:
+1) --num-learners=1 --num-gpus-per-learner=0.5 (2.0 GPUs used).
+2) --num-learners=1 --num-gpus-per-learner=0.3 (1.2 GPUs used).
+3) --num-learners=1 --num-gpus-per-learner=0.25 (1.0 GPU used).
+4) non-sensical setting: --num-learners=2 --num-gpus-per-learner=0.5 (expect an
+NCCL-related error due to the fact that torch will try to perform DDP sharding,
+but notices that the shards sit on the same GPU).
+For debugging, use the following additional command line options
+`--no-tune --num-env-runners=0`
+which should allow you to set breakpoints anywhere in the RLlib code and
+have the execution stop there for inspection and debugging.
+Note that the shown GPU settings in this script also work in case you are not
+running via tune, but instead are using the `--no-tune` command line option.
+For logging to your WandB account, use:
+`--wandb-key=[your WandB API key] --wandb-project=[some project name]
+--wandb-run-name=[optional: WandB run name (within the defined project)]`
+You can visualize experiment results in ~/ray_results using TensorBoard.
+Results to expect
+-----------------
+In the console output, you can see that only fractional GPUs are being used by RLlib:
+"""
+from ray.rllib.examples.envs.classes.gpu_requiring_env import GPURequiringEnv
+from ray.rllib.utils.test_utils import (
+    add_rllib_example_script_args,
+    run_rllib_example_script_experiment,
+)
+from ray.tune.registry import get_trainable_cls
+parser = add_rllib_example_script_args(
+    default_iters=50, default_reward=0.9, default_timesteps=100000
+)
+parser.set_defaults(
+    enable_new_api_stack=True,
+    num_env_runners=2,
+)
+parser.add_argument("--num-gpus-per-env-runner", type=float, default=0.5)
+if __name__ == "__main__":
+    args = parser.parse_args()
+    base_config = (
+        get_trainable_cls(args.algo)
+        .get_default_config()
+        .environment(GPURequiringEnv)
+        # Define Learner scaling.
+        .env_runners(
+            # How many EnvRunner workers do we need?
+            num_env_runners=args.num_env_runners,
+            # How many GPUs does each EnvRunner require? Note that the memory on (a
+            # possibly fractional GPU) must be enough to accommodate the RLModule AND
+            # if applicable also the Env's GPU needs).
+            num_gpus_per_env_runner=args.num_gpus_per_env_runner,
+        )
+    )
+    run_rllib_example_script_experiment(base_config, args)

.venv/lib/python3.11/site-packages/ray/rllib/examples/gpus/mixed_precision_training_float16_inference.py ADDED Viewed

	@@ -0,0 +1,170 @@

+"""Example of using automatic mixed precision training on a torch RLModule.
+This example:
+    - shows how to write a custom callback for RLlib to convert those RLModules
+    only(!) on the EnvRunners to float16 precision.
+    - shows how to write a custom env-to-module ConnectorV2 piece to add float16
+    observations to the action computing forward batch on the EnvRunners, but NOT
+    permanently write these changes into the episodes, such that on the
+    Learner side, the original float32 observations will be used (for the mixed
+    precision `forward_train` and `loss` computations).
+    - shows how to plugin torch's built-in `GradScaler` class to be used by the
+    TorchLearner to scale losses and unscale gradients in order to gain more stability
+    when training with mixed precision.
+    - shows how to write a custom TorchLearner to run the update step (overrides
+    `_update()`) within a `torch.amp.autocast()` context. This makes sure that the
+    computations of the update step run with automatic mixed precision.
+    - demonstrates how to plug in all the above custom components into an
+    `AlgorithmConfig` instance and start training with mixed-precision while
+    performing the inference on the EnvRunners with float16 precision.
+How to run this script
+----------------------
+`python [script file name].py --enable-new-api-stack`
+For debugging, use the following additional command line options
+`--no-tune --num-env-runners=0`
+which should allow you to set breakpoints anywhere in the RLlib code and
+have the execution stop there for inspection and debugging.
+Note that the shown GPU settings in this script also work in case you are not
+running via tune, but instead are using the `--no-tune` command line option.
+For logging to your WandB account, use:
+`--wandb-key=[your WandB API key] --wandb-project=[some project name]
+--wandb-run-name=[optional: WandB run name (within the defined project)]`
+You can visualize experiment results in ~/ray_results using TensorBoard.
+Results to expect
+-----------------
+In the console output, you should see something like this:
++-----------------------------+------------+-----------------+--------+
+| Trial name                  | status     | loc             |   iter |
+|                             |            |                 |        |
+|-----------------------------+------------+-----------------+--------+
+| PPO_CartPole-v1_485af_00000 | TERMINATED | 127.0.0.1:81045 |     22 |
++-----------------------------+------------+-----------------+--------+
++------------------+------------------------+------------------------+
+|   total time (s) |    episode_return_mean |  num_episodes_lifetime |
+|                  |                        |                        |
+|------------------+------------------------+------------------------+
+|         281.3231 |                 455.81 |                   1426 |
++------------------+------------------------+------------------------+
+"""
+import gymnasium as gym
+import numpy as np
+import torch
+from ray.rllib.algorithms.algorithm import Algorithm
+from ray.rllib.algorithms.ppo import PPOConfig
+from ray.rllib.algorithms.ppo.torch.ppo_torch_learner import PPOTorchLearner
+from ray.rllib.connectors.connector_v2 import ConnectorV2
+from ray.rllib.utils.test_utils import (
+    add_rllib_example_script_args,
+    run_rllib_example_script_experiment,
+)
+parser = add_rllib_example_script_args(
+    default_iters=200, default_reward=450.0, default_timesteps=200000
+)
+parser.set_defaults(
+    algo="PPO",
+    enable_new_api_stack=True,
+)
+def on_algorithm_init(
+    algorithm: Algorithm,
+    **kwargs,
+) -> None:
+    """Callback making sure that all RLModules in the algo are `half()`'ed."""
+    # Switch all EnvRunner RLModules (assuming single RLModules) to float16.
+    algorithm.env_runner_group.foreach_env_runner(
+        lambda env_runner: env_runner.module.half()
+    )
+    if algorithm.eval_env_runner_group:
+        algorithm.eval_env_runner_group.foreach_env_runner(
+            lambda env_runner: env_runner.module.half()
+        )
+class Float16Connector(ConnectorV2):
+    """ConnectorV2 piece preprocessing observations and rewards to be float16.
+    Note that users can also write a gymnasium.Wrapper for observations and rewards
+    to achieve the same thing.
+    """
+    def recompute_output_observation_space(
+        self,
+        input_observation_space,
+        input_action_space,
+    ):
+        return gym.spaces.Box(
+            input_observation_space.low.astype(np.float16),
+            input_observation_space.high.astype(np.float16),
+            input_observation_space.shape,
+            np.float16,
+        )
+    def __call__(self, *, rl_module, batch, episodes, **kwargs):
+        for sa_episode in self.single_agent_episode_iterator(episodes):
+            obs = sa_episode.get_observations(-1)
+            float16_obs = obs.astype(np.float16)
+            self.add_batch_item(
+                batch,
+                column="obs",
+                item_to_add=float16_obs,
+                single_agent_episode=sa_episode,
+            )
+        return batch
+class PPOTorchMixedPrecisionLearner(PPOTorchLearner):
+    def _update(self, *args, **kwargs):
+        with torch.cuda.amp.autocast():
+            results = super()._update(*args, **kwargs)
+        return results
+if __name__ == "__main__":
+    args = parser.parse_args()
+    assert (
+        args.enable_new_api_stack
+    ), "Must set --enable-new-api-stack when running this script!"
+    assert args.algo == "PPO", "Must set --algo=PPO when running this script!"
+    base_config = (
+        (PPOConfig().environment("CartPole-v1"))
+        .env_runners(env_to_module_connector=lambda env: Float16Connector())
+        # Plug in our custom callback (on_algorithm_init) to make EnvRunner RLModules
+        # float16 models.
+        .callbacks(on_algorithm_init=on_algorithm_init)
+        # Plug in the torch built-int loss scaler class to stabilize gradient
+        # computations (by scaling the loss, then unscaling the gradients before
+        # applying them). This is using the built-in, experimental feature of
+        # TorchLearner.
+        .experimental(_torch_grad_scaler_class=torch.cuda.amp.GradScaler)
+        .training(
+            # Plug in the custom Learner class to activate mixed-precision training for
+            # our torch RLModule (uses `torch.amp.autocast()`).
+            learner_class=PPOTorchMixedPrecisionLearner,
+            # Switch off grad clipping entirely b/c we use our custom grad scaler with
+            # built-in inf/nan detection (see `step` method of `Float16GradScaler`).
+            grad_clip=None,
+            # Typical CartPole-v1 hyperparams known to work well:
+            gamma=0.99,
+            lr=0.0003,
+            num_epochs=6,
+            vf_loss_coeff=0.01,
+            use_kl_loss=True,
+        )
+    )
+    run_rllib_example_script_experiment(base_config, args)

.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/__init__.py ADDED Viewed

File without changes

.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/__pycache__/ppo_load_rl_modules.cpython-311.pyc ADDED Viewed

Binary file (3.21 kB). View file

.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/classes/__init__.py ADDED Viewed

File without changes

.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/classes/__pycache__/custom_ppo_loss_fn_learner.cpython-311.pyc ADDED Viewed

Binary file (3.08 kB). View file

.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/classes/__pycache__/separate_vf_lr_and_optimizer_learner.cpython-311.pyc ADDED Viewed

Binary file (4.06 kB). View file

.venv/lib/python3.11/site-packages/ray/rllib/examples/learners/classes/__pycache__/vpg_torch_learner.cpython-311.pyc ADDED Viewed

Binary file (4.63 kB). View file