Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/__pycache__/utils.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/__pycache__/vtrace_torch.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/torch/__init__.py +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/torch/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/torch/__pycache__/impala_torch_learner.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/torch/__pycache__/vtrace_torch_v2.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/torch/impala_torch_learner.py +164 -0
- .venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/torch/vtrace_torch_v2.py +168 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/__init__.py +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/centralized_critic.py +319 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/compute_adapted_gae_on_postprocess_trajectory.py +157 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/envs/__init__.py +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/envs/agents_act_in_sequence.py +87 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/envs/agents_act_simultaneously.py +108 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/envs/async_gym_env_vectorization.py +142 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__init__.py +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/action_mask_env.py +42 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/cartpole_crashing.py +182 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/cartpole_sparse_rewards.py +51 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/cartpole_with_dict_observation_space.py +74 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/cartpole_with_large_observation_space.py +69 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/cartpole_with_protobuf_observation_space.py +79 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/cliff_walking_wall_env.py +71 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/correlated_actions_env.py +79 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/d4rl_env.py +46 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/debug_counter_env.py +92 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/deterministic_envs.py +13 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/dm_control_suite.py +131 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/env_using_remote_actor.py +63 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/env_with_subprocess.py +42 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/fast_image_env.py +20 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/gpu_requiring_env.py +37 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/look_and_push.py +65 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/memory_leaking_env.py +35 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/mock_env.py +220 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/multi_agent/bandit_envs_discrete.py +206 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/multi_agent/guess_the_number_game.py +89 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/multi_agent/pettingzoo_chess.py +227 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/multi_agent/pettingzoo_connect4.py +213 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/multi_agent/rock_paper_scissors.py +125 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/multi_agent/tic_tac_toe.py +144 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/multi_agent/two_step_game.py +123 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/nested_space_repeat_after_me_env.py +50 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/parametric_actions_cartpole.py +145 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/random_env.py +125 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/recommender_system_envs_with_recsim.py +108 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/repeat_after_me_env.py +47 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/repeat_initial_obs_env.py +32 -0
- .venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/simple_corridor.py +42 -0
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (709 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/__pycache__/utils.cpython-311.pyc
ADDED
|
Binary file (5.03 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/__pycache__/vtrace_torch.cpython-311.pyc
ADDED
|
Binary file (15 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/torch/__init__.py
ADDED
|
File without changes
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/torch/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (206 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/torch/__pycache__/impala_torch_learner.cpython-311.pyc
ADDED
|
Binary file (5.79 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/torch/__pycache__/vtrace_torch_v2.cpython-311.pyc
ADDED
|
Binary file (8.13 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/torch/impala_torch_learner.py
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Dict
|
| 2 |
+
|
| 3 |
+
from ray.rllib.algorithms.impala.impala import IMPALAConfig
|
| 4 |
+
from ray.rllib.algorithms.impala.impala_learner import IMPALALearner
|
| 5 |
+
from ray.rllib.algorithms.impala.torch.vtrace_torch_v2 import (
|
| 6 |
+
vtrace_torch,
|
| 7 |
+
make_time_major,
|
| 8 |
+
)
|
| 9 |
+
from ray.rllib.core.columns import Columns
|
| 10 |
+
from ray.rllib.core.learner.learner import ENTROPY_KEY
|
| 11 |
+
from ray.rllib.core.learner.torch.torch_learner import TorchLearner
|
| 12 |
+
from ray.rllib.utils.annotations import override
|
| 13 |
+
from ray.rllib.utils.framework import try_import_torch
|
| 14 |
+
from ray.rllib.utils.typing import ModuleID, TensorType
|
| 15 |
+
|
| 16 |
+
torch, nn = try_import_torch()
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class IMPALATorchLearner(IMPALALearner, TorchLearner):
|
| 20 |
+
"""Implements the IMPALA loss function in torch."""
|
| 21 |
+
|
| 22 |
+
@override(TorchLearner)
|
| 23 |
+
def compute_loss_for_module(
|
| 24 |
+
self,
|
| 25 |
+
*,
|
| 26 |
+
module_id: ModuleID,
|
| 27 |
+
config: IMPALAConfig,
|
| 28 |
+
batch: Dict,
|
| 29 |
+
fwd_out: Dict[str, TensorType],
|
| 30 |
+
) -> TensorType:
|
| 31 |
+
module = self.module[module_id].unwrapped()
|
| 32 |
+
|
| 33 |
+
# TODO (sven): Now that we do the +1ts trick to be less vulnerable about
|
| 34 |
+
# bootstrap values at the end of rollouts in the new stack, we might make
|
| 35 |
+
# this a more flexible, configurable parameter for users, e.g.
|
| 36 |
+
# `v_trace_seq_len` (independent of `rollout_fragment_length`). Separation
|
| 37 |
+
# of concerns (sampling vs learning).
|
| 38 |
+
rollout_frag_or_episode_len = config.get_rollout_fragment_length()
|
| 39 |
+
recurrent_seq_len = batch.get("seq_lens")
|
| 40 |
+
|
| 41 |
+
loss_mask = batch[Columns.LOSS_MASK].float()
|
| 42 |
+
loss_mask_time_major = make_time_major(
|
| 43 |
+
loss_mask,
|
| 44 |
+
trajectory_len=rollout_frag_or_episode_len,
|
| 45 |
+
recurrent_seq_len=recurrent_seq_len,
|
| 46 |
+
)
|
| 47 |
+
size_loss_mask = torch.sum(loss_mask)
|
| 48 |
+
|
| 49 |
+
# Behavior actions logp and target actions logp.
|
| 50 |
+
behaviour_actions_logp = batch[Columns.ACTION_LOGP]
|
| 51 |
+
target_policy_dist = module.get_train_action_dist_cls().from_logits(
|
| 52 |
+
fwd_out[Columns.ACTION_DIST_INPUTS]
|
| 53 |
+
)
|
| 54 |
+
target_actions_logp = target_policy_dist.logp(batch[Columns.ACTIONS])
|
| 55 |
+
|
| 56 |
+
# Values and bootstrap values.
|
| 57 |
+
values = module.compute_values(
|
| 58 |
+
batch, embeddings=fwd_out.get(Columns.EMBEDDINGS)
|
| 59 |
+
)
|
| 60 |
+
values_time_major = make_time_major(
|
| 61 |
+
values,
|
| 62 |
+
trajectory_len=rollout_frag_or_episode_len,
|
| 63 |
+
recurrent_seq_len=recurrent_seq_len,
|
| 64 |
+
)
|
| 65 |
+
assert Columns.VALUES_BOOTSTRAPPED not in batch
|
| 66 |
+
# Use as bootstrap values the vf-preds in the next "batch row", except
|
| 67 |
+
# for the very last row (which doesn't have a next row), for which the
|
| 68 |
+
# bootstrap value does not matter b/c it has a +1ts value at its end
|
| 69 |
+
# anyways. So we chose an arbitrary item (for simplicity of not having to
|
| 70 |
+
# move new data to the device).
|
| 71 |
+
bootstrap_values = torch.cat(
|
| 72 |
+
[
|
| 73 |
+
values_time_major[0][1:], # 0th ts values from "next row"
|
| 74 |
+
values_time_major[0][0:1], # <- can use any arbitrary value here
|
| 75 |
+
],
|
| 76 |
+
dim=0,
|
| 77 |
+
)
|
| 78 |
+
|
| 79 |
+
# TODO(Artur): In the old impala code, actions were unsqueezed if they were
|
| 80 |
+
# multi_discrete. Find out why and if we need to do the same here.
|
| 81 |
+
# actions = actions if is_multidiscrete else torch.unsqueeze(actions, dim=1)
|
| 82 |
+
target_actions_logp_time_major = make_time_major(
|
| 83 |
+
target_actions_logp,
|
| 84 |
+
trajectory_len=rollout_frag_or_episode_len,
|
| 85 |
+
recurrent_seq_len=recurrent_seq_len,
|
| 86 |
+
)
|
| 87 |
+
behaviour_actions_logp_time_major = make_time_major(
|
| 88 |
+
behaviour_actions_logp,
|
| 89 |
+
trajectory_len=rollout_frag_or_episode_len,
|
| 90 |
+
recurrent_seq_len=recurrent_seq_len,
|
| 91 |
+
)
|
| 92 |
+
rewards_time_major = make_time_major(
|
| 93 |
+
batch[Columns.REWARDS],
|
| 94 |
+
trajectory_len=rollout_frag_or_episode_len,
|
| 95 |
+
recurrent_seq_len=recurrent_seq_len,
|
| 96 |
+
)
|
| 97 |
+
|
| 98 |
+
# the discount factor that is used should be gamma except for timesteps where
|
| 99 |
+
# the episode is terminated. In that case, the discount factor should be 0.
|
| 100 |
+
discounts_time_major = (
|
| 101 |
+
1.0
|
| 102 |
+
- make_time_major(
|
| 103 |
+
batch[Columns.TERMINATEDS],
|
| 104 |
+
trajectory_len=rollout_frag_or_episode_len,
|
| 105 |
+
recurrent_seq_len=recurrent_seq_len,
|
| 106 |
+
).type(dtype=torch.float32)
|
| 107 |
+
) * config.gamma
|
| 108 |
+
|
| 109 |
+
# Note that vtrace will compute the main loop on the CPU for better performance.
|
| 110 |
+
vtrace_adjusted_target_values, pg_advantages = vtrace_torch(
|
| 111 |
+
target_action_log_probs=target_actions_logp_time_major,
|
| 112 |
+
behaviour_action_log_probs=behaviour_actions_logp_time_major,
|
| 113 |
+
discounts=discounts_time_major,
|
| 114 |
+
rewards=rewards_time_major,
|
| 115 |
+
values=values_time_major,
|
| 116 |
+
bootstrap_values=bootstrap_values,
|
| 117 |
+
clip_rho_threshold=config.vtrace_clip_rho_threshold,
|
| 118 |
+
clip_pg_rho_threshold=config.vtrace_clip_pg_rho_threshold,
|
| 119 |
+
)
|
| 120 |
+
|
| 121 |
+
# The policy gradients loss.
|
| 122 |
+
pi_loss = -torch.sum(
|
| 123 |
+
target_actions_logp_time_major * pg_advantages * loss_mask_time_major
|
| 124 |
+
)
|
| 125 |
+
mean_pi_loss = pi_loss / size_loss_mask
|
| 126 |
+
|
| 127 |
+
# The baseline loss.
|
| 128 |
+
delta = values_time_major - vtrace_adjusted_target_values
|
| 129 |
+
vf_loss = 0.5 * torch.sum(torch.pow(delta, 2.0) * loss_mask_time_major)
|
| 130 |
+
mean_vf_loss = vf_loss / size_loss_mask
|
| 131 |
+
|
| 132 |
+
# The entropy loss.
|
| 133 |
+
entropy_loss = -torch.sum(target_policy_dist.entropy() * loss_mask)
|
| 134 |
+
mean_entropy_loss = entropy_loss / size_loss_mask
|
| 135 |
+
|
| 136 |
+
# The summed weighted loss.
|
| 137 |
+
total_loss = (
|
| 138 |
+
mean_pi_loss
|
| 139 |
+
+ mean_vf_loss * config.vf_loss_coeff
|
| 140 |
+
+ (
|
| 141 |
+
mean_entropy_loss
|
| 142 |
+
* self.entropy_coeff_schedulers_per_module[
|
| 143 |
+
module_id
|
| 144 |
+
].get_current_value()
|
| 145 |
+
)
|
| 146 |
+
)
|
| 147 |
+
|
| 148 |
+
# Log important loss stats.
|
| 149 |
+
self.metrics.log_dict(
|
| 150 |
+
{
|
| 151 |
+
"pi_loss": pi_loss,
|
| 152 |
+
"mean_pi_loss": mean_pi_loss,
|
| 153 |
+
"vf_loss": vf_loss,
|
| 154 |
+
"mean_vf_loss": mean_vf_loss,
|
| 155 |
+
ENTROPY_KEY: -mean_entropy_loss,
|
| 156 |
+
},
|
| 157 |
+
key=module_id,
|
| 158 |
+
window=1, # <- single items (should not be mean/ema-reduced over time).
|
| 159 |
+
)
|
| 160 |
+
# Return the total loss.
|
| 161 |
+
return total_loss
|
| 162 |
+
|
| 163 |
+
|
| 164 |
+
ImpalaTorchLearner = IMPALATorchLearner
|
.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/torch/vtrace_torch_v2.py
ADDED
|
@@ -0,0 +1,168 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import List, Union
|
| 2 |
+
from ray.rllib.utils.framework import try_import_torch
|
| 3 |
+
|
| 4 |
+
torch, nn = try_import_torch()
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def make_time_major(
|
| 8 |
+
tensor: Union["torch.Tensor", List["torch.Tensor"]],
|
| 9 |
+
*,
|
| 10 |
+
trajectory_len: int = None,
|
| 11 |
+
recurrent_seq_len: int = None,
|
| 12 |
+
):
|
| 13 |
+
"""Swaps batch and trajectory axis.
|
| 14 |
+
|
| 15 |
+
Args:
|
| 16 |
+
tensor: A tensor or list of tensors to swap the axis of.
|
| 17 |
+
NOTE: Each tensor must have the shape [B * T] where B is the batch size and
|
| 18 |
+
T is the trajectory length.
|
| 19 |
+
trajectory_len: The length of each trajectory being transformed.
|
| 20 |
+
If None then `recurrent_seq_len` must be set.
|
| 21 |
+
recurrent_seq_len: Sequence lengths if recurrent.
|
| 22 |
+
If None then `trajectory_len` must be set.
|
| 23 |
+
|
| 24 |
+
Returns:
|
| 25 |
+
res: A tensor with swapped axes or a list of tensors with
|
| 26 |
+
swapped axes.
|
| 27 |
+
"""
|
| 28 |
+
if isinstance(tensor, (list, tuple)):
|
| 29 |
+
return [
|
| 30 |
+
make_time_major(_tensor, trajectory_len, recurrent_seq_len)
|
| 31 |
+
for _tensor in tensor
|
| 32 |
+
]
|
| 33 |
+
|
| 34 |
+
assert (
|
| 35 |
+
trajectory_len is not None or recurrent_seq_len is not None
|
| 36 |
+
), "Either trajectory_len or recurrent_seq_len must be set."
|
| 37 |
+
|
| 38 |
+
# Figure out the sizes of the final B and T axes.
|
| 39 |
+
if recurrent_seq_len is not None:
|
| 40 |
+
assert len(tensor.shape) == 2
|
| 41 |
+
# Swap B and T axes.
|
| 42 |
+
tensor = torch.transpose(tensor, 1, 0)
|
| 43 |
+
return tensor
|
| 44 |
+
else:
|
| 45 |
+
T = trajectory_len
|
| 46 |
+
# Zero-pad, if necessary.
|
| 47 |
+
tensor_0 = tensor.shape[0]
|
| 48 |
+
B = tensor_0 // T
|
| 49 |
+
if B != (tensor_0 / T):
|
| 50 |
+
assert len(tensor.shape) == 1
|
| 51 |
+
tensor = torch.cat(
|
| 52 |
+
[
|
| 53 |
+
tensor,
|
| 54 |
+
torch.zeros(
|
| 55 |
+
trajectory_len - tensor_0 % T,
|
| 56 |
+
dtype=tensor.dtype,
|
| 57 |
+
device=tensor.device,
|
| 58 |
+
),
|
| 59 |
+
]
|
| 60 |
+
)
|
| 61 |
+
B += 1
|
| 62 |
+
|
| 63 |
+
# Reshape tensor (break up B axis into 2 axes: B and T).
|
| 64 |
+
tensor = torch.reshape(tensor, [B, T] + list(tensor.shape[1:]))
|
| 65 |
+
|
| 66 |
+
# Swap B and T axes.
|
| 67 |
+
tensor = torch.transpose(tensor, 1, 0)
|
| 68 |
+
|
| 69 |
+
return tensor
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
def vtrace_torch(
|
| 73 |
+
*,
|
| 74 |
+
target_action_log_probs: "torch.Tensor",
|
| 75 |
+
behaviour_action_log_probs: "torch.Tensor",
|
| 76 |
+
discounts: "torch.Tensor",
|
| 77 |
+
rewards: "torch.Tensor",
|
| 78 |
+
values: "torch.Tensor",
|
| 79 |
+
bootstrap_values: "torch.Tensor",
|
| 80 |
+
clip_rho_threshold: Union[float, "torch.Tensor"] = 1.0,
|
| 81 |
+
clip_pg_rho_threshold: Union[float, "torch.Tensor"] = 1.0,
|
| 82 |
+
):
|
| 83 |
+
"""V-trace for softmax policies implemented with torch.
|
| 84 |
+
|
| 85 |
+
Calculates V-trace actor critic targets for softmax polices as described in
|
| 86 |
+
"IMPALA: Scalable Distributed Deep-RL with Importance Weighted Actor-Learner
|
| 87 |
+
Architectures" by Espeholt, Soyer, Munos et al. (https://arxiv.org/abs/1802.01561)
|
| 88 |
+
|
| 89 |
+
The V-trace implementation used here closely resembles the one found in the
|
| 90 |
+
scalable-agent repository by Google DeepMind, available at
|
| 91 |
+
https://github.com/deepmind/scalable_agent. This version has been optimized to
|
| 92 |
+
minimize the number of floating-point operations required per V-Trace
|
| 93 |
+
calculation, achieved through the use of dynamic programming techniques. It's
|
| 94 |
+
important to note that the mathematical expressions used in this implementation
|
| 95 |
+
may appear quite different from those presented in the IMPALA paper.
|
| 96 |
+
|
| 97 |
+
The following terminology applies:
|
| 98 |
+
- `target policy` refers to the policy we are interested in improving.
|
| 99 |
+
- `behaviour policy` refers to the policy that generated the given
|
| 100 |
+
rewards and actions.
|
| 101 |
+
- `T` refers to the time dimension. This is usually either the length of the
|
| 102 |
+
trajectory or the length of the sequence if recurrent.
|
| 103 |
+
- `B` refers to the batch size.
|
| 104 |
+
|
| 105 |
+
Args:
|
| 106 |
+
target_action_log_probs: Action log probs from the target policy. A float32
|
| 107 |
+
tensor of shape [T, B].
|
| 108 |
+
behaviour_action_log_probs: Action log probs from the behaviour policy. A
|
| 109 |
+
float32 tensor of shape [T, B].
|
| 110 |
+
discounts: A float32 tensor of shape [T, B] with the discount encountered when
|
| 111 |
+
following the behaviour policy. This will be 0 for terminal timesteps
|
| 112 |
+
(done=True) and gamma (the discount factor) otherwise.
|
| 113 |
+
rewards: A float32 tensor of shape [T, B] with the rewards generated by
|
| 114 |
+
following the behaviour policy.
|
| 115 |
+
values: A float32 tensor of shape [T, B] with the value function estimates
|
| 116 |
+
wrt. the target policy.
|
| 117 |
+
bootstrap_values: A float32 of shape [B] with the value function estimate at
|
| 118 |
+
time T.
|
| 119 |
+
clip_rho_threshold: A scalar float32 tensor with the clipping threshold for
|
| 120 |
+
importance weights (rho) when calculating the baseline targets (vs).
|
| 121 |
+
rho^bar in the paper.
|
| 122 |
+
clip_pg_rho_threshold: A scalar float32 tensor with the clipping threshold
|
| 123 |
+
on rho_s in \rho_s \delta log \pi(a|x) (r + \gamma v_{s+1} - V(x_s)).
|
| 124 |
+
"""
|
| 125 |
+
log_rhos = target_action_log_probs - behaviour_action_log_probs
|
| 126 |
+
|
| 127 |
+
rhos = torch.exp(log_rhos)
|
| 128 |
+
if clip_rho_threshold is not None:
|
| 129 |
+
clipped_rhos = torch.clamp(rhos, max=clip_rho_threshold)
|
| 130 |
+
else:
|
| 131 |
+
clipped_rhos = rhos
|
| 132 |
+
|
| 133 |
+
cs = torch.clamp(rhos, max=1.0)
|
| 134 |
+
# Append bootstrapped value to get [v1, ..., v_t+1]
|
| 135 |
+
values_t_plus_1 = torch.cat(
|
| 136 |
+
[values[1:], torch.unsqueeze(bootstrap_values, 0)], axis=0
|
| 137 |
+
)
|
| 138 |
+
|
| 139 |
+
deltas = clipped_rhos * (rewards + discounts * values_t_plus_1 - values)
|
| 140 |
+
|
| 141 |
+
# Only move the for-loop to CPU.
|
| 142 |
+
discounts_cpu = discounts.to("cpu")
|
| 143 |
+
cs_cpu = cs.to("cpu")
|
| 144 |
+
deltas_cpu = deltas.to("cpu")
|
| 145 |
+
vs_minus_v_xs_cpu = [torch.zeros_like(bootstrap_values, device="cpu")]
|
| 146 |
+
for i in reversed(range(len(discounts_cpu))):
|
| 147 |
+
discount_t, c_t, delta_t = discounts_cpu[i], cs_cpu[i], deltas_cpu[i]
|
| 148 |
+
vs_minus_v_xs_cpu.append(delta_t + discount_t * c_t * vs_minus_v_xs_cpu[-1])
|
| 149 |
+
vs_minus_v_xs_cpu = torch.stack(vs_minus_v_xs_cpu[1:])
|
| 150 |
+
# Move results back to GPU - if applicable.
|
| 151 |
+
vs_minus_v_xs = vs_minus_v_xs_cpu.to(deltas.device)
|
| 152 |
+
|
| 153 |
+
# Reverse the results back to original order.
|
| 154 |
+
vs_minus_v_xs = torch.flip(vs_minus_v_xs, dims=[0])
|
| 155 |
+
|
| 156 |
+
# Add V(x_s) to get v_s.
|
| 157 |
+
vs = torch.add(vs_minus_v_xs, values)
|
| 158 |
+
|
| 159 |
+
# Advantage for policy gradient.
|
| 160 |
+
vs_t_plus_1 = torch.cat([vs[1:], torch.unsqueeze(bootstrap_values, 0)], axis=0)
|
| 161 |
+
if clip_pg_rho_threshold is not None:
|
| 162 |
+
clipped_pg_rhos = torch.clamp(rhos, max=clip_pg_rho_threshold)
|
| 163 |
+
else:
|
| 164 |
+
clipped_pg_rhos = rhos
|
| 165 |
+
pg_advantages = clipped_pg_rhos * (rewards + discounts * vs_t_plus_1 - values)
|
| 166 |
+
|
| 167 |
+
# Make sure no gradients backpropagated through the returned values.
|
| 168 |
+
return torch.detach(vs), torch.detach(pg_advantages)
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/__init__.py
ADDED
|
File without changes
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/centralized_critic.py
ADDED
|
@@ -0,0 +1,319 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# @OldAPIStack
|
| 2 |
+
|
| 3 |
+
# ***********************************************************************************
|
| 4 |
+
# IMPORTANT NOTE: This script uses the old API stack and will soon be replaced by
|
| 5 |
+
# `ray.rllib.examples.multi_agent.pettingzoo_shared_value_function.py`!
|
| 6 |
+
# ***********************************************************************************
|
| 7 |
+
|
| 8 |
+
"""An example of customizing PPO to leverage a centralized critic.
|
| 9 |
+
|
| 10 |
+
Here the model and policy are hard-coded to implement a centralized critic
|
| 11 |
+
for TwoStepGame, but you can adapt this for your own use cases.
|
| 12 |
+
|
| 13 |
+
Compared to simply running `rllib/examples/two_step_game.py --run=PPO`,
|
| 14 |
+
this centralized critic version reaches vf_explained_variance=1.0 more stably
|
| 15 |
+
since it takes into account the opponent actions as well as the policy's.
|
| 16 |
+
Note that this is also using two independent policies instead of weight-sharing
|
| 17 |
+
with one.
|
| 18 |
+
|
| 19 |
+
See also: centralized_critic_2.py for a simpler approach that instead
|
| 20 |
+
modifies the environment.
|
| 21 |
+
"""
|
| 22 |
+
|
| 23 |
+
import argparse
|
| 24 |
+
from gymnasium.spaces import Discrete
|
| 25 |
+
import numpy as np
|
| 26 |
+
import os
|
| 27 |
+
|
| 28 |
+
import ray
|
| 29 |
+
from ray import air, tune
|
| 30 |
+
from ray.air.constants import TRAINING_ITERATION
|
| 31 |
+
from ray.rllib.algorithms.ppo.ppo import PPO, PPOConfig
|
| 32 |
+
from ray.rllib.algorithms.ppo.ppo_tf_policy import (
|
| 33 |
+
PPOTF1Policy,
|
| 34 |
+
PPOTF2Policy,
|
| 35 |
+
)
|
| 36 |
+
from ray.rllib.algorithms.ppo.ppo_torch_policy import PPOTorchPolicy
|
| 37 |
+
from ray.rllib.evaluation.postprocessing import compute_advantages, Postprocessing
|
| 38 |
+
from ray.rllib.examples.envs.classes.multi_agent.two_step_game import TwoStepGame
|
| 39 |
+
from ray.rllib.examples._old_api_stack.models.centralized_critic_models import (
|
| 40 |
+
CentralizedCriticModel,
|
| 41 |
+
TorchCentralizedCriticModel,
|
| 42 |
+
)
|
| 43 |
+
from ray.rllib.models import ModelCatalog
|
| 44 |
+
from ray.rllib.policy.sample_batch import SampleBatch
|
| 45 |
+
from ray.rllib.utils.annotations import override
|
| 46 |
+
from ray.rllib.utils.framework import try_import_tf, try_import_torch
|
| 47 |
+
from ray.rllib.utils.metrics import (
|
| 48 |
+
ENV_RUNNER_RESULTS,
|
| 49 |
+
EPISODE_RETURN_MEAN,
|
| 50 |
+
NUM_ENV_STEPS_SAMPLED_LIFETIME,
|
| 51 |
+
)
|
| 52 |
+
from ray.rllib.utils.numpy import convert_to_numpy
|
| 53 |
+
from ray.rllib.utils.test_utils import check_learning_achieved
|
| 54 |
+
from ray.rllib.utils.tf_utils import explained_variance, make_tf_callable
|
| 55 |
+
from ray.rllib.utils.torch_utils import convert_to_torch_tensor
|
| 56 |
+
|
| 57 |
+
tf1, tf, tfv = try_import_tf()
|
| 58 |
+
torch, nn = try_import_torch()
|
| 59 |
+
|
| 60 |
+
OPPONENT_OBS = "opponent_obs"
|
| 61 |
+
OPPONENT_ACTION = "opponent_action"
|
| 62 |
+
|
| 63 |
+
parser = argparse.ArgumentParser()
|
| 64 |
+
parser.add_argument(
|
| 65 |
+
"--framework",
|
| 66 |
+
choices=["tf", "tf2", "torch"],
|
| 67 |
+
default="torch",
|
| 68 |
+
help="The DL framework specifier.",
|
| 69 |
+
)
|
| 70 |
+
parser.add_argument(
|
| 71 |
+
"--as-test",
|
| 72 |
+
action="store_true",
|
| 73 |
+
help="Whether this script should be run as a test: --stop-reward must "
|
| 74 |
+
"be achieved within --stop-timesteps AND --stop-iters.",
|
| 75 |
+
)
|
| 76 |
+
parser.add_argument(
|
| 77 |
+
"--stop-iters", type=int, default=100, help="Number of iterations to train."
|
| 78 |
+
)
|
| 79 |
+
parser.add_argument(
|
| 80 |
+
"--stop-timesteps", type=int, default=100000, help="Number of timesteps to train."
|
| 81 |
+
)
|
| 82 |
+
parser.add_argument(
|
| 83 |
+
"--stop-reward", type=float, default=7.99, help="Reward at which we stop training."
|
| 84 |
+
)
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
class CentralizedValueMixin:
|
| 88 |
+
"""Add method to evaluate the central value function from the model."""
|
| 89 |
+
|
| 90 |
+
def __init__(self):
|
| 91 |
+
if self.config["framework"] != "torch":
|
| 92 |
+
self.compute_central_vf = make_tf_callable(self.get_session())(
|
| 93 |
+
self.model.central_value_function
|
| 94 |
+
)
|
| 95 |
+
else:
|
| 96 |
+
self.compute_central_vf = self.model.central_value_function
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
# Grabs the opponent obs/act and includes it in the experience train_batch,
|
| 100 |
+
# and computes GAE using the central vf predictions.
|
| 101 |
+
def centralized_critic_postprocessing(
|
| 102 |
+
policy, sample_batch, other_agent_batches=None, episode=None
|
| 103 |
+
):
|
| 104 |
+
pytorch = policy.config["framework"] == "torch"
|
| 105 |
+
if (pytorch and hasattr(policy, "compute_central_vf")) or (
|
| 106 |
+
not pytorch and policy.loss_initialized()
|
| 107 |
+
):
|
| 108 |
+
assert other_agent_batches is not None
|
| 109 |
+
[(_, _, opponent_batch)] = list(other_agent_batches.values())
|
| 110 |
+
|
| 111 |
+
# also record the opponent obs and actions in the trajectory
|
| 112 |
+
sample_batch[OPPONENT_OBS] = opponent_batch[SampleBatch.CUR_OBS]
|
| 113 |
+
sample_batch[OPPONENT_ACTION] = opponent_batch[SampleBatch.ACTIONS]
|
| 114 |
+
|
| 115 |
+
# overwrite default VF prediction with the central VF
|
| 116 |
+
if args.framework == "torch":
|
| 117 |
+
sample_batch[SampleBatch.VF_PREDS] = (
|
| 118 |
+
policy.compute_central_vf(
|
| 119 |
+
convert_to_torch_tensor(
|
| 120 |
+
sample_batch[SampleBatch.CUR_OBS], policy.device
|
| 121 |
+
),
|
| 122 |
+
convert_to_torch_tensor(sample_batch[OPPONENT_OBS], policy.device),
|
| 123 |
+
convert_to_torch_tensor(
|
| 124 |
+
sample_batch[OPPONENT_ACTION], policy.device
|
| 125 |
+
),
|
| 126 |
+
)
|
| 127 |
+
.cpu()
|
| 128 |
+
.detach()
|
| 129 |
+
.numpy()
|
| 130 |
+
)
|
| 131 |
+
else:
|
| 132 |
+
sample_batch[SampleBatch.VF_PREDS] = convert_to_numpy(
|
| 133 |
+
policy.compute_central_vf(
|
| 134 |
+
sample_batch[SampleBatch.CUR_OBS],
|
| 135 |
+
sample_batch[OPPONENT_OBS],
|
| 136 |
+
sample_batch[OPPONENT_ACTION],
|
| 137 |
+
)
|
| 138 |
+
)
|
| 139 |
+
else:
|
| 140 |
+
# Policy hasn't been initialized yet, use zeros.
|
| 141 |
+
sample_batch[OPPONENT_OBS] = np.zeros_like(sample_batch[SampleBatch.CUR_OBS])
|
| 142 |
+
sample_batch[OPPONENT_ACTION] = np.zeros_like(sample_batch[SampleBatch.ACTIONS])
|
| 143 |
+
sample_batch[SampleBatch.VF_PREDS] = np.zeros_like(
|
| 144 |
+
sample_batch[SampleBatch.REWARDS], dtype=np.float32
|
| 145 |
+
)
|
| 146 |
+
|
| 147 |
+
completed = sample_batch[SampleBatch.TERMINATEDS][-1]
|
| 148 |
+
if completed:
|
| 149 |
+
last_r = 0.0
|
| 150 |
+
else:
|
| 151 |
+
last_r = sample_batch[SampleBatch.VF_PREDS][-1]
|
| 152 |
+
|
| 153 |
+
train_batch = compute_advantages(
|
| 154 |
+
sample_batch,
|
| 155 |
+
last_r,
|
| 156 |
+
policy.config["gamma"],
|
| 157 |
+
policy.config["lambda"],
|
| 158 |
+
use_gae=policy.config["use_gae"],
|
| 159 |
+
)
|
| 160 |
+
return train_batch
|
| 161 |
+
|
| 162 |
+
|
| 163 |
+
# Copied from PPO but optimizing the central value function.
|
| 164 |
+
def loss_with_central_critic(policy, base_policy, model, dist_class, train_batch):
    """Compute the base PPO loss, but using the centralized value function.

    Temporarily monkey-patches `model.value_function` with a closure that
    queries the central critic on (own obs, opponent obs, opponent action),
    runs the base policy's loss, then restores the original value function.

    Also caches the central VF output on `policy._central_value_out` so that
    `central_vf_stats` can report its explained variance.

    Args:
        policy: The policy owning the model with `central_value_function`.
        base_policy: The (super()) PPO policy whose `loss` is delegated to.
        model: The model whose `value_function` is temporarily replaced.
        dist_class: The action distribution class.
        train_batch: The train batch (must contain opponent obs/actions).

    Returns:
        The loss value produced by `base_policy.loss`.
    """
    # Save original value function.
    vf_saved = model.value_function

    # Patch in the central value function.
    model.value_function = lambda: policy.model.central_value_function(
        train_batch[SampleBatch.CUR_OBS],
        train_batch[OPPONENT_OBS],
        train_batch[OPPONENT_ACTION],
    )
    # try/finally ensures the original value function is restored even if the
    # loss computation raises; otherwise the model would stay patched and
    # corrupt every subsequent loss/inference call.
    try:
        policy._central_value_out = model.value_function()
        loss = base_policy.loss(model, dist_class, train_batch)
    finally:
        # Restore original value function.
        model.value_function = vf_saved

    return loss
|
| 181 |
+
|
| 182 |
+
|
| 183 |
+
def central_vf_stats(policy, train_batch):
    """Return stats reporting the central VF's explained variance.

    Compares the cached central value function output against the
    GAE value targets in `train_batch`.
    """
    value_targets = train_batch[Postprocessing.VALUE_TARGETS]
    central_vf_out = policy._central_value_out
    return {"vf_explained_var": explained_variance(value_targets, central_vf_out)}
|
| 190 |
+
|
| 191 |
+
|
| 192 |
+
def get_ccppo_policy(base):
    """Build a TF centralized-critic PPO policy class on top of `base`.

    Args:
        base: The TF PPO policy class to derive from (used below with
            PPOTF1Policy and PPOTF2Policy).

    Returns:
        A new policy class mixing `CentralizedValueMixin` into `base`.
    """

    class CCPPOTFPolicy(CentralizedValueMixin, base):
        def __init__(self, observation_space, action_space, config):
            base.__init__(self, observation_space, action_space, config)
            CentralizedValueMixin.__init__(self)

        @override(base)
        def loss(self, model, dist_class, train_batch):
            # Use super() to get to the base PPO policy.
            # This special loss function utilizes a shared
            # value function defined on self, and the loss function
            # defined on PPO policies.
            return loss_with_central_critic(
                self, super(), model, dist_class, train_batch
            )

        @override(base)
        def postprocess_trajectory(
            self, sample_batch, other_agent_batches=None, episode=None
        ):
            # Attach opponent obs/actions and recompute advantages with the
            # central VF's predictions.
            return centralized_critic_postprocessing(
                self, sample_batch, other_agent_batches, episode
            )

        @override(base)
        def stats_fn(self, train_batch: SampleBatch):
            # Extend the base PPO stats with the central-VF explained variance.
            stats = super().stats_fn(train_batch)
            stats.update(central_vf_stats(self, train_batch))
            return stats

    return CCPPOTFPolicy
|
| 223 |
+
|
| 224 |
+
|
| 225 |
+
CCPPOStaticGraphTFPolicy = get_ccppo_policy(PPOTF1Policy)
|
| 226 |
+
CCPPOEagerTFPolicy = get_ccppo_policy(PPOTF2Policy)
|
| 227 |
+
|
| 228 |
+
|
| 229 |
+
class CCPPOTorchPolicy(CentralizedValueMixin, PPOTorchPolicy):
    """Torch PPO policy that trains against the centralized value function."""

    def __init__(self, observation_space, action_space, config):
        PPOTorchPolicy.__init__(self, observation_space, action_space, config)
        CentralizedValueMixin.__init__(self)

    @override(PPOTorchPolicy)
    def loss(self, model, dist_class, train_batch):
        # Delegate to the PPO loss with the central VF temporarily patched in.
        return loss_with_central_critic(self, super(), model, dist_class, train_batch)

    @override(PPOTorchPolicy)
    def postprocess_trajectory(
        self, sample_batch, other_agent_batches=None, episode=None
    ):
        # Attach opponent obs/actions and recompute advantages with the
        # central VF's predictions.
        return centralized_critic_postprocessing(
            self, sample_batch, other_agent_batches, episode
        )
|
| 245 |
+
|
| 246 |
+
|
| 247 |
+
class CentralizedCritic(PPO):
    """PPO Algorithm whose default policies use the centralized critic."""

    @classmethod
    @override(PPO)
    def get_default_policy_class(cls, config):
        # Map the framework string to the matching centralized-critic policy.
        # Any framework other than "torch"/"tf" falls back to the eager
        # (TF2) policy, exactly as the original if/elif chain did.
        policy_by_framework = {
            "torch": CCPPOTorchPolicy,
            "tf": CCPPOStaticGraphTFPolicy,
        }
        return policy_by_framework.get(config["framework"], CCPPOEagerTFPolicy)
|
| 257 |
+
|
| 258 |
+
|
| 259 |
+
if __name__ == "__main__":
    # Local mode: run everything in a single process (useful for debugging).
    ray.init(local_mode=True)
    args = parser.parse_args()

    # Register the framework-matching centralized-critic model under the name
    # referenced below via `.training(model={"custom_model": "cc_model"})`.
    ModelCatalog.register_custom_model(
        "cc_model",
        TorchCentralizedCriticModel
        if args.framework == "torch"
        else CentralizedCriticModel,
    )

    config = (
        PPOConfig()
        # Stay on the old API stack; this example relies on Policy subclasses.
        .api_stack(
            enable_env_runner_and_connector_v2=False,
            enable_rl_module_and_learner=False,
        )
        .environment(TwoStepGame)
        .framework(args.framework)
        .env_runners(batch_mode="complete_episodes", num_env_runners=0)
        .training(model={"custom_model": "cc_model"})
        .multi_agent(
            # One policy per agent; both share the same obs/action spaces.
            policies={
                "pol1": (
                    None,
                    Discrete(6),
                    TwoStepGame.action_space,
                    # `framework` would also be ok here.
                    PPOConfig.overrides(framework_str=args.framework),
                ),
                "pol2": (
                    None,
                    Discrete(6),
                    TwoStepGame.action_space,
                    # `framework` would also be ok here.
                    PPOConfig.overrides(framework_str=args.framework),
                ),
            },
            # Agent 0 -> "pol1", every other agent -> "pol2".
            policy_mapping_fn=lambda agent_id, episode, worker, **kwargs: "pol1"
            if agent_id == 0
            else "pol2",
        )
        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
        .resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0")))
    )

    # Stop on whichever comes first: iteration count, sampled env steps, or
    # mean episode return.
    stop = {
        TRAINING_ITERATION: args.stop_iters,
        NUM_ENV_STEPS_SAMPLED_LIFETIME: args.stop_timesteps,
        f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": args.stop_reward,
    }

    tuner = tune.Tuner(
        CentralizedCritic,
        param_space=config.to_dict(),
        run_config=air.RunConfig(stop=stop, verbose=1),
    )
    results = tuner.fit()

    # Optionally fail loudly if the target reward was not reached.
    if args.as_test:
        check_learning_achieved(results, args.stop_reward)
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/compute_adapted_gae_on_postprocess_trajectory.py
ADDED
|
@@ -0,0 +1,157 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# @OldAPIStack
|
| 2 |
+
|
| 3 |
+
"""
|
| 4 |
+
Adapted (time-dependent) GAE for PPO algorithm that you can activate by setting
|
| 5 |
+
use_adapted_gae=True in the policy config. Additionally, it's required that
|
| 6 |
+
"callbacks" include the custom callback class in the Algorithm's config.
|
| 7 |
+
Furthermore, the env must return in its info dictionary a key-value pair of
|
| 8 |
+
the form "d_ts": ... where the value is the length (time) of recent agent step.
|
| 9 |
+
|
| 10 |
+
This adapted, time-dependent computation of advantages may be useful in cases
|
| 11 |
+
where agent's actions take various times and thus time steps are not
|
| 12 |
+
equidistant (https://docdro.id/400TvlR)
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
from ray.rllib.callbacks.callbacks import RLlibCallback
|
| 16 |
+
from ray.rllib.policy.sample_batch import SampleBatch
|
| 17 |
+
from ray.rllib.evaluation.postprocessing import Postprocessing
|
| 18 |
+
from ray.rllib.utils.annotations import override
|
| 19 |
+
import numpy as np
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class MyCallbacks(RLlibCallback):
    """Callback that recomputes GAE with time-step-length-aware discounting.

    Activated per policy via `use_adapted_gae=True` in the policy config
    (which also requires `use_gae=True`). The env must report the length of
    each agent step in its info dict under the key "d_ts".
    """

    @override(RLlibCallback)
    def on_postprocess_trajectory(
        self,
        *,
        worker,
        episode,
        agent_id,
        policy_id,
        policies,
        postprocessed_batch,
        original_batches,
        **kwargs
    ):
        super().on_postprocess_trajectory(
            worker=worker,
            episode=episode,
            agent_id=agent_id,
            policy_id=policy_id,
            policies=policies,
            postprocessed_batch=postprocessed_batch,
            original_batches=original_batches,
            **kwargs
        )

        if policies[policy_id].config.get("use_adapted_gae", False):
            policy = policies[policy_id]
            assert policy.config[
                "use_gae"
            ], "Can't use adapted gae without use_gae=True!"

            info_dicts = postprocessed_batch[SampleBatch.INFOS]
            assert np.all(
                ["d_ts" in info_dict for info_dict in info_dicts]
            ), "Info dicts in sample batch must contain data 'd_ts' \
                (=ts[i+1]-ts[i] length of time steps)!"

            # BUGFIX: `np.float` was deprecated in NumPy 1.20 and removed in
            # NumPy 1.24 -- it raises AttributeError there. The builtin
            # `float` is the documented replacement and behaves identically.
            d_ts = np.array(
                [float(info_dict.get("d_ts")) for info_dict in info_dicts]
            )
            assert np.all(
                [e.is_integer() for e in d_ts]
            ), "Elements of 'd_ts' (length of time steps) must be integer!"

            # Trajectory is actually complete -> last r=0.0.
            if postprocessed_batch[SampleBatch.TERMINATEDS][-1]:
                last_r = 0.0
            # Trajectory has been truncated -> last r=VF estimate of last obs.
            else:
                # Create a single-timestep (last one in trajectory) input dict
                # according to the Model's view requirements.
                input_dict = postprocessed_batch.get_single_step_input_dict(
                    policy.model.view_requirements, index="last"
                )
                last_r = policy._value(**input_dict)

            gamma = policy.config["gamma"]
            lambda_ = policy.config["lambda"]

            # VF predictions with the bootstrap value appended.
            vpred_t = np.concatenate(
                [postprocessed_batch[SampleBatch.VF_PREDS], np.array([last_r])]
            )
            # One-step TD residuals; the next-state value is discounted by
            # gamma raised to the (variable) step length.
            delta_t = (
                postprocessed_batch[SampleBatch.REWARDS]
                + gamma**d_ts * vpred_t[1:]
                - vpred_t[:-1]
            )
            # This formula for the advantage is an adaption of
            # "Generalized Advantage Estimation"
            # (https://arxiv.org/abs/1506.02438), which accounts for time
            # steps of irregular length.
            # NOTE: last time step delta is not required.
            postprocessed_batch[
                Postprocessing.ADVANTAGES
            ] = generalized_discount_cumsum(delta_t, d_ts[:-1], gamma * lambda_)
            postprocessed_batch[Postprocessing.VALUE_TARGETS] = (
                postprocessed_batch[Postprocessing.ADVANTAGES]
                + postprocessed_batch[SampleBatch.VF_PREDS]
            ).astype(np.float32)

            postprocessed_batch[Postprocessing.ADVANTAGES] = postprocessed_batch[
                Postprocessing.ADVANTAGES
            ].astype(np.float32)
| 107 |
+
|
| 108 |
+
|
| 109 |
+
def generalized_discount_cumsum(
    x: np.ndarray, deltas: np.ndarray, gamma: float
) -> np.ndarray:
    """Discounted cumulative sum over `x` with variable-length time steps.

    Solves the backward recursion:

        y[T-1] = x[T-1]
        y[t]   = x[t] + gamma**deltas[t] * y[t+1]

    where `deltas[t]` is the (possibly non-unit) length of time step t, so
    discounts accumulate multiplicatively over the elapsed time.

    Args:
        x: A sequence of rewards or one-step TD residuals (length T).
        deltas: Time-step lengths; element t discounts from step t to step
            t+1 (length T-1).
        gamma: The discount factor.

    Returns:
        np.ndarray: Array of the same length as `x` holding the
        time-dependent discounted cumulative sum for each position until the
        end of the trajectory.
    """
    rev_x = x[::-1]
    rev_deltas = deltas[::-1]
    out = np.empty_like(x)
    # Seed with the final element, then accumulate from the back of the
    # trajectory toward the front.
    out[0] = rev_x[0]
    for i, (x_i, d_prev) in enumerate(zip(rev_x[1:], rev_deltas), start=1):
        out[i] = x_i + gamma**d_prev * out[i - 1]
    return out[::-1]
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/__init__.py
ADDED
|
File without changes
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/agents_act_in_sequence.py
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Example of running a multi-agent experiment w/ agents taking turns (sequence).
|
| 2 |
+
|
| 3 |
+
This example:
|
| 4 |
+
- demonstrates how to write your own (multi-agent) environment using RLlib's
|
| 5 |
+
MultiAgentEnv API.
|
| 6 |
+
- shows how to implement the `reset()` and `step()` methods of the env such that
|
| 7 |
+
the agents act in a fixed sequence (taking turns).
|
| 8 |
+
- shows how to configure and setup this environment class within an RLlib
|
| 9 |
+
Algorithm config.
|
| 10 |
+
- runs the experiment with the configured algo, trying to solve the environment.
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
How to run this script
|
| 14 |
+
----------------------
|
| 15 |
+
`python [script file name].py --enable-new-api-stack`
|
| 16 |
+
|
| 17 |
+
For debugging, use the following additional command line options
|
| 18 |
+
`--no-tune --num-env-runners=0`
|
| 19 |
+
which should allow you to set breakpoints anywhere in the RLlib code and
|
| 20 |
+
have the execution stop there for inspection and debugging.
|
| 21 |
+
|
| 22 |
+
For logging to your WandB account, use:
|
| 23 |
+
`--wandb-key=[your WandB API key] --wandb-project=[some project name]
|
| 24 |
+
--wandb-run-name=[optional: WandB run name (within the defined project)]`
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
Results to expect
|
| 28 |
+
-----------------
|
| 29 |
+
You should see results similar to the following in your console output:
|
| 30 |
+
+---------------------------+----------+--------+------------------+--------+
|
| 31 |
+
| Trial name | status | iter | total time (s) | ts |
|
| 32 |
+
|---------------------------+----------+--------+------------------+--------+
|
| 33 |
+
| PPO_TicTacToe_957aa_00000 | RUNNING | 25 | 96.7452 | 100000 |
|
| 34 |
+
+---------------------------+----------+--------+------------------+--------+
|
| 35 |
+
+-------------------+------------------+------------------+
|
| 36 |
+
| combined return | return player2 | return player1 |
|
| 37 |
+
|-------------------+------------------+------------------|
|
| 38 |
+
| -2 | 1.15 | -0.85 |
|
| 39 |
+
+-------------------+------------------+------------------+
|
| 40 |
+
|
| 41 |
+
Note that even though we are playing a zero-sum game, the overall return should start
|
| 42 |
+
at some negative values due to the misplacement penalty of our (simplified) TicTacToe
|
| 43 |
+
game.
|
| 44 |
+
"""
|
| 45 |
+
from ray.rllib.examples.envs.classes.multi_agent.tic_tac_toe import TicTacToe
|
| 46 |
+
from ray.rllib.utils.test_utils import (
|
| 47 |
+
add_rllib_example_script_args,
|
| 48 |
+
run_rllib_example_script_experiment,
|
| 49 |
+
)
|
| 50 |
+
from ray.tune.registry import get_trainable_cls, register_env # noqa
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
# CLI parser with the standard RLlib example-script arguments; the negative
# default reward reflects the misplacement penalty in this TicTacToe variant.
parser = add_rllib_example_script_args(
    default_reward=-4.0, default_iters=50, default_timesteps=100000
)
parser.set_defaults(
    enable_new_api_stack=True,
    num_agents=2,
)


if __name__ == "__main__":
    args = parser.parse_args()

    # TicTacToe has exactly two players.
    assert args.num_agents == 2, "Must set --num-agents=2 when running this script!"

    # You can also register the env creator function explicitly with:
    # register_env("tic_tac_toe", lambda cfg: TicTacToe())

    # Or allow the RLlib user to set more c'tor options via their algo config:
    # config.environment(env_config={[c'tor arg name]: [value]})
    # register_env("tic_tac_toe", lambda cfg: TicTacToe(cfg))

    base_config = (
        get_trainable_cls(args.algo)
        .get_default_config()
        .environment(TicTacToe)
        .multi_agent(
            # Define two policies.
            policies={"player1", "player2"},
            # Map agent "player1" to policy "player1" and agent "player2" to policy
            # "player2".
            policy_mapping_fn=lambda agent_id, episode, **kw: agent_id,
        )
    )

    # Run via Tune (or directly, depending on the parsed CLI options).
    run_rllib_example_script_experiment(base_config, args)
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/agents_act_simultaneously.py
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Example of running a multi-agent experiment w/ agents always acting simultaneously.
|
| 2 |
+
|
| 3 |
+
This example:
|
| 4 |
+
- demonstrates how to write your own (multi-agent) environment using RLlib's
|
| 5 |
+
MultiAgentEnv API.
|
| 6 |
+
- shows how to implement the `reset()` and `step()` methods of the env such that
|
| 7 |
+
the agents act simultaneously.
|
| 8 |
+
- shows how to configure and setup this environment class within an RLlib
|
| 9 |
+
Algorithm config.
|
| 10 |
+
- runs the experiment with the configured algo, trying to solve the environment.
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
How to run this script
|
| 14 |
+
----------------------
|
| 15 |
+
`python [script file name].py --enable-new-api-stack --sheldon-cooper-mode`
|
| 16 |
+
|
| 17 |
+
For debugging, use the following additional command line options
|
| 18 |
+
`--no-tune --num-env-runners=0`
|
| 19 |
+
which should allow you to set breakpoints anywhere in the RLlib code and
|
| 20 |
+
have the execution stop there for inspection and debugging.
|
| 21 |
+
|
| 22 |
+
For logging to your WandB account, use:
|
| 23 |
+
`--wandb-key=[your WandB API key] --wandb-project=[some project name]
|
| 24 |
+
--wandb-run-name=[optional: WandB run name (within the defined project)]`
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
Results to expect
|
| 28 |
+
-----------------
|
| 29 |
+
You should see results similar to the following in your console output:
|
| 30 |
+
|
| 31 |
+
+-----------------------------------+----------+--------+------------------+-------+
|
| 32 |
+
| Trial name | status | iter | total time (s) | ts |
|
| 33 |
+
|-----------------------------------+----------+--------+------------------+-------+
|
| 34 |
+
| PPO_RockPaperScissors_8cef7_00000 | RUNNING | 3 | 16.5348 | 12000 |
|
| 35 |
+
+-----------------------------------+----------+--------+------------------+-------+
|
| 36 |
+
+-------------------+------------------+------------------+
|
| 37 |
+
| combined return | return player2 | return player1 |
|
| 38 |
+
|-------------------+------------------+------------------|
|
| 39 |
+
| 0 | -0.15 | 0.15 |
|
| 40 |
+
+-------------------+------------------+------------------+
|
| 41 |
+
|
| 42 |
+
Note that b/c we are playing a zero-sum game, the overall return remains 0.0 at
|
| 43 |
+
all times.
|
| 44 |
+
"""
|
| 45 |
+
from ray.rllib.examples.envs.classes.multi_agent.rock_paper_scissors import (
|
| 46 |
+
RockPaperScissors,
|
| 47 |
+
)
|
| 48 |
+
from ray.rllib.connectors.env_to_module.flatten_observations import FlattenObservations
|
| 49 |
+
from ray.rllib.utils.test_utils import (
|
| 50 |
+
add_rllib_example_script_args,
|
| 51 |
+
run_rllib_example_script_experiment,
|
| 52 |
+
)
|
| 53 |
+
from ray.tune.registry import get_trainable_cls, register_env # noqa
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
# CLI parser with the standard RLlib example-script arguments.
parser = add_rllib_example_script_args(
    default_reward=0.9, default_iters=50, default_timesteps=100000
)
parser.set_defaults(
    enable_new_api_stack=True,
    num_agents=2,
)
# Optional extension of the game to five actions (rock/paper/scissors/
# lizard/Spock); forwarded to the env via `env_config` below.
parser.add_argument(
    "--sheldon-cooper-mode",
    action="store_true",
    help="Whether to add two more actions to the game: Lizard and Spock. "
    "Watch here for more details :) https://www.youtube.com/watch?v=x5Q6-wMx-K8",
)


if __name__ == "__main__":
    args = parser.parse_args()

    # Rock-paper-scissors is strictly a two-player game.
    assert args.num_agents == 2, "Must set --num-agents=2 when running this script!"

    # You can also register the env creator function explicitly with:
    # register_env("env", lambda cfg: RockPaperScissors({"sheldon_cooper_mode": False}))

    # Or you can hard code certain settings into the Env's constructor (`config`).
    # register_env(
    #     "rock-paper-scissors-w-sheldon-mode-activated",
    #     lambda config: RockPaperScissors({**config, **{"sheldon_cooper_mode": True}}),
    # )

    # Or allow the RLlib user to set more c'tor options via their algo config:
    # config.environment(env_config={[c'tor arg name]: [value]})
    # register_env("rock-paper-scissors", lambda cfg: RockPaperScissors(cfg))

    base_config = (
        get_trainable_cls(args.algo)
        .get_default_config()
        .environment(
            RockPaperScissors,
            env_config={"sheldon_cooper_mode": args.sheldon_cooper_mode},
        )
        .env_runners(
            # Flatten the (discrete) observations before they reach the module.
            env_to_module_connector=lambda env: FlattenObservations(multi_agent=True),
        )
        .multi_agent(
            # Define two policies.
            policies={"player1", "player2"},
            # Map agent "player1" to policy "player1" and agent "player2" to policy
            # "player2".
            policy_mapping_fn=lambda agent_id, episode, **kw: agent_id,
        )
    )

    # Run via Tune (or directly, depending on the parsed CLI options).
    run_rllib_example_script_experiment(base_config, args)
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/async_gym_env_vectorization.py
ADDED
|
@@ -0,0 +1,142 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Example demo'ing async gym vector envs, in which sub-envs have their own process.
|
| 2 |
+
|
| 3 |
+
Setting up env vectorization works through setting the `config.num_envs_per_env_runner`
|
| 4 |
+
value to > 1. However, by default the n sub-environments are stepped through
|
| 5 |
+
sequentially, rather than in parallel.
|
| 6 |
+
|
| 7 |
+
This script shows the effect of setting the `config.gym_env_vectorize_mode` from its
|
| 8 |
+
default value of "SYNC" (all sub envs are located in the same EnvRunner process)
|
| 9 |
+
to "ASYNC" (all sub envs in each EnvRunner get their own process).
|
| 10 |
+
|
| 11 |
+
This example:
|
| 12 |
+
- shows, which config settings to change in order to switch from sub-envs being
|
| 13 |
+
stepped in sequence to each sub-envs owning its own process (and compute resource)
|
| 14 |
+
and thus the vector being stepped in parallel.
|
| 15 |
+
- shows, how this setup can increase EnvRunner performance significantly, especially
|
| 16 |
+
for heavier, slower environments.
|
| 17 |
+
- uses an artificially slow CartPole-v1 environment for demonstration purposes.
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
How to run this script
|
| 21 |
+
----------------------
|
| 22 |
+
`python [script file name].py --enable-new-api-stack `
|
| 23 |
+
|
| 24 |
+
Use the `--vectorize-mode=BOTH` option to run both modes (SYNC and ASYNC)
|
| 25 |
+
through Tune at the same time and get a better comparison of the throughputs
|
| 26 |
+
achieved.
|
| 27 |
+
|
| 28 |
+
For debugging, use the following additional command line options
|
| 29 |
+
`--no-tune --num-env-runners=0`
|
| 30 |
+
which should allow you to set breakpoints anywhere in the RLlib code and
|
| 31 |
+
have the execution stop there for inspection and debugging.
|
| 32 |
+
|
| 33 |
+
For logging to your WandB account, use:
|
| 34 |
+
`--wandb-key=[your WandB API key] --wandb-project=[some project name]
|
| 35 |
+
--wandb-run-name=[optional: WandB run name (within the defined project)]`
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
Results to expect
|
| 39 |
+
-----------------
|
| 40 |
+
You should see results similar to the following in your console output
|
| 41 |
+
when using the
|
| 42 |
+
|
| 43 |
+
+--------------------------+------------+------------------------+------+
|
| 44 |
+
| Trial name | status | gym_env_vectorize_mode | iter |
|
| 45 |
+
| | | | |
|
| 46 |
+
|--------------------------+------------+------------------------+------+
|
| 47 |
+
| PPO_slow-env_6ddf4_00000 | TERMINATED | SYNC | 4 |
|
| 48 |
+
| PPO_slow-env_6ddf4_00001 | TERMINATED | ASYNC | 4 |
|
| 49 |
+
+--------------------------+------------+------------------------+------+
|
| 50 |
+
+------------------+----------------------+------------------------+
|
| 51 |
+
| total time (s) | episode_return_mean | num_env_steps_sample |
|
| 52 |
+
| | | d_lifetime |
|
| 53 |
+
|------------------+----------------------+------------------------+
|
| 54 |
+
| 60.8794 | 73.53 | 16040 |
|
| 55 |
+
| 19.1203 | 73.86 | 16037 |
|
| 56 |
+
+------------------+----------------------+------------------------+
|
| 57 |
+
|
| 58 |
+
You can see that the ASYNC mode, given that the env is sufficiently slow,
|
| 59 |
+
achieves much better results when using vectorization.
|
| 60 |
+
|
| 61 |
+
You should see no difference, however, when only using
|
| 62 |
+
`--num-envs-per-env-runner=1`.
|
| 63 |
+
"""
|
| 64 |
+
import time
|
| 65 |
+
|
| 66 |
+
import gymnasium as gym
|
| 67 |
+
|
| 68 |
+
from ray.rllib.algorithms.ppo import PPOConfig
|
| 69 |
+
from ray.rllib.utils.test_utils import (
|
| 70 |
+
add_rllib_example_script_args,
|
| 71 |
+
run_rllib_example_script_experiment,
|
| 72 |
+
)
|
| 73 |
+
from ray import tune
|
| 74 |
+
|
| 75 |
+
# CLI parser; defaults to 6 sub-envs per EnvRunner so the vectorization-mode
# difference is actually visible.
parser = add_rllib_example_script_args(default_reward=60.0)
parser.set_defaults(
    enable_new_api_stack=True,
    env="CartPole-v1",
    num_envs_per_env_runner=6,
)
parser.add_argument(
    "--vectorize-mode",
    type=str,
    default="ASYNC",
    help="The value `gym.envs.registration.VectorizeMode` to use for env "
    "vectorization. SYNC steps through all sub-envs in sequence. ASYNC (default) "
    "parallelizes sub-envs through multiprocessing and can speed up EnvRunners "
    "significantly. Use the special value `BOTH` to run both ASYNC and SYNC through a "
    "Tune grid-search.",
)
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
class SlowEnv(gym.ObservationWrapper):
    """Wrapper that makes the underlying env artificially slow.

    Sleeps 5ms on every observation to emulate a heavyweight environment,
    which is what makes the SYNC-vs-ASYNC throughput difference measurable.
    """

    def observation(self, observation):
        # Simulate expensive per-step work, then pass the obs through unchanged.
        time.sleep(0.005)
        return observation
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
if __name__ == "__main__":
    args = parser.parse_args()

    # BOTH requires a Tune grid-search, which --no-tune disables.
    if args.no_tune and args.vectorize_mode == "BOTH":
        raise ValueError(
            "Can't run this script with both --no-tune and --vectorize-mode=BOTH!"
        )

    # Wrap the env with the slowness wrapper.
    def _env_creator(cfg):
        return SlowEnv(gym.make(args.env, **cfg))

    tune.register_env("slow-env", _env_creator)

    # NOTE(review): this repeats the BOTH/--no-tune check from above (only the
    # message differs); one of the two checks looks redundant.
    if args.vectorize_mode == "BOTH" and args.no_tune:
        raise ValueError(
            "`--vectorize-mode=BOTH` and `--no-tune` not allowed in combination!"
        )

    base_config = (
        PPOConfig()
        .environment("slow-env")
        .env_runners(
            # Either a grid-search over both modes or the single chosen one.
            gym_env_vectorize_mode=(
                tune.grid_search(["SYNC", "ASYNC"])
                if args.vectorize_mode == "BOTH"
                else args.vectorize_mode
            ),
        )
    )

    results = run_rllib_example_script_experiment(base_config, args)

    # Compare the throughputs and assert that ASYNC is much faster than SYNC.
    if args.vectorize_mode == "BOTH":
        throughput_sync = (
            results[0].metrics["num_env_steps_sampled_lifetime"]
            / results[0].metrics["time_total_s"]
        )
        throughput_async = (
            results[1].metrics["num_env_steps_sampled_lifetime"]
            / results[1].metrics["time_total_s"]
        )
        assert throughput_async > throughput_sync
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__init__.py
ADDED
|
File without changes
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/action_mask_env.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from gymnasium.spaces import Box, Dict, Discrete
|
| 2 |
+
import numpy as np
|
| 3 |
+
|
| 4 |
+
from ray.rllib.examples.envs.classes.random_env import RandomEnv
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class ActionMaskEnv(RandomEnv):
    """A randomly acting environment that publishes an action-mask each step.

    Observations are dicts with two keys:
    - "action_mask": a 0/1 vector of length `action_space.n` marking which
      actions are currently valid.
    - "observations": the underlying (random) observation.

    `step()` raises a ValueError when called with an action that the most
    recently published mask marked as invalid.
    """

    def __init__(self, config):
        super().__init__(config)
        # Masking only works for Discrete actions.
        assert isinstance(self.action_space, Discrete)
        # Wrap the original observation space into a Dict that also carries
        # the per-step action mask.
        self.observation_space = Dict(
            {
                "action_mask": Box(0.0, 1.0, shape=(self.action_space.n,)),
                "observations": self.observation_space,
            }
        )
        # 0/1 vector of currently valid actions; set on every reset/step.
        self.valid_actions = None

    def reset(self, *, seed=None, options=None):
        # Bug fix: forward `seed` and `options` to the base env (previously
        # they were accepted but silently dropped, so seeded resets were a
        # no-op).
        obs, info = super().reset(seed=seed, options=options)
        self._fix_action_mask(obs)
        return obs, info

    def step(self, action):
        # Check whether the action is valid under the last published mask.
        if not self.valid_actions[action]:
            raise ValueError(
                f"Invalid action ({action}) sent to env! "
                f"valid_actions={self.valid_actions}"
            )
        obs, rew, done, truncated, info = super().step(action)
        self._fix_action_mask(obs)
        return obs, rew, done, truncated, info

    def _fix_action_mask(self, obs):
        # Binarize the randomly sampled mask: everything larger 0.5 becomes
        # 1.0, everything else 0.0, and remember it for the validity check.
        self.valid_actions = np.round(obs["action_mask"])
        obs["action_mask"] = self.valid_actions
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/cartpole_crashing.py
ADDED
|
@@ -0,0 +1,182 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
from gymnasium.envs.classic_control import CartPoleEnv
|
| 3 |
+
import numpy as np
|
| 4 |
+
import time
|
| 5 |
+
|
| 6 |
+
from ray.rllib.examples.envs.classes.multi_agent import make_multi_agent
|
| 7 |
+
from ray.rllib.utils.annotations import override
|
| 8 |
+
from ray.rllib.utils.error import EnvError
|
| 9 |
+
|
| 10 |
+
logger = logging.getLogger(__name__)
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class CartPoleCrashing(CartPoleEnv):
    """A CartPole env that crashes (or stalls) from time to time.

    Useful for testing faulty sub-env (within a vectorized env) handling by
    EnvRunners.

    After crashing, the env expects a `reset()` call next (calling `step()` will
    result in yet another error), which may or may not take a very long time to
    complete. This simulates the env having to reinitialize some sub-processes, e.g.
    an external connection.

    The env can also be configured to stall (and do nothing during a call to `step()`)
    from time to time for a configurable amount of time.
    """

    def __init__(self, config=None):
        super().__init__()

        # Normalize `config` once and use the normalized object everywhere
        # below. Bug fix: previously the raw `config` argument was used after
        # this normalization, so passing `config=None` crashed with an
        # AttributeError on the first `config.get()` call.
        self.config = config = config if config is not None else {}

        # Crash probability (in each `step()`).
        self.p_crash = config.get("p_crash", 0.005)
        # Crash probability when `reset()` is called.
        self.p_crash_reset = config.get("p_crash_reset", 0.0)
        # Crash exactly after every n steps. If a 2-tuple, will uniformly sample
        # crash timesteps from in between the two given values.
        self.crash_after_n_steps = config.get("crash_after_n_steps")
        self._crash_after_n_steps = None
        assert (
            self.crash_after_n_steps is None
            or isinstance(self.crash_after_n_steps, int)
            or (
                isinstance(self.crash_after_n_steps, tuple)
                and len(self.crash_after_n_steps) == 2
            )
        )
        # Only ever crash, if on certain worker indices.
        # NOTE(review): `config.worker_index` assumes `config` is an RLlib
        # EnvContext; with a plain dict this attribute access is only reached
        # when "crash_on_worker_indices" was explicitly provided.
        faulty_indices = config.get("crash_on_worker_indices", None)
        if faulty_indices and config.worker_index not in faulty_indices:
            self.p_crash = 0.0
            self.p_crash_reset = 0.0
            self.crash_after_n_steps = None

        # Stall probability (in each `step()`).
        self.p_stall = config.get("p_stall", 0.0)
        # Stall probability when `reset()` is called.
        self.p_stall_reset = config.get("p_stall_reset", 0.0)
        # Stall exactly after every n steps.
        self.stall_after_n_steps = config.get("stall_after_n_steps")
        self._stall_after_n_steps = None
        # Amount of time to stall. If a 2-tuple, will uniformly sample from in between
        # the two given values.
        self.stall_time_sec = config.get("stall_time_sec")
        assert (
            self.stall_time_sec is None
            or isinstance(self.stall_time_sec, (int, float))
            or (
                isinstance(self.stall_time_sec, tuple) and len(self.stall_time_sec) == 2
            )
        )

        # Only ever stall, if on certain worker indices.
        faulty_indices = config.get("stall_on_worker_indices", None)
        if faulty_indices and config.worker_index not in faulty_indices:
            self.p_stall = 0.0
            self.p_stall_reset = 0.0
            self.stall_after_n_steps = None

        # Timestep counter for the ongoing episode.
        self.timesteps = 0

        # Time in seconds to initialize (in this c'tor).
        sample = 0.0
        if "init_time_s" in config:
            sample = (
                config["init_time_s"]
                if not isinstance(config["init_time_s"], tuple)
                else np.random.uniform(
                    config["init_time_s"][0], config["init_time_s"][1]
                )
            )

        print(f"Initializing crashing env (with init-delay of {sample}sec) ...")
        time.sleep(sample)

        # Unseeded, per-instance RNG, so sub-envs don't crash at the same time.
        self._rng = np.random.RandomState()

    @override(CartPoleEnv)
    def reset(self, *, seed=None, options=None):
        # Reset timestep counter for the new episode.
        self.timesteps = 0
        self._crash_after_n_steps = None

        # Should we crash?
        if self._should_crash(p=self.p_crash_reset):
            raise EnvError(
                f"Simulated env crash on worker={self.config.worker_index} "
                f"env-idx={self.config.vector_index} during `reset()`! "
                "Feel free to use any other exception type here instead."
            )
        # Should we stall for a while?
        self._stall_if_necessary(p=self.p_stall_reset)

        # Bug fix: forward `seed`/`options` to the base env (previously they
        # were accepted but dropped, so seeded resets were a no-op).
        return super().reset(seed=seed, options=options)

    @override(CartPoleEnv)
    def step(self, action):
        # Increase timestep counter for the ongoing episode.
        self.timesteps += 1

        # Should we crash?
        if self._should_crash(p=self.p_crash):
            raise EnvError(
                f"Simulated env crash on worker={self.config.worker_index} "
                f"env-idx={self.config.vector_index} during `step()`! "
                "Feel free to use any other exception type here instead."
            )
        # Should we stall for a while?
        self._stall_if_necessary(p=self.p_stall)

        return super().step(action)

    def _should_crash(self, p):
        """Returns True if the env should crash now (probability or step-based)."""
        rnd = self._rng.rand()
        if rnd < p:
            print("Crashing due to p(crash)!")
            return True
        elif self.crash_after_n_steps is not None:
            # Lazily resolve a concrete crash timestep from the configured
            # int or (low, high) tuple.
            if self._crash_after_n_steps is None:
                self._crash_after_n_steps = (
                    self.crash_after_n_steps
                    if not isinstance(self.crash_after_n_steps, tuple)
                    else np.random.randint(
                        self.crash_after_n_steps[0], self.crash_after_n_steps[1]
                    )
                )
            if self._crash_after_n_steps == self.timesteps:
                print("Crashing due to n timesteps reached!")
                return True

        return False

    def _stall_if_necessary(self, p):
        """Sleeps for `self.stall_time_sec` if a stall is due (probability or
        step-based)."""
        stall = False
        if self._rng.rand() < p:
            stall = True
        elif self.stall_after_n_steps is not None:
            # Lazily resolve a concrete stall timestep from the configured
            # int or (low, high) tuple.
            if self._stall_after_n_steps is None:
                self._stall_after_n_steps = (
                    self.stall_after_n_steps
                    if not isinstance(self.stall_after_n_steps, tuple)
                    else np.random.randint(
                        self.stall_after_n_steps[0], self.stall_after_n_steps[1]
                    )
                )
            if self._stall_after_n_steps == self.timesteps:
                stall = True

        if stall:
            # NOTE(review): if stalling is enabled but `stall_time_sec` was
            # left None, `time.sleep(None)` raises a TypeError — confirm
            # callers always configure `stall_time_sec` alongside stalling.
            sec = (
                self.stall_time_sec
                if not isinstance(self.stall_time_sec, tuple)
                else np.random.uniform(self.stall_time_sec[0], self.stall_time_sec[1])
            )
            print(f" -> will stall for {sec}sec ...")
            time.sleep(sec)


MultiAgentCartPoleCrashing = make_multi_agent(lambda config: CartPoleCrashing(config))
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/cartpole_sparse_rewards.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from copy import deepcopy
|
| 2 |
+
|
| 3 |
+
import gymnasium as gym
|
| 4 |
+
import numpy as np
|
| 5 |
+
from gymnasium.spaces import Box, Dict, Discrete
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class CartPoleSparseRewards(gym.Env):
    """Wrapper for gym CartPole environment where reward is accumulated to the end.

    Per-step rewards are summed internally and only paid out (as one lump sum)
    on the step in which the wrapped episode terminates; all other steps return
    a reward of 0. Observations are dicts with the raw CartPole observation
    under "obs" and an always-all-ones "action_mask".
    """

    def __init__(self, config=None):
        self.env = gym.make("CartPole-v1")
        self.action_space = Discrete(2)
        self.observation_space = Dict(
            {
                "obs": self.env.observation_space,
                "action_mask": Box(
                    low=0, high=1, shape=(self.action_space.n,), dtype=np.int8
                ),
            }
        )
        # Running sum of rewards for the current episode.
        self.running_reward = 0

    def reset(self, *, seed=None, options=None):
        self.running_reward = 0
        # Bug fix: forward `seed`/`options` to the wrapped env (previously
        # they were accepted but dropped, so seeded resets were a no-op).
        obs, infos = self.env.reset(seed=seed, options=options)
        return {
            "obs": obs,
            "action_mask": np.array([1, 1], dtype=np.int8),
        }, infos

    def step(self, action):
        obs, rew, terminated, truncated, info = self.env.step(action)
        self.running_reward += rew
        # Pay out the accumulated reward only on termination; 0 otherwise.
        # Note: a truncated (time-limited) episode pays out nothing.
        score = self.running_reward if terminated else 0
        return (
            {"obs": obs, "action_mask": np.array([1, 1], dtype=np.int8)},
            score,
            terminated,
            truncated,
            info,
        )

    def set_state(self, state):
        """Restores env and running reward from a `get_state()` snapshot."""
        self.running_reward = state[1]
        self.env = deepcopy(state[0])
        obs = np.array(list(self.env.unwrapped.state))
        return {"obs": obs, "action_mask": np.array([1, 1], dtype=np.int8)}

    def get_state(self):
        """Returns a deep-copied snapshot of (wrapped env, running reward)."""
        return deepcopy(self.env), self.running_reward
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/cartpole_with_dict_observation_space.py
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gymnasium as gym
|
| 2 |
+
from gymnasium.envs.classic_control import CartPoleEnv
|
| 3 |
+
import numpy as np
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class CartPoleWithDictObservationSpace(CartPoleEnv):
    """CartPole gym environment that has a dict observation space.

    However, otherwise, the information content in each observation remains the same.

    https://github.com/Farama-Foundation/Gymnasium/blob/main/gymnasium/envs/classic_control/cartpole.py # noqa

    The new observation space looks as follows (a little quirky, but this is
    for testing purposes only):

    gym.spaces.Dict({
        "x-pos": [x-pos],
        "angular-pos": gym.spaces.Dict({"test": [angular-pos]}),
        "velocs": gym.spaces.Tuple([x-veloc, angular-veloc]),
    })
    """

    def __init__(self, config=None):
        super().__init__()

        # Bounds of the original (flat) CartPole observation space.
        lows = self.observation_space.low
        highs = self.observation_space.high

        # Deliberately quirky space for testing: a Dict inside a Dict, a
        # Tuple inside a Dict, both (1,)- and ()-shaped Boxes, plus a random
        # (non-essential) Discrete.
        self.observation_space = gym.spaces.Dict(
            {
                "x-pos": gym.spaces.Box(lows[0], highs[0], (1,), dtype=np.float32),
                "angular-pos": gym.spaces.Dict(
                    {
                        "value": gym.spaces.Box(
                            lows[2], highs[2], (), dtype=np.float32
                        ),
                        # Some random, non-essential information.
                        "some_random_stuff": gym.spaces.Discrete(3),
                    }
                ),
                "velocs": gym.spaces.Tuple(
                    [
                        # x-veloc
                        gym.spaces.Box(lows[1], highs[1], (1,), dtype=np.float32),
                        # angular-veloc
                        gym.spaces.Box(lows[3], highs[3], (), dtype=np.float32),
                    ]
                ),
            }
        )

    def reset(self, *, seed=None, options=None):
        flat_obs, info = super().reset(seed=seed, options=options)
        return self._compile_current_obs(flat_obs), info

    def step(self, action):
        flat_obs, reward, done, truncated, info = super().step(action)
        return self._compile_current_obs(flat_obs), reward, done, truncated, info

    def _compile_current_obs(self, original_cartpole_obs):
        """Converts a flat [x-pos, x-veloc, angle, angle-veloc] obs to a dict."""
        x_pos, x_veloc, angle, angle_veloc = original_cartpole_obs
        return {
            "x-pos": np.array([x_pos], np.float32),
            "angular-pos": {
                "value": angle,
                "some_random_stuff": np.random.randint(3),
            },
            "velocs": (
                np.array([x_veloc], np.float32),
                np.array(angle_veloc, np.float32),
            ),
        }
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/cartpole_with_large_observation_space.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gymnasium as gym
|
| 2 |
+
from gymnasium.envs.classic_control import CartPoleEnv
|
| 3 |
+
import numpy as np
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class CartPoleWithLargeObservationSpace(CartPoleEnv):
    """CartPole gym environment that has a large dict observation space.

    However, otherwise, the information content in each observation remains the same.

    https://github.com/Farama-Foundation/Gymnasium/blob/main/gymnasium/envs/classic_control/cartpole.py # noqa

    The new observation space looks as follows (a little quirky, but this is
    for testing purposes only):

    gym.spaces.Dict({
        "1": gym.spaces.Tuple((
            gym.spaces.Discrete(100),
            gym.spaces.Box(0, 256, shape=(30,), dtype=float32),
        )),
        "2": gym.spaces.Tuple((
            gym.spaces.Discrete(100),
            gym.spaces.Box(0, 256, shape=(30,), dtype=float32),
        )),
        "3": ...
        "actual-obs": gym.spaces.Box(-inf, inf, (4,), float32),
    })
    """

    def __init__(self, config=None):
        super().__init__()

        # Bounds of the original (flat) CartPole observation space.
        low = self.observation_space.low
        high = self.observation_space.high

        # Build 100 filler sub-spaces ("0".."99"), each a (Discrete, Box)
        # tuple of pure noise, plus the actual 4D CartPole Box under
        # "actually-useful-stuff".
        space_dict = {}
        for idx in range(100):
            space_dict[str(idx)] = gym.spaces.Tuple(
                (
                    gym.spaces.Discrete(100),
                    gym.spaces.Box(0, 256, shape=(30,), dtype=np.float32),
                )
            )
        space_dict["actually-useful-stuff"] = gym.spaces.Box(
            low[0], high[0], (4,), np.float32
        )
        self.observation_space = gym.spaces.Dict(space_dict)

    def reset(self, *, seed=None, options=None):
        flat_obs, info = super().reset(seed=seed, options=options)
        return self._compile_current_obs(flat_obs), info

    def step(self, action):
        flat_obs, reward, done, truncated, info = super().step(action)
        return self._compile_current_obs(flat_obs), reward, done, truncated, info

    def _compile_current_obs(self, original_cartpole_obs):
        """Pads the real obs with freshly sampled noise for the filler keys."""
        padded = {
            str(idx): self.observation_space.spaces[str(idx)].sample()
            for idx in range(100)
        }
        padded["actually-useful-stuff"] = original_cartpole_obs
        return padded
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/cartpole_with_protobuf_observation_space.py
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gymnasium as gym
|
| 2 |
+
from gymnasium.envs.classic_control import CartPoleEnv
|
| 3 |
+
import numpy as np
|
| 4 |
+
|
| 5 |
+
from ray.rllib.examples.envs.classes.utils.cartpole_observations_proto import (
|
| 6 |
+
CartPoleObservation,
|
| 7 |
+
)
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class CartPoleWithProtobufObservationSpace(CartPoleEnv):
    """CartPole gym environment that has a protobuf observation space.

    Sometimes, it is more performant for an environment to publish its observations
    as a protobuf message (instead of a heavily nested Dict).

    The protobuf message used here is originally defined in the
    `./utils/cartpole_observations.proto` file. We converted this file into a python
    importable module by compiling it with:

    `protoc --python_out=. cartpole_observations.proto`

    .. which yielded the `cartpole_observations_proto.py` file in the same directory
    (we import this file's `CartPoleObservation` message here).

    The new observation space is a (binary) Box(0, 255, ([len of protobuf],), uint8).

    A ConnectorV2 pipeline or simpler gym.Wrapper will have to be used to convert this
    observation format into an NN-readable (e.g. float32) 1D tensor.
    """

    def __init__(self, config=None):
        super().__init__()
        # Serialize a dummy observation once to learn the (fixed) byte length
        # of the protobuf message; that length defines the binary Box space.
        probe = self._convert_observation_to_protobuf(
            np.array([1.0, 1.0, 1.0, 1.0])
        )
        self.observation_space = gym.spaces.Box(0, 255, (len(probe),), np.uint8)

    def step(self, action):
        raw_obs, reward, terminated, truncated, info = super().step(action)
        return (
            self._convert_observation_to_protobuf(raw_obs),
            reward,
            terminated,
            truncated,
            info,
        )

    def reset(self, **kwargs):
        raw_obs, info = super().reset(**kwargs)
        return self._convert_observation_to_protobuf(raw_obs), info

    def _convert_observation_to_protobuf(self, observation):
        """Serializes a raw 4D CartPole observation into a uint8 ndarray."""
        x_pos, x_veloc, angle_pos, angle_veloc = observation

        # Fill the protobuf message field by field.
        msg = CartPoleObservation()
        msg.x_pos = x_pos
        msg.x_veloc = x_veloc
        msg.angle_pos = angle_pos
        msg.angle_veloc = angle_veloc

        # Serialize to a binary string and view it as a uint8 array.
        return np.frombuffer(msg.SerializeToString(), np.uint8)
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
if __name__ == "__main__":
    env = CartPoleWithProtobufObservationSpace()
    obs, info = env.reset()

    # Round-trip check: load a protobuf object with data from the obs binary
    # string (uint8 ndarray) and print it.
    decoded = CartPoleObservation()
    decoded.ParseFromString(obs.tobytes())
    print(decoded)

    # Roll out one episode with randomly sampled actions.
    terminated = truncated = False
    while not (terminated or truncated):
        obs, reward, terminated, truncated, info = env.step(
            env.action_space.sample()
        )

    print(obs)
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/cliff_walking_wall_env.py
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gymnasium as gym
|
| 2 |
+
from gymnasium import spaces
|
| 3 |
+
|
| 4 |
+
ACTION_UP = 0
|
| 5 |
+
ACTION_RIGHT = 1
|
| 6 |
+
ACTION_DOWN = 2
|
| 7 |
+
ACTION_LEFT = 3
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class CliffWalkingWallEnv(gym.Env):
    """Modified version of the CliffWalking environment from Farama-Foundation's
    Gymnasium with walls instead of a cliff.

    ### Description
    The board is a 4x12 matrix, with (using NumPy matrix indexing):
    - [3, 0] or obs==36 as the start at bottom-left
    - [3, 11] or obs==47 as the goal at bottom-right
    - [3, 1..10] or obs==37...46 as the cliff at bottom-center

    An episode terminates when the agent reaches the goal.

    ### Actions
    There are 4 discrete deterministic actions:
    - 0: move up
    - 1: move right
    - 2: move down
    - 3: move left
    You can also use the constants ACTION_UP, ACTION_RIGHT, ... defined above.

    ### Observations
    There are 3x12 + 2 possible states, not including the walls. If an action
    would move an agent into one of the walls, it simply stays in the same position.

    ### Reward
    Each time step incurs -1 reward, except reaching the goal which gives +10 reward.
    """

    def __init__(self, seed=42) -> None:
        self.observation_space = spaces.Discrete(48)
        self.action_space = spaces.Discrete(4)
        # Seed the spaces so `sample()` calls are reproducible.
        self.observation_space.seed(seed)
        self.action_space.seed(seed)

    def reset(self, *, seed=None, options=None):
        # Episodes always begin in the bottom-left start cell (obs 36).
        self.position = 36
        return self.position, {}

    def step(self, action):
        # Decode the flat position into (row, col) on the 4x12 grid.
        row, col = divmod(self.position, 12)
        if action == ACTION_UP:
            row = max(row - 1, 0)
        elif action == ACTION_RIGHT:
            # Moving right from the start cell is blocked by the wall.
            if self.position != 36:
                col = min(col + 1, 11)
        elif action == ACTION_DOWN:
            # Moving down from cells 25..34 (directly above the wall row) is
            # blocked.
            if not (25 <= self.position <= 34):
                row = min(row + 1, 3)
        elif action == ACTION_LEFT:
            # Moving left from the goal cell is blocked by the wall.
            if self.position != 47:
                col = max(col - 1, 0)
        else:
            raise ValueError(f"action {action} not in {self.action_space}")
        self.position = row * 12 + col
        done = self.position == 47
        return self.position, (10 if done else -1), done, False, {}
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/correlated_actions_env.py
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Any, Dict, Optional
|
| 2 |
+
|
| 3 |
+
import gymnasium as gym
|
| 4 |
+
import numpy as np
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class CorrelatedActionsEnv(gym.Env):
    """Environment that can only be solved through an autoregressive action model.

    In each step, the agent observes a random number (between -1 and 1) and has
    to choose two actions, a1 (discrete, 0, 1, or 2) and a2 (cont. between -1 and 1).

    The reward is constructed such that actions need to be correlated to succeed. It's
    impossible for the network to learn each action head separately.

    There are two reward components:
    The first is the negative absolute value of the delta between 1.0 and the sum of
    obs + a1. For example, if obs is -0.3 and a1 was sampled to be 1, then the value of
    the first reward component is:
    r1 = -abs(1.0 - [obs+a1]) = -abs(1.0 - (-0.3 + 1)) = -abs(0.3) = -0.3
    The second reward component is computed as the negative absolute value
    of `obs + a1 + a2`. For example, if obs is 0.5, a1 was sampled to be 0,
    and a2 was sampled to be -0.7, then the value of the second reward component is:
    r2 = -abs(obs + a1 + a2) = -abs(0.5 + 0 - 0.7)) = -abs(-0.2) = -0.2

    Because of this specific reward function, the agent must learn to optimally sample
    a1 based on the observation and to optimally sample a2, based on the observation
    AND the sampled value of a1.

    One way to effectively learn this is through correlated action
    distributions, e.g., in examples/actions/auto_regressive_actions.py

    The game ends after the first step.
    """

    def __init__(self, config=None):
        super().__init__()
        # Observation space (single continuous value between -1. and 1.).
        self.observation_space = gym.spaces.Box(-1.0, 1.0, shape=(1,), dtype=np.float32)

        # Action space (discrete action a1 and continuous action a2).
        self.action_space = gym.spaces.Tuple(
            [gym.spaces.Discrete(3), gym.spaces.Box(-2.0, 2.0, (1,), np.float32)]
        )

        # Current observation; set in `reset()`.
        self.obs = None

    def reset(
        self, seed: Optional[int] = None, options: Optional[Dict[str, Any]] = None
    ):
        """Reset the environment to an initial state."""
        super().reset(seed=seed, options=options)

        # Bug fixes: draw from the env's own seeded RNG (`self.np_random`,
        # initialized by `super().reset(seed=...)`) instead of the global
        # `np.random`, so seeded resets are actually reproducible; and cast to
        # float32 so the returned obs lies within `self.observation_space`
        # (which is declared float32).
        self.obs = self.np_random.uniform(-1, 1, size=(1,)).astype(np.float32)

        return self.obs, {}

    def step(self, action):
        """Apply the autoregressive action and return step information."""

        # Extract individual action components, a1 and a2.
        a1, a2 = action
        a2 = a2[0]  # dissolve shape=(1,)

        # r1 depends on how well a1 is aligned to obs:
        r1 = -abs(1.0 - (self.obs[0] + a1))
        # r2 depends on how well a2 is aligned to both, obs and a1.
        r2 = -abs(self.obs[0] + a1 + a2)

        reward = r1 + r2

        # Terminate after each step (no episode length in this simple example).
        return self.obs, reward, True, False, {}
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/d4rl_env.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
8 Environments from D4RL Environment.
|
| 3 |
+
Use fully qualified class-path in your configs:
|
| 4 |
+
e.g. "env": "ray.rllib.examples.envs.classes.d4rl_env.halfcheetah_random".
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import gymnasium as gym
|
| 8 |
+
|
| 9 |
+
try:
|
| 10 |
+
import d4rl
|
| 11 |
+
|
| 12 |
+
d4rl.__name__ # Fool LINTer.
|
| 13 |
+
except ImportError:
|
| 14 |
+
d4rl = None
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def halfcheetah_random():
    """Creates the D4RL `halfcheetah-random-v0` environment."""
    return gym.make("halfcheetah-random-v0")


def halfcheetah_medium():
    """Creates the D4RL `halfcheetah-medium-v0` environment."""
    return gym.make("halfcheetah-medium-v0")


def halfcheetah_expert():
    """Creates the D4RL `halfcheetah-expert-v0` environment."""
    return gym.make("halfcheetah-expert-v0")


def halfcheetah_medium_replay():
    """Creates the D4RL `halfcheetah-medium-replay-v0` environment."""
    return gym.make("halfcheetah-medium-replay-v0")


def hopper_random():
    """Creates the D4RL `hopper-random-v0` environment."""
    return gym.make("hopper-random-v0")


def hopper_medium():
    """Creates the D4RL `hopper-medium-v0` environment."""
    return gym.make("hopper-medium-v0")


def hopper_expert():
    """Creates the D4RL `hopper-expert-v0` environment."""
    return gym.make("hopper-expert-v0")


def hopper_medium_replay():
    """Creates the D4RL `hopper-medium-replay-v0` environment."""
    return gym.make("hopper-medium-replay-v0")
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/debug_counter_env.py
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gymnasium as gym
|
| 2 |
+
import numpy as np
|
| 3 |
+
|
| 4 |
+
from ray.rllib.env.multi_agent_env import MultiAgentEnv
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class DebugCounterEnv(gym.Env):
    """Simple Env that yields a timestep counter as observation (0-based).

    Actions have no effect. The episode length is always 15 and reward at
    each step is `current ts % 3`.
    """

    def __init__(self, config=None):
        cfg = config or {}
        self.action_space = gym.spaces.Discrete(2)
        self.observation_space = gym.spaces.Box(0, 100, (1,), dtype=np.float32)
        # Optional offset at which the counter (re)starts.
        self.start_at_t = int(cfg.get("start_at_t", 0))
        self.i = self.start_at_t

    def reset(self, *, seed=None, options=None):
        # Rewind the counter to its configured offset.
        self.i = self.start_at_t
        return self._get_obs(), {}

    def step(self, action):
        # The action is ignored; only the internal counter advances.
        self.i += 1
        # Episodes never terminate naturally; they truncate after 15 steps.
        truncated = self.i >= 15 + self.start_at_t
        return self._get_obs(), float(self.i % 3), False, truncated, {}

    def _get_obs(self):
        # Single-element float32 vector holding the current counter value.
        return np.array([self.i], dtype=np.float32)
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
class MultiAgentDebugCounterEnv(MultiAgentEnv):
    """Multi-agent counter env echoing agent ID, action, and timestep.

    Observation dims:
    0=agent ID.
    1=episode ID (0.0 for obs after reset).
    2=env ID (0.0 for obs after reset).
    3=ts (of the agent).
    """

    def __init__(self, config):
        super().__init__()
        self.num_agents = config["num_agents"]
        self.base_episode_len = config.get("base_episode_len", 103)

        agent_ids = list(range(self.num_agents))
        self.observation_space = gym.spaces.Dict(
            {
                aid: gym.spaces.Box(float("-inf"), float("inf"), (4,))
                for aid in agent_ids
            }
        )
        # Actions are always: (episodeID, envID) as floats.
        self.action_space = gym.spaces.Dict(
            {
                aid: gym.spaces.Box(-float("inf"), float("inf"), shape=(2,))
                for aid in agent_ids
            }
        )

        self.timesteps = [0] * self.num_agents
        self.terminateds = set()
        self.truncateds = set()

    def reset(self, *, seed=None, options=None):
        # Clear all per-agent bookkeeping for a fresh episode.
        self.timesteps = [0] * self.num_agents
        self.terminateds = set()
        self.truncateds = set()
        first_obs = {
            aid: np.array([aid, 0.0, 0.0, 0.0], dtype=np.float32)
            for aid in range(self.num_agents)
        }
        return first_obs, {}

    def step(self, action_dict):
        obs, rew, terminated, truncated = {}, {}, {}, {}
        for aid, action in action_dict.items():
            self.timesteps[aid] += 1
            ts = self.timesteps[aid]
            obs[aid] = np.array([aid, action[0], action[1], ts])
            rew[aid] = ts % 3
            # Agents never terminate; each truncates after its individual
            # horizon (base length + agent ID).
            terminated[aid] = False
            truncated[aid] = ts > self.base_episode_len + aid
            if terminated[aid]:
                self.terminateds.add(aid)
            if truncated[aid]:
                self.truncateds.add(aid)
        terminated["__all__"] = len(self.terminateds) == self.num_agents
        truncated["__all__"] = len(self.truncateds) == self.num_agents
        return obs, rew, terminated, truncated, {}
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/deterministic_envs.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gymnasium as gym
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
def create_cartpole_deterministic(config):
    """Create a CartPole-v1 env seeded once via `config["seed"]` (default 0).

    NOTE(review): only this initial `reset(seed=...)` is seeded; later resets
    by the caller reuse the env's internal RNG state - confirm this is the
    intended notion of "deterministic" for callers.
    """
    env = gym.make("CartPole-v1")
    env.reset(seed=config.get("seed", 0))
    return env


def create_pendulum_deterministic(config):
    """Create a Pendulum-v1 env seeded once via `config["seed"]` (default 0).

    NOTE(review): only this initial `reset(seed=...)` is seeded; later resets
    by the caller reuse the env's internal RNG state.
    """
    env = gym.make("Pendulum-v1")
    env.reset(seed=config.get("seed", 0))
    return env
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/dm_control_suite.py
ADDED
|
@@ -0,0 +1,131 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from ray.rllib.env.wrappers.dm_control_wrapper import DMCEnv
|
| 2 |
+
|
| 3 |
+
"""
|
| 4 |
+
8 Environments from Deepmind Control Suite
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def _make_dmc_env(domain, task, from_pixels, height, width, frame_skip, channels_first):
    """Shared factory: build a `DMCEnv` for the given domain/task pair.

    Extracted because every public factory below passed the exact same
    keyword arguments through to `DMCEnv`.
    """
    return DMCEnv(
        domain,
        task,
        from_pixels=from_pixels,
        height=height,
        width=width,
        frame_skip=frame_skip,
        channels_first=channels_first,
    )


def acrobot_swingup(
    from_pixels=True, height=64, width=64, frame_skip=2, channels_first=True
):
    """Acrobot 'swingup' task of the DeepMind Control Suite."""
    return _make_dmc_env(
        "acrobot", "swingup", from_pixels, height, width, frame_skip, channels_first
    )


def walker_walk(
    from_pixels=True, height=64, width=64, frame_skip=2, channels_first=True
):
    """Walker 'walk' task of the DeepMind Control Suite."""
    return _make_dmc_env(
        "walker", "walk", from_pixels, height, width, frame_skip, channels_first
    )


def hopper_hop(
    from_pixels=True, height=64, width=64, frame_skip=2, channels_first=True
):
    """Hopper 'hop' task of the DeepMind Control Suite."""
    return _make_dmc_env(
        "hopper", "hop", from_pixels, height, width, frame_skip, channels_first
    )


def hopper_stand(
    from_pixels=True, height=64, width=64, frame_skip=2, channels_first=True
):
    """Hopper 'stand' task of the DeepMind Control Suite."""
    return _make_dmc_env(
        "hopper", "stand", from_pixels, height, width, frame_skip, channels_first
    )


def cheetah_run(
    from_pixels=True, height=64, width=64, frame_skip=2, channels_first=True
):
    """Cheetah 'run' task of the DeepMind Control Suite."""
    return _make_dmc_env(
        "cheetah", "run", from_pixels, height, width, frame_skip, channels_first
    )


def walker_run(
    from_pixels=True, height=64, width=64, frame_skip=2, channels_first=True
):
    """Walker 'run' task of the DeepMind Control Suite."""
    return _make_dmc_env(
        "walker", "run", from_pixels, height, width, frame_skip, channels_first
    )


def pendulum_swingup(
    from_pixels=True, height=64, width=64, frame_skip=2, channels_first=True
):
    """Pendulum 'swingup' task of the DeepMind Control Suite."""
    return _make_dmc_env(
        "pendulum", "swingup", from_pixels, height, width, frame_skip, channels_first
    )


def cartpole_swingup(
    from_pixels=True, height=64, width=64, frame_skip=2, channels_first=True
):
    """Cartpole 'swingup' task of the DeepMind Control Suite."""
    return _make_dmc_env(
        "cartpole", "swingup", from_pixels, height, width, frame_skip, channels_first
    )


def humanoid_walk(
    from_pixels=True, height=64, width=64, frame_skip=2, channels_first=True
):
    """Humanoid 'walk' task of the DeepMind Control Suite."""
    return _make_dmc_env(
        "humanoid", "walk", from_pixels, height, width, frame_skip, channels_first
    )
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/env_using_remote_actor.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Example of an environment that uses a named remote actor as parameter
|
| 3 |
+
server.
|
| 4 |
+
|
| 5 |
+
"""
|
| 6 |
+
from gymnasium.envs.classic_control.cartpole import CartPoleEnv
|
| 7 |
+
from gymnasium.utils import seeding
|
| 8 |
+
|
| 9 |
+
import ray
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
@ray.remote
class ParameterStorage:
    """Named remote actor acting as a central parameter server for envs."""

    def get_params(self, rng):
        """Sample a fresh set of env parameters from the caller-provided RNG.

        Using the caller's RNG (instead of actor-local state) keeps sampling
        reproducible per env and avoids RNG clashes between envs sharing
        this actor.
        """
        return {
            "MASSCART": rng.uniform(low=0.5, high=2.0),
        }
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class CartPoleWithRemoteParamServer(CartPoleEnv):
    """CartPoleMassEnv varies the weights of the cart and the pole.

    On every `reset()`, the cart mass is drawn from a named remote
    `ParameterStorage` actor (looked up via `env_config["param_server"]`).
    """

    def __init__(self, env_config):
        # `env_config` is expected to provide `worker_index` and (optionally)
        # the name of the param-server actor - TODO confirm against callers.
        self.env_config = env_config
        super().__init__()
        # Get our param server (remote actor) by name.
        self._handler = ray.get_actor(env_config.get("param_server", "param-server"))
        self.rng_seed = None
        self.np_random, _ = seeding.np_random(self.rng_seed)

    def reset(self, *, seed=None, options=None):
        # An explicit seed re-seeds our local RNG and is remembered so
        # subsequent resets stay deterministic.
        if seed is not None:
            self.rng_seed = int(seed)
            self.np_random, _ = seeding.np_random(seed)
            print(
                f"Seeding env (worker={self.env_config.worker_index}) " f"with {seed}"
            )

        # Pass in our RNG to guarantee no race conditions.
        # If `self._handler` had its own RNG, this may clash with other
        # envs trying to use the same param-server.
        params = ray.get(self._handler.get_params.remote(self.np_random))

        # IMPORTANT: Advance the state of our RNG (self._rng was passed
        # above via ray (serialized) and thus not altered locally here!).
        # Or create a new RNG from another random number:
        # Seed the RNG with a deterministic seed if set, otherwise, create
        # a random one.
        new_seed = int(
            self.np_random.integers(0, 1000000) if not self.rng_seed else self.rng_seed
        )
        self.np_random, _ = seeding.np_random(new_seed)

        print(
            f"Env worker-idx={self.env_config.worker_index} "
            f"mass={params['MASSCART']}"
        )

        # Apply the sampled cart mass and refresh CartPole's derived
        # quantities that depend on it.
        self.masscart = params["MASSCART"]
        self.total_mass = self.masspole + self.masscart
        self.polemass_length = self.masspole * self.length

        # NOTE(review): the seed is deliberately NOT forwarded here - the
        # parent reset would otherwise replace our freshly advanced RNG.
        return super().reset()
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/env_with_subprocess.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import atexit
|
| 2 |
+
import gymnasium as gym
|
| 3 |
+
from gymnasium.spaces import Discrete
|
| 4 |
+
import os
|
| 5 |
+
import subprocess
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class EnvWithSubprocess(gym.Env):
    """An env that spawns a subprocess.

    Test fixture: observations/rewards are trivial; the point is the
    subprocess and the tmp files whose cleanup is verified externally.
    """

    # Dummy command to run as a subprocess with a unique name
    UNIQUE_CMD = "sleep 20"

    def __init__(self, config):
        # Four externally created tmp files; which ones get cleaned up
        # depends on the worker index (see below and `close()`).
        self.UNIQUE_FILE_0 = config["tmp_file1"]
        self.UNIQUE_FILE_1 = config["tmp_file2"]
        self.UNIQUE_FILE_2 = config["tmp_file3"]
        self.UNIQUE_FILE_3 = config["tmp_file4"]

        self.action_space = Discrete(2)
        self.observation_space = Discrete(2)
        # Subprocess that should be cleaned up.
        self.subproc = subprocess.Popen(self.UNIQUE_CMD.split(" "), shell=False)
        self.config = config
        # Exit handler should be called.
        # NOTE(review): these lambdas capture `self`, keeping the env alive
        # until interpreter exit - intentional for this test fixture.
        atexit.register(lambda: self.subproc.kill())
        if config.worker_index == 0:
            atexit.register(lambda: os.unlink(self.UNIQUE_FILE_0))
        else:
            atexit.register(lambda: os.unlink(self.UNIQUE_FILE_1))

    def close(self):
        # Remove the worker-specific file; raises FileNotFoundError if it
        # was already removed (callers are expected to close only once).
        if self.config.worker_index == 0:
            os.unlink(self.UNIQUE_FILE_2)
        else:
            os.unlink(self.UNIQUE_FILE_3)

    def reset(self, *, seed=None, options=None):
        # Constant observation; no state to reset.
        return 0, {}

    def step(self, action):
        # One-step episodes: terminate immediately with zero reward.
        return 0, 0, True, False, {}
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/fast_image_env.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gymnasium as gym
|
| 2 |
+
from gymnasium.spaces import Box, Discrete
|
| 3 |
+
import numpy as np
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class FastImageEnv(gym.Env):
    """Trivial image-observation env for measuring sampling throughput.

    Always returns the same all-zero 84x84x4 image, reward 1 per step;
    episodes truncate after 1000 steps. Actions are ignored.
    """

    def __init__(self, config):
        # Bug fix: allocate the constant observation with the same dtype as
        # the declared observation space. `np.zeros` defaults to float64,
        # which does not match the float32 `Box` (and fails gymnasium's
        # `Box.contains` dtype check).
        self.zeros = np.zeros((84, 84, 4), dtype=np.float32)
        self.action_space = Discrete(2)
        self.observation_space = Box(0.0, 1.0, shape=(84, 84, 4), dtype=np.float32)
        # Step counter within the current episode.
        self.i = 0

    def reset(self, *, seed=None, options=None):
        self.i = 0
        return self.zeros, {}

    def step(self, action):
        self.i += 1
        # Terminate and truncate together once the step limit is exceeded.
        done = truncated = self.i > 1000
        return self.zeros, 1, done, truncated, {}
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/gpu_requiring_env.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
|
| 3 |
+
import ray
|
| 4 |
+
from ray.rllib.examples.envs.classes.simple_corridor import SimpleCorridor
|
| 5 |
+
from ray.rllib.utils.framework import try_import_torch
|
| 6 |
+
|
| 7 |
+
torch, _ = try_import_torch()
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class GPURequiringEnv(SimpleCorridor):
    """A dummy env that requires a GPU in order to work.

    The env here is a simple corridor env that additionally simulates a GPU
    check in its constructor via `ray.get_gpu_ids()`. If this returns an
    empty list, we raise an error.

    To make this env work, use `num_gpus_per_env_runner > 0` (RolloutWorkers
    requesting this many GPUs each) and - maybe - `num_gpus > 0` in case
    your local worker/driver must have an env as well. However, this is
    only the case if `create_env_on_driver`=True (default is False).
    """

    def __init__(self, config=None):
        super().__init__(config)

        # Fake-require some GPUs (at least one).
        # If your local worker's env (`create_env_on_driver`=True) does not
        # necessarily require a GPU, you can perform the below assertion only
        # if `config.worker_index != 0`.
        gpus_available = ray.get_gpu_ids()
        print(f"{type(self).__name__} can see GPUs={gpus_available}")

        # Create a dummy tensor on the GPU.
        # NOTE(review): despite the class docstring, no error is actually
        # raised when no GPU is visible - the allocation is simply skipped.
        if len(gpus_available) > 0 and torch:
            self._tensor = torch.from_numpy(np.random.random_sample(size=(42, 42))).to(
                f"cuda:{gpus_available[0]}"
            )
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/look_and_push.py
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gymnasium as gym
|
| 2 |
+
import numpy as np
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
class LookAndPush(gym.Env):
    """Memory-requiring Env: Best sequence of actions depends on prev. states.

    Optimal behavior:
    0) a=0 -> observe next state (s'), which is the "hidden" state.
       If a=1 here, the hidden state is not observed.
    1) a=1 to always jump to s=2 (not matter what the prev. state was).
    2) a=1 to move to s=3.
    3) a=1 to move to s=4.
    4) a=0 OR 1 depending on s' observed after 0): +10 reward and done.
       otherwise: -10 reward and done.
    """

    def __init__(self):
        self.action_space = gym.spaces.Discrete(2)
        self.observation_space = gym.spaces.Discrete(5)
        # Current (observable) state in [0, 4].
        self._state = None
        # Hidden binary case deciding which final action pays off.
        self._case = None

    def reset(self, *, seed=None, options=None):
        # NOTE(review): `seed` is ignored; the hidden case is drawn from the
        # global numpy RNG - confirm whether per-env seeding is needed.
        self._state = 2
        self._case = np.random.choice(2)
        return self._state, {}

    def step(self, action):
        assert self.action_space.contains(action)

        if self._state == 4:
            # Terminal decision step: payoff depends on the hidden case.
            # Bug fix: return the full gymnasium 5-tuple
            # (obs, reward, terminated, truncated, info) - the `truncated`
            # flag was missing in these two branches.
            if action and self._case:
                return self._state, 10.0, True, False, {}
            else:
                return self._state, -10.0, True, False, {}
        else:
            if action:
                # "Push": from s=0 jump back to s=2; otherwise advance by 1.
                if self._state == 0:
                    self._state = 2
                else:
                    self._state += 1
            elif self._state == 2:
                # "Look" at s=2: reveal the hidden case as the next state.
                self._state = self._case

            return self._state, -1, False, False, {}
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
class OneHot(gym.Wrapper):
    """Wrapper that one-hot encodes the wrapped env's discrete observations."""

    def __init__(self, env):
        super(OneHot, self).__init__(env)
        # Replace the Discrete(n) space with an n-dim Box in [0, 1].
        self.observation_space = gym.spaces.Box(0.0, 1.0, (env.observation_space.n,))

    def reset(self, *, seed=None, options=None):
        obs, info = self.env.reset(seed=seed, options=options)
        return self._encode_obs(obs), info

    def step(self, action):
        obs, reward, terminated, truncated, info = self.env.step(action)
        return self._encode_obs(obs), reward, terminated, truncated, info

    def _encode_obs(self, obs):
        # Bug fix: start from zeros so the result is a true one-hot vector;
        # starting from `np.ones` made every component 1.0 regardless of
        # which observation was being encoded.
        new_obs = np.zeros(self.env.observation_space.n)
        new_obs[obs] = 1.0
        return new_obs
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/memory_leaking_env.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
import uuid
|
| 3 |
+
|
| 4 |
+
from ray.rllib.examples.envs.classes.random_env import RandomEnv
|
| 5 |
+
from ray.rllib.utils.annotations import override
|
| 6 |
+
|
| 7 |
+
logger = logging.getLogger(__name__)
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class MemoryLeakingEnv(RandomEnv):
    """An env that leaks very little memory.

    Useful for proving that our memory-leak tests can catch the
    slightest leaks.
    """

    def __init__(self, config=None):
        super().__init__(config)
        # Dict that grows by one entry per episode and is never cleared ->
        # this is the deliberate "leak".
        self._leak = {}
        # Steps taken since the last `reset()`.
        self._steps_after_reset = 0

    @override(RandomEnv)
    def reset(self, *, seed=None, options=None):
        self._steps_after_reset = 0
        return super().reset(seed=seed, options=options)

    @override(RandomEnv)
    def step(self, action):
        self._steps_after_reset += 1

        # Only leak once an episode.
        # A random hex key guarantees each leaked entry is unique (never
        # overwritten), so memory grows monotonically with episode count.
        if self._steps_after_reset == 2:
            self._leak[uuid.uuid4().hex.upper()] = 1

        return super().step(action)
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/mock_env.py
ADDED
|
@@ -0,0 +1,220 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gymnasium as gym
|
| 2 |
+
import numpy as np
|
| 3 |
+
from typing import Optional
|
| 4 |
+
|
| 5 |
+
from ray.rllib.env.vector_env import VectorEnv
|
| 6 |
+
from ray.rllib.utils.annotations import override
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class MockEnv(gym.Env):
    """Mock environment for testing purposes.

    Observation=0, reward=1.0, episode-len is configurable.
    Actions are ignored.
    """

    def __init__(self, episode_length, config=None):
        self.episode_length = episode_length
        self.config = config
        self.i = 0
        self.observation_space = gym.spaces.Discrete(1)
        self.action_space = gym.spaces.Discrete(2)

    def reset(self, *, seed=None, options=None):
        # Rewind the internal step counter; obs is always 0.
        self.i = 0
        return 0, {}

    def step(self, action):
        # Ignore the action; end (terminate AND truncate) once the
        # configured episode length is reached.
        self.i += 1
        done = self.i >= self.episode_length
        return 0, 1.0, done, done, {}
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
class MockEnv2(gym.Env):
    """Mock environment for testing purposes.

    Observation=ts (discrete space!), reward=100.0, episode-len is
    configurable. Actions are ignored.
    """

    metadata = {
        "render.modes": ["rgb_array"],
    }
    render_mode: Optional[str] = "rgb_array"

    def __init__(self, episode_length):
        self.episode_length = episode_length
        self.i = 0
        self.observation_space = gym.spaces.Discrete(self.episode_length + 1)
        self.action_space = gym.spaces.Discrete(2)
        self.rng_seed = None

    def reset(self, *, seed=None, options=None):
        self.i = 0
        # Remember the last explicit seed (no RNG is actually consumed).
        if seed is not None:
            self.rng_seed = seed
        return self.i, {}

    def step(self, action):
        # Ignore the action; observation is simply the step counter.
        self.i += 1
        done = self.i >= self.episode_length
        return self.i, 100.0, done, done, {}

    def render(self):
        # Just a random RGB frame for demonstration purposes; see
        # `gym/envs/classic_control/cartpole.py` for a real Viewer example.
        return np.random.randint(0, 256, size=(300, 400, 3), dtype=np.uint8)
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
class MockEnv3(gym.Env):
    """Mock environment for testing purposes.

    Observation=ts (discrete space!), reward=ts, episode-len is
    configurable. Actions are ignored. Infos carry the current timestep.
    """

    def __init__(self, episode_length):
        self.episode_length = episode_length
        self.i = 0
        self.observation_space = gym.spaces.Discrete(100)
        self.action_space = gym.spaces.Discrete(2)

    def reset(self, *, seed=None, options=None):
        self.i = 0
        return self.i, {"timestep": 0}

    def step(self, action):
        # Ignore the action; obs and reward both equal the step counter.
        self.i += 1
        limit_reached = self.i >= self.episode_length
        return self.i, self.i, limit_reached, limit_reached, {"timestep": self.i}
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
class VectorizedMockEnv(VectorEnv):
    """Vectorized version of the MockEnv.

    Contains `num_envs` MockEnv instances, each one having its own
    `episode_length` horizon.
    """

    def __init__(self, episode_length, num_envs):
        super().__init__(
            observation_space=gym.spaces.Discrete(1),
            action_space=gym.spaces.Discrete(2),
            num_envs=num_envs,
        )
        self.envs = [MockEnv(episode_length) for _ in range(num_envs)]

    @override(VectorEnv)
    def vector_reset(self, *, seeds=None, options=None):
        # Default to per-sub-env `None` seeds/options when not provided.
        seeds = seeds or [None] * self.num_envs
        options = options or [None] * self.num_envs
        observations, infos = [], []
        for idx, sub_env in enumerate(self.envs):
            o, i = sub_env.reset(seed=seeds[idx], options=options[idx])
            observations.append(o)
            infos.append(i)
        return observations, infos

    @override(VectorEnv)
    def reset_at(self, index, *, seed=None, options=None):
        # Reset only the sub-env at `index`.
        return self.envs[index].reset(seed=seed, options=options)

    @override(VectorEnv)
    def vector_step(self, actions):
        # Step every sub-env with its own action, then transpose the
        # per-env 5-tuples into five parallel batch lists.
        results = [env.step(actions[i]) for i, env in enumerate(self.envs)]
        if not results:
            return [], [], [], [], []
        obs, rewards, terminateds, truncateds, infos = map(list, zip(*results))
        return obs, rewards, terminateds, truncateds, infos

    @override(VectorEnv)
    def get_sub_environments(self):
        return self.envs
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
class MockVectorEnv(VectorEnv):
|
| 145 |
+
"""A custom vector env that uses a single(!) CartPole sub-env.
|
| 146 |
+
|
| 147 |
+
However, this env pretends to be a vectorized one to illustrate how one
|
| 148 |
+
could create custom VectorEnvs w/o the need for actual vectorizations of
|
| 149 |
+
sub-envs under the hood.
|
| 150 |
+
"""
|
| 151 |
+
|
| 152 |
+
def __init__(self, episode_length, mocked_num_envs):
|
| 153 |
+
self.env = gym.make("CartPole-v1")
|
| 154 |
+
super().__init__(
|
| 155 |
+
observation_space=self.env.observation_space,
|
| 156 |
+
action_space=self.env.action_space,
|
| 157 |
+
num_envs=mocked_num_envs,
|
| 158 |
+
)
|
| 159 |
+
self.episode_len = episode_length
|
| 160 |
+
self.ts = 0
|
| 161 |
+
|
| 162 |
+
@override(VectorEnv)
|
| 163 |
+
def vector_reset(self, *, seeds=None, options=None):
|
| 164 |
+
# Since we only have one underlying sub-environment, just use the first seed
|
| 165 |
+
# and the first options dict (the user of this env thinks, there are
|
| 166 |
+
# `self.num_envs` sub-environments and sends that many seeds/options).
|
| 167 |
+
seeds = seeds or [None]
|
| 168 |
+
options = options or [None]
|
| 169 |
+
obs, infos = self.env.reset(seed=seeds[0], options=options[0])
|
| 170 |
+
# Simply repeat the single obs/infos to pretend we really have
|
| 171 |
+
# `self.num_envs` sub-environments.
|
| 172 |
+
return (
|
| 173 |
+
[obs for _ in range(self.num_envs)],
|
| 174 |
+
[infos for _ in range(self.num_envs)],
|
| 175 |
+
)
|
| 176 |
+
|
| 177 |
+
@override(VectorEnv)
|
| 178 |
+
def reset_at(self, index, *, seed=None, options=None):
|
| 179 |
+
self.ts = 0
|
| 180 |
+
return self.env.reset(seed=seed, options=options)
|
| 181 |
+
|
| 182 |
+
@override(VectorEnv)
|
| 183 |
+
def vector_step(self, actions):
|
| 184 |
+
self.ts += 1
|
| 185 |
+
# Apply all actions sequentially to the same env.
|
| 186 |
+
# Whether this would make a lot of sense is debatable.
|
| 187 |
+
obs_batch, rew_batch, terminated_batch, truncated_batch, info_batch = (
|
| 188 |
+
[],
|
| 189 |
+
[],
|
| 190 |
+
[],
|
| 191 |
+
[],
|
| 192 |
+
[],
|
| 193 |
+
)
|
| 194 |
+
for i in range(self.num_envs):
|
| 195 |
+
obs, rew, terminated, truncated, info = self.env.step(actions[i])
|
| 196 |
+
# Artificially truncate once time step limit has been reached.
|
| 197 |
+
# Note: Also terminate/truncate, when underlying CartPole is
|
| 198 |
+
# terminated/truncated.
|
| 199 |
+
if self.ts >= self.episode_len:
|
| 200 |
+
truncated = True
|
| 201 |
+
obs_batch.append(obs)
|
| 202 |
+
rew_batch.append(rew)
|
| 203 |
+
terminated_batch.append(terminated)
|
| 204 |
+
truncated_batch.append(truncated)
|
| 205 |
+
info_batch.append(info)
|
| 206 |
+
if terminated or truncated:
|
| 207 |
+
remaining = self.num_envs - (i + 1)
|
| 208 |
+
obs_batch.extend([obs for _ in range(remaining)])
|
| 209 |
+
rew_batch.extend([rew for _ in range(remaining)])
|
| 210 |
+
terminated_batch.extend([terminated for _ in range(remaining)])
|
| 211 |
+
truncated_batch.extend([truncated for _ in range(remaining)])
|
| 212 |
+
info_batch.extend([info for _ in range(remaining)])
|
| 213 |
+
break
|
| 214 |
+
return obs_batch, rew_batch, terminated_batch, truncated_batch, info_batch
|
| 215 |
+
|
| 216 |
+
@override(VectorEnv)
def get_sub_environments(self):
    """Returns `num_envs` references to the single underlying env.

    You may also leave this method unimplemented, in which case the base
    class would return an empty list.
    """
    return [self.env] * self.num_envs
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/multi_agent/bandit_envs_discrete.py
ADDED
|
@@ -0,0 +1,206 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import copy
|
| 2 |
+
import gymnasium as gym
|
| 3 |
+
from gymnasium.spaces import Box, Discrete
|
| 4 |
+
import numpy as np
|
| 5 |
+
import random
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class SimpleContextualBandit(gym.Env):
    """Simple env w/ 2 states and 3 actions (arms): 0, 1, and 2.

    Episodes last only for one timestep, possible observations are:
    [-1.0, 1.0] and [1.0, -1.0], where the first element is the "current context".
    The highest reward (+10.0) is received for selecting arm 0 for context=1.0
    and arm 2 for context=-1.0. Action 1 always yields 0.0 reward.
    """

    def __init__(self, config=None):
        self.action_space = Discrete(3)
        self.observation_space = Box(low=-1.0, high=1.0, shape=(2,))
        # Set by `reset()`; either -1.0 or +1.0.
        self.cur_context = None

    def reset(self, *, seed=None, options=None):
        # Draw a fresh context uniformly from {-1.0, +1.0}.
        self.cur_context = random.choice([-1.0, 1.0])
        first_obs = np.array([self.cur_context, -self.cur_context])
        return first_obs, {}

    def step(self, action):
        # Per-context arm payouts: the best arm flips with the context's sign.
        arm_payouts = {
            -1.0: [-10, 0, 10],
            1.0: [10, 0, -10],
        }
        reward = arm_payouts[self.cur_context][action]
        next_obs = np.array([-self.cur_context, self.cur_context])
        # One-step episodes: always terminated, never truncated. Regret is
        # the gap to the best achievable payout (+10).
        return next_obs, reward, True, False, {"regret": 10 - reward}
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
class LinearDiscreteEnv(gym.Env):
    """Samples data from linearly parameterized arms.

    The reward for context X and arm i is given by X^T * theta_i, for some
    latent set of parameters {theta_i : i = 1, ..., k}.
    The thetas are sampled uniformly at random, the contexts are Gaussian,
    and Gaussian noise is added to the rewards.
    """

    DEFAULT_CONFIG_LINEAR = {
        "feature_dim": 8,
        "num_actions": 4,
        "reward_noise_std": 0.01,
    }

    def __init__(self, config=None):
        # Start from the defaults and overlay any user-provided dict config.
        self.config = copy.copy(self.DEFAULT_CONFIG_LINEAR)
        if config is not None and type(config) is dict:
            self.config.update(config)

        self.feature_dim = self.config["feature_dim"]
        self.num_actions = self.config["num_actions"]
        self.sigma = self.config["reward_noise_std"]

        self.action_space = Discrete(self.num_actions)
        self.observation_space = Box(low=-10, high=10, shape=(self.feature_dim,))

        # One latent parameter vector per arm, normalized to unit length.
        self.thetas = np.random.uniform(-1, 1, (self.num_actions, self.feature_dim))
        self.thetas /= np.linalg.norm(self.thetas, axis=1, keepdims=True)

        self._elapsed_steps = 0
        self._current_context = None

    def _sample_context(self):
        # Gaussian contexts; std=1/3 keeps most mass well inside the obs bounds.
        return np.random.normal(scale=1 / 3, size=(self.feature_dim,))

    def reset(self, *, seed=None, options=None):
        self._current_context = self._sample_context()
        return self._current_context, {}

    def step(self, action):
        # Fix: message previously read "beforecalling" (missing space);
        # now consistent with WheelBanditEnv's identical assertion.
        assert (
            self._elapsed_steps is not None
        ), "Cannot call env.step() before calling reset()"
        assert action < self.num_actions, "Invalid action."

        action = int(action)
        # Fix: step counter was never incremented (unlike WheelBanditEnv).
        self._elapsed_steps += 1
        context = self._current_context
        rewards = self.thetas.dot(context)

        opt_action = rewards.argmax()

        # Regret is computed on the noise-free expected rewards.
        regret = rewards.max() - rewards[action]

        # Add Gaussian noise
        rewards += np.random.normal(scale=self.sigma, size=rewards.shape)

        reward = rewards[action]
        # Bandit setting: each step is its own (terminated) episode; a fresh
        # context is sampled and returned as the next observation.
        self._current_context = self._sample_context()
        return (
            self._current_context,
            reward,
            True,
            False,
            {"regret": regret, "opt_action": opt_action},
        )

    def render(self, mode="human"):
        raise NotImplementedError
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
class WheelBanditEnv(gym.Env):
    """Wheel bandit environment for 2D contexts
    (see https://arxiv.org/abs/1802.09127).

    Contexts are sampled uniformly from the unit disk. Inside the inner
    disk (norm < delta) arm 0 (mean mu_1) is optimal; in the outer ring
    one of arms 1-4 (mean mu_3 >> mu_2) is optimal, depending on the
    context's quadrant.
    """

    # Default hyperparameters; overridable via a dict `config`.
    DEFAULT_CONFIG_WHEEL = {
        "delta": 0.5,
        "mu_1": 1.2,
        "mu_2": 1,
        "mu_3": 50,
        "std": 0.01,
    }

    # Fixed by the wheel-bandit problem definition.
    feature_dim = 2
    num_actions = 5

    def __init__(self, config=None):
        # Start from the defaults and overlay any user-provided dict config.
        self.config = copy.copy(self.DEFAULT_CONFIG_WHEEL)
        if config is not None and type(config) is dict:
            self.config.update(config)

        self.delta = self.config["delta"]
        self.mu_1 = self.config["mu_1"]
        self.mu_2 = self.config["mu_2"]
        self.mu_3 = self.config["mu_3"]
        self.std = self.config["std"]

        self.action_space = Discrete(self.num_actions)
        self.observation_space = Box(low=-1, high=1, shape=(self.feature_dim,))

        # Base mean rewards: arm 0 -> mu_1; arms 1..4 -> mu_2.
        self.means = [self.mu_1] + 4 * [self.mu_2]
        self._elapsed_steps = 0
        self._current_context = None

    def _sample_context(self):
        # Rejection-sample a point uniformly from the unit disk.
        while True:
            state = np.random.uniform(-1, 1, self.feature_dim)
            if np.linalg.norm(state) <= 1:
                return state

    def reset(self, *, seed=None, options=None):
        self._current_context = self._sample_context()
        return self._current_context, {}

    def step(self, action):
        assert (
            self._elapsed_steps is not None
        ), "Cannot call env.step() before calling reset()"

        action = int(action)
        self._elapsed_steps += 1
        # Draw noisy rewards around each arm's base mean.
        rewards = [
            np.random.normal(self.means[j], self.std) for j in range(self.num_actions)
        ]
        context = self._current_context
        # High payout used for the quadrant-optimal arm in the outer ring.
        r_big = np.random.normal(self.mu_3, self.std)

        if np.linalg.norm(context) >= self.delta:
            # Outer ring: the optimal arm depends on the context's quadrant.
            if context[0] > 0:
                if context[1] > 0:
                    # First quadrant
                    rewards[1] = r_big
                    opt_action = 1
                else:
                    # Fourth quadrant
                    rewards[4] = r_big
                    opt_action = 4
            else:
                if context[1] > 0:
                    # Second quadrant
                    rewards[2] = r_big
                    opt_action = 2
                else:
                    # Third quadrant
                    rewards[3] = r_big
                    opt_action = 3
        else:
            # Smaller region where action 0 is optimal
            opt_action = 0

        reward = rewards[action]

        # Regret vs. the realized reward of the optimal arm for this context.
        regret = rewards[opt_action] - reward

        # Bandit setting: each step is its own (terminated) episode.
        self._current_context = self._sample_context()
        return (
            self._current_context,
            reward,
            True,
            False,
            {"regret": regret, "opt_action": opt_action},
        )

    def render(self, mode="human"):
        raise NotImplementedError
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/multi_agent/guess_the_number_game.py
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gymnasium as gym
|
| 2 |
+
|
| 3 |
+
from ray.rllib.env.multi_agent_env import MultiAgentEnv
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class GuessTheNumberGame(MultiAgentEnv):
    """
    We have two players, 0 and 1. Agent 0 has to pick a number between 0, MAX-1
    at reset. Agent 1 has to guess the number by asking N questions of whether
    of the form of "a <number> is higher|lower|equal to the picked number. The
    action space is MultiDiscrete [3, MAX]. For the first index 0 means lower,
    1 means higher and 2 means equal. The environment answers with yes (1) or
    no (0) on the reward function. Every time step that agent 1 wastes agent 0
    gets a reward of 1. After N steps the game is terminated. If agent 1
    guesses the number correctly, it gets a reward of 100 points, otherwise it
    gets a reward of 0. On the other hand if agent 0 wins they win 100 points.
    The optimal policy controlling agent 1 should converge to a binary search
    strategy.
    """

    MAX_NUMBER = 3
    MAX_STEPS = 20

    def __init__(self, config=None):
        super().__init__()
        self._agent_ids = {0, 1}

        # Bug fix: `config` defaults to None, but was dereferenced via
        # `config.get(...)` unconditionally -> AttributeError when the env
        # was constructed without a config. Fall back to an empty dict.
        config = config or {}
        self.max_number = config.get("max_number", self.MAX_NUMBER)
        self.max_steps = config.get("max_steps", self.MAX_STEPS)

        # The number agent 0 picks on its first move; None until then.
        self._number = None
        # Observation is a dummy binary signal; the action is a pair of
        # (direction in {lower, higher, equal}, number in [0, max_number)).
        self.observation_space = gym.spaces.Discrete(2)
        self.action_space = gym.spaces.MultiDiscrete([3, self.max_number])

    def reset(self, *, seed=None, options=None):
        self._step = 0
        self._number = None
        # agent 0 has to pick a number. So the returned obs does not matter.
        return {0: 0}, {}

    def step(self, action_dict):
        # get agent 0's action
        agent_0_action = action_dict.get(0)

        if agent_0_action is not None:
            # ignore the first part of the action and look at the number
            self._number = agent_0_action[1]
            # next obs should tell agent 1 to start guessing.
            # the returned reward and dones should be on agent 0 who picked a
            # number.
            return (
                {1: 0},
                {0: 0},
                {0: False, "__all__": False},
                {0: False, "__all__": False},
                {},
            )

        if self._number is None:
            raise ValueError(
                "No number is selected by agent 0. Have you restarted "
                "the environment?"
            )

        # get agent 1's action
        direction, number = action_dict.get(1)
        info = {}
        # always the same, we don't need agent 0 to act ever again, agent 1 should keep
        # guessing.
        obs = {1: 0}
        guessed_correctly = False
        terminated = {1: False, "__all__": False}
        truncated = {1: False, "__all__": False}
        # everytime agent 1 does not guess correctly agent 0 gets a reward of 1.
        if direction == 0:  # lower
            reward = {1: int(number > self._number), 0: 1}
        elif direction == 1:  # higher
            reward = {1: int(number < self._number), 0: 1}
        else:  # equal
            guessed_correctly = number == self._number
            reward = {1: guessed_correctly * 100, 0: guessed_correctly * -100}
            terminated = {1: guessed_correctly, "__all__": guessed_correctly}

        self._step += 1
        if self._step >= self.max_steps:  # max number of steps episode is over
            truncated["__all__"] = True
            if not guessed_correctly:
                reward[0] = 100  # agent 0 wins
        return obs, reward, terminated, truncated, info
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/multi_agent/pettingzoo_chess.py
ADDED
|
@@ -0,0 +1,227 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pettingzoo import AECEnv
|
| 2 |
+
from pettingzoo.classic.chess.chess import raw_env as chess_v5
|
| 3 |
+
import copy
|
| 4 |
+
from ray.rllib.env.multi_agent_env import MultiAgentEnv
|
| 5 |
+
from typing import Dict, Any
|
| 6 |
+
import chess as ch
|
| 7 |
+
import numpy as np
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class MultiAgentChess(MultiAgentEnv):
    """An interface to the PettingZoo MARL environment library.

    See: https://github.com/Farama-Foundation/PettingZoo
    Inherits from MultiAgentEnv and exposes a given AEC
    (actor-environment-cycle) game from the PettingZoo project via the
    MultiAgentEnv public API.

    Note that the wrapper has some important limitations:
    1. All agents have the same action_spaces and observation_spaces.
       Note: If, within your aec game, agents do not have homogeneous action /
       observation spaces, apply SuperSuit wrappers
       to apply padding functionality: https://github.com/Farama-Foundation/
       SuperSuit#built-in-multi-agent-only-functions
    2. Environments are positive sum games (-> Agents are expected to cooperate
       to maximize reward). This isn't a hard restriction, it's just that
       standard algorithms aren't expected to work well in highly competitive
       games.

    .. testcode::
        :skipif: True

        from pettingzoo.butterfly import prison_v3
        from ray.rllib.env.wrappers.pettingzoo_env import PettingZooEnv
        env = PettingZooEnv(prison_v3.env())
        obs = env.reset()
        print(obs)
        # only returns the observation for the agent which should be stepping

    .. testoutput::

        {
            'prisoner_0': array([[[0, 0, 0],
                [0, 0, 0],
                [0, 0, 0],
                ...,
                [0, 0, 0],
                [0, 0, 0],
                [0, 0, 0]]], dtype=uint8)
        }

    .. testcode::
        :skipif: True

        obs, rewards, dones, infos = env.step({
            "prisoner_0": 1
        })
        # only returns the observation, reward, info, etc, for
        # the agent whose turn is next.
        print(obs)

    .. testoutput::

        {
            'prisoner_1': array([[[0, 0, 0],
                [0, 0, 0],
                [0, 0, 0],
                ...,
                [0, 0, 0],
                [0, 0, 0],
                [0, 0, 0]]], dtype=uint8)
        }

    .. testcode::
        :skipif: True

        print(rewards)

    .. testoutput::

        {
            'prisoner_1': 0
        }

    .. testcode::
        :skipif: True

        print(dones)

    .. testoutput::

        {
            'prisoner_1': False, '__all__': False
        }

    .. testcode::
        :skipif: True

        print(infos)

    .. testoutput::

        {
            'prisoner_1': {'map_tuple': (1, 0)}
        }
    """

    def __init__(
        self,
        config: Dict[Any, Any] = None,
        env: AECEnv = None,
    ):
        super().__init__()
        # Default to a fresh PettingZoo chess env if none is provided.
        if env is None:
            self.env = chess_v5()
        else:
            self.env = env
        # AEC envs must be reset before their spaces/agents can be queried.
        self.env.reset()

        self.config = config
        if self.config is None:
            self.config = {}
        # `random_start`: number of random opening plies (defaults to 4).
        try:
            self.config["random_start"] = self.config["random_start"]
        except KeyError:
            self.config["random_start"] = 4
        # Get first observation space, assuming all agents have equal space
        self.observation_space = self.env.observation_space(self.env.agents[0])

        # Get first action space, assuming all agents have equal space
        self.action_space = self.env.action_space(self.env.agents[0])

        assert all(
            self.env.observation_space(agent) == self.observation_space
            for agent in self.env.agents
        ), (
            "Observation spaces for all agents must be identical. Perhaps "
            "SuperSuit's pad_observations wrapper can help (useage: "
            "`supersuit.aec_wrappers.pad_observations(env)`"
        )

        assert all(
            self.env.action_space(agent) == self.action_space
            for agent in self.env.agents
        ), (
            "Action spaces for all agents must be identical. Perhaps "
            "SuperSuit's pad_action_space wrapper can help (usage: "
            "`supersuit.aec_wrappers.pad_action_space(env)`"
        )
        self._agent_ids = set(self.env.agents)

    def random_start(self, random_moves):
        """Replaces the board with one reached by `random_moves` random plies."""
        self.env.board = ch.Board()
        for i in range(random_moves):
            # Push a uniformly random legal move onto the board.
            self.env.board.push(np.random.choice(list(self.env.board.legal_moves)))
        return self.env.board

    def observe(self):
        """Returns the current agent's observation plus a deep-copied state."""
        return {
            self.env.agent_selection: self.env.observe(self.env.agent_selection),
            "state": self.get_state(),
        }

    def reset(self, *args, **kwargs):
        """Resets the game, optionally applying random opening moves."""
        self.env.reset()
        if self.config["random_start"] > 0:
            self.random_start(self.config["random_start"])
        # MultiAgentEnv reset contract: (obs dict, info dict), keyed by the
        # agent whose turn it is.
        return (
            {self.env.agent_selection: self.env.observe(self.env.agent_selection)},
            {self.env.agent_selection: {}},
        )

    def step(self, action):
        """Applies the acting agent's move and returns the next agent's data."""
        # Accept either {agent_id: action} dicts or a raw action value.
        try:
            self.env.step(action[self.env.agent_selection])
        except (KeyError, IndexError):
            self.env.step(action)
        except AssertionError:
            # Illegal action
            print(action)
            raise AssertionError("Illegal action")

        obs_d = {}
        rew_d = {}
        done_d = {}
        truncated_d = {}
        info_d = {}
        # AEC loop: collect `last()` for the selected agent; terminated agents
        # are stepped with None until a live agent is found (or none remain).
        while self.env.agents:
            obs, rew, done, trunc, info = self.env.last()
            a = self.env.agent_selection
            obs_d[a] = obs
            rew_d[a] = rew
            done_d[a] = done
            truncated_d[a] = trunc
            info_d[a] = info
            if self.env.terminations[self.env.agent_selection]:
                self.env.step(None)
                done_d["__all__"] = True
                truncated_d["__all__"] = True
            else:
                done_d["__all__"] = False
                truncated_d["__all__"] = False
                break

        return obs_d, rew_d, done_d, truncated_d, info_d

    def close(self):
        self.env.close()

    def seed(self, seed=None):
        self.env.seed(seed)

    def render(self, mode="human"):
        return self.env.render(mode)

    @property
    def agent_selection(self):
        # The agent whose turn it currently is.
        return self.env.agent_selection

    @property
    def get_sub_environments(self):
        return self.env.unwrapped

    def get_state(self):
        """Returns a deep copy of the wrapped env (used as a state snapshot)."""
        state = copy.deepcopy(self.env)
        return state

    def set_state(self, state):
        """Restores a snapshot produced by `get_state` and returns the obs."""
        self.env = copy.deepcopy(state)
        return self.env.observe(self.env.agent_selection)
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/multi_agent/pettingzoo_connect4.py
ADDED
|
@@ -0,0 +1,213 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import copy
|
| 2 |
+
from typing import Dict, Any
|
| 3 |
+
|
| 4 |
+
from pettingzoo import AECEnv
|
| 5 |
+
from pettingzoo.classic.connect_four_v3 import raw_env as connect_four_v3
|
| 6 |
+
|
| 7 |
+
from ray.rllib.env.multi_agent_env import MultiAgentEnv
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class MultiAgentConnect4(MultiAgentEnv):
    """An interface to the PettingZoo MARL environment library.

    See: https://github.com/Farama-Foundation/PettingZoo
    Inherits from MultiAgentEnv and exposes a given AEC
    (actor-environment-cycle) game from the PettingZoo project via the
    MultiAgentEnv public API.

    Note that the wrapper has some important limitations:
    1. All agents have the same action_spaces and observation_spaces.
       Note: If, within your aec game, agents do not have homogeneous action /
       observation spaces, apply SuperSuit wrappers
       to apply padding functionality: https://github.com/Farama-Foundation/
       SuperSuit#built-in-multi-agent-only-functions
    2. Environments are positive sum games (-> Agents are expected to cooperate
       to maximize reward). This isn't a hard restriction, it's just that
       standard algorithms aren't expected to work well in highly competitive
       games.

    .. testcode::
        :skipif: True

        from pettingzoo.butterfly import prison_v3
        from ray.rllib.env.wrappers.pettingzoo_env import PettingZooEnv
        env = PettingZooEnv(prison_v3.env())
        obs = env.reset()
        print(obs)

    .. testoutput::

        # only returns the observation for the agent which should be stepping
        {
            'prisoner_0': array([[[0, 0, 0],
                [0, 0, 0],
                [0, 0, 0],
                ...,
                [0, 0, 0],
                [0, 0, 0],
                [0, 0, 0]]], dtype=uint8)
        }

    .. testcode::
        :skipif: True

        obs, rewards, dones, infos = env.step({
            "prisoner_0": 1
        })
        # only returns the observation, reward, info, etc, for
        # the agent whose turn is next.
        print(obs)

    .. testoutput::

        {
            'prisoner_1': array([[[0, 0, 0],
                [0, 0, 0],
                [0, 0, 0],
                ...,
                [0, 0, 0],
                [0, 0, 0],
                [0, 0, 0]]], dtype=uint8)
        }

    .. testcode::
        :skipif: True

        print(rewards)

    .. testoutput::

        {
            'prisoner_1': 0
        }

    .. testcode::
        :skipif: True

        print(dones)

    .. testoutput::

        {
            'prisoner_1': False, '__all__': False
        }

    .. testcode::
        :skipif: True

        print(infos)

    .. testoutput::

        {
            'prisoner_1': {'map_tuple': (1, 0)}
        }
    """

    def __init__(
        self,
        config: Dict[Any, Any] = None,
        env: AECEnv = None,
    ):
        super().__init__()
        # Default to a fresh PettingZoo connect-four env if none is provided.
        if env is None:
            self.env = connect_four_v3()
        else:
            self.env = env
        # AEC envs must be reset before their spaces/agents can be queried.
        self.env.reset()

        self.config = config
        # Get first observation space, assuming all agents have equal space
        self.observation_space = self.env.observation_space(self.env.agents[0])

        # Get first action space, assuming all agents have equal space
        self.action_space = self.env.action_space(self.env.agents[0])

        assert all(
            self.env.observation_space(agent) == self.observation_space
            for agent in self.env.agents
        ), (
            "Observation spaces for all agents must be identical. Perhaps "
            "SuperSuit's pad_observations wrapper can help (useage: "
            "`supersuit.aec_wrappers.pad_observations(env)`"
        )

        assert all(
            self.env.action_space(agent) == self.action_space
            for agent in self.env.agents
        ), (
            "Action spaces for all agents must be identical. Perhaps "
            "SuperSuit's pad_action_space wrapper can help (usage: "
            "`supersuit.aec_wrappers.pad_action_space(env)`"
        )
        self._agent_ids = set(self.env.agents)

    def observe(self):
        """Returns the current agent's observation plus a deep-copied state."""
        return {
            self.env.agent_selection: self.env.observe(self.env.agent_selection),
            "state": self.get_state(),
        }

    def reset(self, *args, **kwargs):
        """Resets the game and returns (obs dict, info dict) for the first agent."""
        self.env.reset()
        return (
            {self.env.agent_selection: self.env.observe(self.env.agent_selection)},
            {self.env.agent_selection: {}},
        )

    def step(self, action):
        """Applies the acting agent's move and returns the next agent's data."""
        # Accept either {agent_id: action} dicts or a raw action value.
        try:
            self.env.step(action[self.env.agent_selection])
        except (KeyError, IndexError):
            self.env.step(action)
        except AssertionError:
            # Illegal action
            print(action)
            raise AssertionError("Illegal action")

        obs_d = {}
        rew_d = {}
        done_d = {}
        trunc_d = {}
        info_d = {}
        # AEC loop: collect `last()` for the selected agent; terminated agents
        # are stepped with None until a live agent is found (or none remain).
        while self.env.agents:
            obs, rew, done, trunc, info = self.env.last()
            a = self.env.agent_selection
            obs_d[a] = obs
            rew_d[a] = rew
            done_d[a] = done
            trunc_d[a] = trunc
            info_d[a] = info
            if self.env.terminations[self.env.agent_selection]:
                self.env.step(None)
                done_d["__all__"] = True
                trunc_d["__all__"] = True
            else:
                done_d["__all__"] = False
                trunc_d["__all__"] = False
                break

        return obs_d, rew_d, done_d, trunc_d, info_d

    def close(self):
        self.env.close()

    def seed(self, seed=None):
        self.env.seed(seed)

    def render(self, mode="human"):
        return self.env.render(mode)

    @property
    def agent_selection(self):
        # The agent whose turn it currently is.
        return self.env.agent_selection

    @property
    def get_sub_environments(self):
        return self.env.unwrapped

    def get_state(self):
        """Returns a deep copy of the wrapped env (used as a state snapshot)."""
        state = copy.deepcopy(self.env)
        return state

    def set_state(self, state):
        """Restores a snapshot produced by `get_state` and returns the obs."""
        self.env = copy.deepcopy(state)
        return self.env.observe(self.env.agent_selection)
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/multi_agent/rock_paper_scissors.py
ADDED
|
@@ -0,0 +1,125 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# __sphinx_doc_1_begin__
|
| 2 |
+
import gymnasium as gym
|
| 3 |
+
|
| 4 |
+
from ray.rllib.env.multi_agent_env import MultiAgentEnv
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class RockPaperScissors(MultiAgentEnv):
    """Two-player environment for the famous rock paper scissors game.

    # __sphinx_doc_1_end__
    Optionally, the "Sheldon Cooper extension" can be activated by passing
    `sheldon_cooper_mode=True` into the constructor, in which case two more moves
    are allowed: Spock and Lizard. Spock is poisoned by Lizard, disproven by Paper, but
    crushes Rock and smashes Scissors. Lizard poisons Spock and eats Paper, but is
    decapitated by Scissors and crushed by Rock.

    # __sphinx_doc_2_begin__
    Both players always move simultaneously over a course of 10 timesteps in total.
    The winner of each timestep receives reward of +1, the losing player -1.0.

    The observation of each player is the last opponent action.
    """

    ROCK = 0
    PAPER = 1
    SCISSORS = 2
    LIZARD = 3
    SPOCK = 4

    # Maps (move1, move2) -> (reward player1, reward player2).
    WIN_MATRIX = {
        (ROCK, ROCK): (0, 0),
        (ROCK, PAPER): (-1, 1),
        (ROCK, SCISSORS): (1, -1),
        (PAPER, ROCK): (1, -1),
        (PAPER, PAPER): (0, 0),
        (PAPER, SCISSORS): (-1, 1),
        (SCISSORS, ROCK): (-1, 1),
        (SCISSORS, PAPER): (1, -1),
        (SCISSORS, SCISSORS): (0, 0),
    }
    # __sphinx_doc_2_end__

    WIN_MATRIX.update(
        {
            # Sheldon Cooper mode:
            (LIZARD, LIZARD): (0, 0),
            (LIZARD, SPOCK): (1, -1),  # Lizard poisons Spock
            (LIZARD, ROCK): (-1, 1),  # Rock crushes lizard
            (LIZARD, PAPER): (1, -1),  # Lizard eats paper
            (LIZARD, SCISSORS): (-1, 1),  # Scissors decapitate lizard
            (ROCK, LIZARD): (1, -1),  # Rock crushes lizard
            (PAPER, LIZARD): (-1, 1),  # Lizard eats paper
            (SCISSORS, LIZARD): (1, -1),  # Scissors decapitate lizard
            (SPOCK, SPOCK): (0, 0),
            (SPOCK, LIZARD): (-1, 1),  # Lizard poisons Spock
            (SPOCK, ROCK): (1, -1),  # Spock vaporizes rock
            (SPOCK, PAPER): (-1, 1),  # Paper disproves Spock
            (SPOCK, SCISSORS): (1, -1),  # Spock smashes scissors
            (ROCK, SPOCK): (-1, 1),  # Spock vaporizes rock
            (PAPER, SPOCK): (1, -1),  # Paper disproves Spock
            (SCISSORS, SPOCK): (-1, 1),  # Spock smashes scissors
        }
    )

    # __sphinx_doc_3_begin__
    def __init__(self, config=None):
        super().__init__()

        # FIX: `config` defaults to None but was dereferenced via `.get()`
        # below, so `RockPaperScissors()` crashed with an AttributeError.
        config = config or {}

        self.agents = self.possible_agents = ["player1", "player2"]

        # The observations are always the last taken actions. Hence observation- and
        # action spaces are identical.
        self.observation_spaces = self.action_spaces = {
            "player1": gym.spaces.Discrete(3),
            "player2": gym.spaces.Discrete(3),
        }
        self.last_move = None
        self.num_moves = 0
        # __sphinx_doc_3_end__

        self.sheldon_cooper_mode = False
        if config.get("sheldon_cooper_mode"):
            self.sheldon_cooper_mode = True
            # Two extra moves (Lizard and Spock) -> Discrete(5) spaces.
            self.action_spaces = self.observation_spaces = {
                "player1": gym.spaces.Discrete(5),
                "player2": gym.spaces.Discrete(5),
            }

    # __sphinx_doc_4_begin__
    def reset(self, *, seed=None, options=None):
        self.num_moves = 0

        # The first observation should not matter (none of the agents has moved yet).
        # Set them to 0.
        return {
            "player1": 0,
            "player2": 0,
        }, {}  # <- empty infos dict

    # __sphinx_doc_4_end__

    # __sphinx_doc_5_begin__
    def step(self, action_dict):
        self.num_moves += 1

        move1 = action_dict["player1"]
        move2 = action_dict["player2"]

        # Set the next observations (simply use the other player's action).
        # Note that because we are publishing both players in the observations dict,
        # we expect both players to act in the next `step()` (simultaneous stepping).
        observations = {"player1": move2, "player2": move1}

        # Compute rewards for each player based on the win-matrix.
        r1, r2 = self.WIN_MATRIX[move1, move2]
        rewards = {"player1": r1, "player2": r2}

        # Terminate the entire episode (for all agents) once 10 moves have been made.
        terminateds = {"__all__": self.num_moves >= 10}

        # Leave truncateds and infos empty.
        return observations, rewards, terminateds, {}, {}

    # __sphinx_doc_5_end__
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/multi_agent/tic_tac_toe.py
ADDED
|
@@ -0,0 +1,144 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# __sphinx_doc_1_begin__
|
| 2 |
+
import gymnasium as gym
|
| 3 |
+
import numpy as np
|
| 4 |
+
|
| 5 |
+
from ray.rllib.env.multi_agent_env import MultiAgentEnv
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class TicTacToe(MultiAgentEnv):
    """A two-player game in which any player tries to complete one row in a 3x3 field.

    The observation space is Box(-1.0, 1.0, (9,)), where each index represents a
    distinct field on a 3x3 board: 0.0 means the field is empty, -1.0 that player2
    occupies it, and 1.0 that player1 occupies it:
    ----------
    | 0| 1| 2|
    ----------
    | 3| 4| 5|
    ----------
    | 6| 7| 8|
    ----------

    The action space is Discrete(9). Picking an already occupied field leaves the
    board unchanged and costs the acting player -5.0 reward.

    Completing a row yields +5.0 for the winner and -5.0 for the loser; a full
    board without a winner ends the episode with no extra reward.
    """

    # __sphinx_doc_1_end__

    # All index-triples that constitute a winning line on the board.
    _WIN_LINES = (
        (0, 1, 2),
        (3, 4, 5),
        (6, 7, 8),  # horizontals
        (0, 3, 6),
        (1, 4, 7),
        (2, 5, 8),  # verticals
        (0, 4, 8),
        (2, 4, 6),  # diagonals
    )

    # __sphinx_doc_2_begin__
    def __init__(self, config=None):
        super().__init__()

        # Define the agents in the game.
        self.agents = self.possible_agents = ["player1", "player2"]

        # Each agent observes a 9D tensor, representing the 3x3 fields of the board.
        # A 0 means an empty field, a 1 a piece of player 1, a -1 a piece of player 2.
        self.observation_spaces = {
            agent_id: gym.spaces.Box(-1.0, 1.0, (9,), np.float32)
            for agent_id in self.agents
        }
        # Nine actions per player, one per board field a piece may be placed on.
        self.action_spaces = {
            agent_id: gym.spaces.Discrete(9) for agent_id in self.agents
        }

        self.board = None
        self.current_player = None

    # __sphinx_doc_2_end__

    # __sphinx_doc_3_begin__
    def reset(self, *, seed=None, options=None):
        # Empty 3x3 board, flattened to 9 fields.
        self.board = [0] * 9
        # Pick a random player to start the game.
        self.current_player = np.random.choice(["player1", "player2"])
        # Return observations dict (only with the starting player, which is the one
        # we expect to act next).
        return {
            self.current_player: np.array(self.board, np.float32),
        }, {}

    # __sphinx_doc_3_end__

    # __sphinx_doc_4_begin__
    def step(self, action_dict):
        action = action_dict[self.current_player]

        # Rewards-dict for the agent that just acted.
        rewards = {self.current_player: 0.0}
        # Terminateds-dict with the special `__all__` agent ID: if True, the
        # episode ends for all agents.
        terminateds = {"__all__": False}

        opponent = "player2" if self.current_player == "player1" else "player1"

        if self.board[action] != 0:
            # Penalize trying to place a piece on an already occupied field.
            rewards[self.current_player] -= 5.0
        else:
            # Valid move: place the acting player's mark on the board.
            mark = 1 if self.current_player == "player1" else -1
            self.board[action] = mark

            # After having placed a new piece, figure out whether the current
            # player completed any winning line.
            if any(
                all(self.board[i] == mark for i in line) for line in self._WIN_LINES
            ):
                # Final reward is +5 for victory and -5 for a loss.
                rewards[self.current_player] += 5.0
                rewards[opponent] = -5.0
                # Episode is done and needs to be reset for a new game.
                terminateds["__all__"] = True
            # The board might also be full w/o any player having won/lost; end the
            # episode without extra rewards in that case.
            elif 0 not in self.board:
                terminateds["__all__"] = True

        # Flip players and return an observations dict with only the next player
        # to make a move in it.
        self.current_player = opponent

        return (
            {self.current_player: np.array(self.board, np.float32)},
            rewards,
            terminateds,
            {},
            {},
        )

    # __sphinx_doc_4_end__
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/multi_agent/two_step_game.py
ADDED
|
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from gymnasium.spaces import Dict, Discrete, MultiDiscrete, Tuple
|
| 2 |
+
import numpy as np
|
| 3 |
+
|
| 4 |
+
from ray.rllib.env.multi_agent_env import MultiAgentEnv, ENV_STATE
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class TwoStepGame(MultiAgentEnv):
    """Cooperative two-agent, two-step matrix game (QMIX paper).

    State machine over three one-hot states [1,0,0] -> ([0,1,0] | [0,0,1]) -> end.
    Agent 1's first action selects the second-step payoff matrix; the shared
    reward of the second step is split evenly between both agents.
    """

    action_space = Discrete(2)

    def __init__(self, env_config):
        super().__init__()
        self.action_space = Discrete(2)
        self.state = None
        self.agent_1 = 0
        self.agent_2 = 1
        # MADDPG emits action logits instead of actual discrete actions.
        self.actions_are_logits = env_config.get("actions_are_logits", False)
        self.one_hot_state_encoding = env_config.get("one_hot_state_encoding", False)
        self.with_state = env_config.get("separate_state_space", False)
        self._agent_ids = {0, 1}
        if not self.one_hot_state_encoding:
            self.observation_space = Discrete(6)
            self.with_state = False
        else:
            # Each agent gets the full state (one-hot encoding of which of the
            # three states are active) as input with the receiving agent's
            # ID (1 or 2) concatenated onto the end.
            if self.with_state:
                self.observation_space = Dict(
                    {
                        "obs": MultiDiscrete([2, 2, 2, 3]),
                        ENV_STATE: MultiDiscrete([2, 2, 2]),
                    }
                )
            else:
                self.observation_space = MultiDiscrete([2, 2, 2, 3])

    def reset(self, *, seed=None, options=None):
        if seed is not None:
            np.random.seed(seed)
        self.state = np.array([1, 0, 0])
        return self._obs(), {}

    def step(self, action_dict):
        if self.actions_are_logits:
            # Sample discrete actions from the provided per-agent logits.
            action_dict = {
                k: np.random.choice([0, 1], p=v) for k, v in action_dict.items()
            }

        # FIX: `np.flatnonzero` returns a 1-element ndarray; comparing/branching
        # on it as a scalar is deprecated in modern NumPy. Extract a plain int.
        state_index = int(np.flatnonzero(self.state)[0])
        if state_index == 0:
            # First step: agent 1's action picks the second-step payoff matrix.
            action = action_dict[self.agent_1]
            assert action in [0, 1], action
            if action == 0:
                self.state = np.array([0, 1, 0])
            else:
                self.state = np.array([0, 0, 1])
            global_rew = 0
            terminated = False
        elif state_index == 1:
            # Second step, easy branch: fixed payoff regardless of actions.
            global_rew = 7
            terminated = True
        else:
            # Second step, coordination branch: payoff depends on joint action.
            if action_dict[self.agent_1] == 0 and action_dict[self.agent_2] == 0:
                global_rew = 0
            elif action_dict[self.agent_1] == 1 and action_dict[self.agent_2] == 1:
                global_rew = 8
            else:
                global_rew = 1
            terminated = True

        # Split the shared reward evenly between both agents.
        rewards = {self.agent_1: global_rew / 2.0, self.agent_2: global_rew / 2.0}
        obs = self._obs()
        terminateds = {"__all__": terminated}
        truncateds = {"__all__": False}
        infos = {
            self.agent_1: {"done": terminateds["__all__"]},
            self.agent_2: {"done": terminateds["__all__"]},
        }
        return obs, rewards, terminateds, truncateds, infos

    def _obs(self):
        """Build the per-agent observations dict for the current state."""
        if self.with_state:
            return {
                self.agent_1: {"obs": self.agent_1_obs(), ENV_STATE: self.state},
                self.agent_2: {"obs": self.agent_2_obs(), ENV_STATE: self.state},
            }
        else:
            return {self.agent_1: self.agent_1_obs(), self.agent_2: self.agent_2_obs()}

    def agent_1_obs(self):
        """Agent 1's observation: one-hot state + ID tag, or a flat state index."""
        if self.one_hot_state_encoding:
            return np.concatenate([self.state, [1]])
        else:
            return np.flatnonzero(self.state)[0]

    def agent_2_obs(self):
        """Agent 2's observation: one-hot state + ID tag, or state index + 3."""
        if self.one_hot_state_encoding:
            return np.concatenate([self.state, [2]])
        else:
            return np.flatnonzero(self.state)[0] + 3
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
class TwoStepGameWithGroupedAgents(MultiAgentEnv):
    """`TwoStepGame` with both agents merged into one "agents" group.

    Observations and actions of the group are tuples holding the two
    individual agents' observations/actions.
    """

    def __init__(self, env_config):
        super().__init__()
        inner = TwoStepGame(env_config)
        grouped_obs_space = Tuple([inner.observation_space, inner.observation_space])
        grouped_act_space = Tuple([inner.action_space, inner.action_space])
        self._agent_ids = {"agents"}
        self.env = inner.with_agent_groups(
            groups={"agents": [0, 1]},
            obs_space=grouped_obs_space,
            act_space=grouped_act_space,
        )
        self.observation_space = Dict({"agents": self.env.observation_space})
        self.action_space = Dict({"agents": self.env.action_space})

    def reset(self, *, seed=None, options=None):
        """Delegate to the grouped env."""
        return self.env.reset(seed=seed, options=options)

    def step(self, actions):
        """Delegate to the grouped env."""
        return self.env.step(actions)
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/nested_space_repeat_after_me_env.py
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gymnasium as gym
|
| 2 |
+
from gymnasium.spaces import Box, Dict, Discrete, Tuple
|
| 3 |
+
import numpy as np
|
| 4 |
+
import tree # pip install dm_tree
|
| 5 |
+
|
| 6 |
+
from ray.rllib.utils.spaces.space_utils import flatten_space
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class NestedSpaceRepeatAfterMeEnv(gym.Env):
    """Env for which policy has to repeat the (possibly complex) observation.

    The action space and observation spaces are always the same and may be
    arbitrarily nested Dict/Tuple Spaces.
    Rewards are given for exactly matching Discrete sub-actions and for being
    as close as possible for Box sub-actions.
    """

    def __init__(self, config=None):
        cfg = config or {}
        self.observation_space = cfg.get(
            "space", Tuple([Discrete(2), Dict({"a": Box(-1.0, 1.0, (2,))})])
        )
        # Action space mirrors the observation space exactly.
        self.action_space = self.observation_space
        self.flattened_action_space = flatten_space(self.action_space)
        self.episode_len = cfg.get("episode_len", 100)

    def reset(self, *, seed=None, options=None):
        self.steps = 0
        return self._next_obs(), {}

    def step(self, action):
        self.steps += 1
        flat_action = tree.flatten(action)
        reward = 0.0
        # Walk the flattened action/observation/space triplets in lockstep.
        for act, obs, sub_space in zip(
            flat_action, self.current_obs_flattened, self.flattened_action_space
        ):
            # Box: -abs(diff).
            if isinstance(sub_space, gym.spaces.Box):
                reward -= np.sum(np.abs(act - obs))
            # Discrete: +1.0 if exact match.
            if isinstance(sub_space, gym.spaces.Discrete):
                reward += 1.0 if act == obs else 0.0
        truncated = self.steps >= self.episode_len
        return self._next_obs(), reward, False, truncated, {}

    def _next_obs(self):
        """Sample, remember (also flattened), and return a fresh observation."""
        self.current_obs = self.observation_space.sample()
        self.current_obs_flattened = tree.flatten(self.current_obs)
        return self.current_obs
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/parametric_actions_cartpole.py
ADDED
|
@@ -0,0 +1,145 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import random
|
| 2 |
+
|
| 3 |
+
import gymnasium as gym
|
| 4 |
+
import numpy as np
|
| 5 |
+
from gymnasium.spaces import Box, Dict, Discrete
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class ParametricActionsCartPole(gym.Env):
    """Parametric action version of CartPole.

    In this env there are only ever two valid actions, but we pretend there are
    actually up to `max_avail_actions` actions that can be taken, and the two
    valid actions are randomly hidden among this set.

    At each step, we emit a dict of:
        - the actual cart observation
        - a mask of valid actions (e.g., [0, 0, 1, 0, 0, 1] for 6 max avail)
        - the list of action embeddings (w/ zeroes for invalid actions) (e.g.,
            [[0, 0],
             [0, 0],
             [-0.2322, -0.2569],
             [0, 0],
             [0, 0],
             [0.7878, 1.2297]] for max_avail_actions=6)

    In a real environment, the actions embeddings would be larger than two
    units of course, and also there would be a variable number of valid actions
    per step instead of always [LEFT, RIGHT].
    """

    def __init__(self, max_avail_actions):
        # Use simple random 2-unit action embeddings for [LEFT, RIGHT].
        self.left_action_embed = np.random.randn(2)
        self.right_action_embed = np.random.randn(2)
        self.action_space = Discrete(max_avail_actions)
        self.wrapped = gym.make("CartPole-v1")
        self.observation_space = Dict(
            {
                "action_mask": Box(0, 1, shape=(max_avail_actions,), dtype=np.int8),
                "avail_actions": Box(-10, 10, shape=(max_avail_actions, 2)),
                "cart": self.wrapped.observation_space,
            }
        )

    def update_avail_actions(self):
        """Re-hide the two valid actions at fresh random slot indices."""
        self.action_assignments = np.array(
            [[0.0, 0.0]] * self.action_space.n, dtype=np.float32
        )
        self.action_mask = np.array([0.0] * self.action_space.n, dtype=np.int8)
        # NOTE(review): uses the global `random` module RNG; this shuffle is not
        # governed by the seed passed to `reset()`.
        self.left_idx, self.right_idx = random.sample(range(self.action_space.n), 2)
        self.action_assignments[self.left_idx] = self.left_action_embed
        self.action_assignments[self.right_idx] = self.right_action_embed
        self.action_mask[self.left_idx] = 1
        self.action_mask[self.right_idx] = 1

    def reset(self, *, seed=None, options=None):
        self.update_avail_actions()
        # FIX: forward `seed` and `options` to the wrapped CartPole env; they
        # were previously accepted but silently dropped, making seeded resets
        # non-reproducible.
        obs, infos = self.wrapped.reset(seed=seed, options=options)
        return {
            "action_mask": self.action_mask,
            "avail_actions": self.action_assignments,
            "cart": obs,
        }, infos

    def step(self, action):
        # Translate the parametric action index back into CartPole's 0/1 action.
        if action == self.left_idx:
            actual_action = 0
        elif action == self.right_idx:
            actual_action = 1
        else:
            raise ValueError(
                "Chosen action was not one of the non-zero action embeddings",
                action,
                self.action_assignments,
                self.action_mask,
                self.left_idx,
                self.right_idx,
            )
        orig_obs, rew, done, truncated, info = self.wrapped.step(actual_action)
        self.update_avail_actions()
        self.action_mask = self.action_mask.astype(np.int8)
        obs = {
            "action_mask": self.action_mask,
            "avail_actions": self.action_assignments,
            "cart": orig_obs,
        }
        return obs, rew, done, truncated, info
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
class ParametricActionsCartPoleNoEmbeddings(gym.Env):
    """Same as the above ParametricActionsCartPole.

    However, action embeddings are not published inside observations,
    but will be learnt by the model.

    At each step, we emit a dict of:
        - the actual cart observation
        - a mask of valid actions (e.g., [0, 0, 1, 0, 0, 1] for 6 max avail)
        - action embeddings (w/ "dummy embedding" for invalid actions) are
          outsourced in the model and will be learned.
    """

    def __init__(self, max_avail_actions):
        # Randomly set which two actions are valid and available.
        # NOTE(review): chosen once at construction via the global `random` RNG
        # and never re-randomized between episodes.
        self.left_idx, self.right_idx = random.sample(range(max_avail_actions), 2)
        self.valid_avail_actions_mask = np.array(
            [0.0] * max_avail_actions, dtype=np.int8
        )
        self.valid_avail_actions_mask[self.left_idx] = 1
        self.valid_avail_actions_mask[self.right_idx] = 1
        self.action_space = Discrete(max_avail_actions)
        self.wrapped = gym.make("CartPole-v1")
        self.observation_space = Dict(
            {
                "valid_avail_actions_mask": Box(0, 1, shape=(max_avail_actions,)),
                "cart": self.wrapped.observation_space,
            }
        )

    def reset(self, *, seed=None, options=None):
        # FIX: forward `seed` and `options` to the wrapped CartPole env; they
        # were previously accepted but silently dropped.
        obs, infos = self.wrapped.reset(seed=seed, options=options)
        return {
            "valid_avail_actions_mask": self.valid_avail_actions_mask,
            "cart": obs,
        }, infos

    def step(self, action):
        # Translate the parametric action index back into CartPole's 0/1 action.
        if action == self.left_idx:
            actual_action = 0
        elif action == self.right_idx:
            actual_action = 1
        else:
            raise ValueError(
                "Chosen action was not one of the non-zero action embeddings",
                action,
                self.valid_avail_actions_mask,
                self.left_idx,
                self.right_idx,
            )
        orig_obs, rew, done, truncated, info = self.wrapped.step(actual_action)
        obs = {
            "valid_avail_actions_mask": self.valid_avail_actions_mask,
            "cart": orig_obs,
        }
        return obs, rew, done, truncated, info
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/random_env.py
ADDED
|
@@ -0,0 +1,125 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import copy
|
| 2 |
+
import gymnasium as gym
|
| 3 |
+
from gymnasium.spaces import Discrete, Tuple
|
| 4 |
+
import numpy as np
|
| 5 |
+
|
| 6 |
+
from ray.rllib.examples.envs.classes.multi_agent import make_multi_agent
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class RandomEnv(gym.Env):
    """A randomly acting environment.

    Can be instantiated with arbitrary action-, observation-, and reward
    spaces. Observations and rewards are generated by simply sampling from the
    observation/reward spaces. The probability of a `terminated=True` after each
    action can be configured, as well as the max episode length.
    """

    def __init__(self, config=None):
        config = config or {}

        # Action space.
        self.action_space = config.get("action_space", Discrete(2))
        # Observation space from which to sample.
        self.observation_space = config.get("observation_space", Discrete(2))
        # Reward space from which to sample.
        self.reward_space = config.get(
            "reward_space",
            gym.spaces.Box(low=-1.0, high=1.0, shape=(), dtype=np.float32),
        )
        # If True, sample obs/reward once and return (deep) copies forever after.
        self.static_samples = config.get("static_samples", False)
        if self.static_samples:
            self.observation_sample = self.observation_space.sample()
            self.reward_sample = self.reward_space.sample()

        # Chance that an episode ends at any step.
        # Note that a max episode length can be specified via `max_episode_len`.
        self.p_terminated = config.get("p_terminated")
        if self.p_terminated is None:
            # Backward compat: older configs used the key `p_done`.
            self.p_terminated = config.get("p_done", 0.1)
        # A max episode length. Even if the `p_terminated` sampling does not lead
        # to a terminus, the episode will end after at most this many timesteps.
        # Set to 0 or None for using no limit on the episode length.
        self.max_episode_len = config.get("max_episode_len", None)
        # Whether to check action bounds.
        self.check_action_bounds = config.get("check_action_bounds", False)
        # Steps taken so far (after last reset).
        self.steps = 0

    def reset(self, *, seed=None, options=None):
        # FIX: `seed` was previously accepted but silently ignored. Seed the
        # sampling spaces so observation/reward streams become reproducible.
        # NOTE(review): the `p_terminated` coin flip in `step()` still uses
        # NumPy's global RNG and is not covered by this seed.
        if seed is not None:
            self.observation_space.seed(seed)
            self.reward_space.seed(seed)
        self.steps = 0
        if not self.static_samples:
            return self.observation_space.sample(), {}
        else:
            return copy.deepcopy(self.observation_sample), {}

    def step(self, action):
        if self.check_action_bounds and not self.action_space.contains(action):
            raise ValueError(
                "Illegal action for {}: {}".format(self.action_space, action)
            )
        if isinstance(self.action_space, Tuple) and len(action) != len(
            self.action_space.spaces
        ):
            raise ValueError(
                "Illegal action for {}: {}".format(self.action_space, action)
            )

        self.steps += 1
        terminated = False
        truncated = False
        # We are `truncated` as per our max-episode-len.
        if self.max_episode_len and self.steps >= self.max_episode_len:
            truncated = True
        # Max episode length not reached yet -> Sample `terminated` via `p_terminated`.
        elif self.p_terminated > 0.0:
            terminated = bool(
                np.random.choice(
                    [True, False], p=[self.p_terminated, 1.0 - self.p_terminated]
                )
            )

        if not self.static_samples:
            return (
                self.observation_space.sample(),
                self.reward_space.sample(),
                terminated,
                truncated,
                {},
            )
        else:
            return (
                copy.deepcopy(self.observation_sample),
                copy.deepcopy(self.reward_sample),
                terminated,
                truncated,
                {},
            )
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
# Multi-agent version of the RandomEnv.
RandomMultiAgentEnv = make_multi_agent(lambda config: RandomEnv(config))
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
# Large observation space "pre-compiled" random env (for testing).
|
| 107 |
+
class RandomLargeObsSpaceEnv(RandomEnv):
    """RandomEnv preset with a large (5000-dim float) observation space."""

    def __init__(self, config=None):
        # FIX: copy first - the original code mutated the caller's `config`
        # dict in place via `config.update()`.
        config = dict(config or {})
        config["observation_space"] = gym.spaces.Box(-1.0, 1.0, (5000,))
        super().__init__(config=config)
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
# Large observation space + cont. actions "pre-compiled" random env
# (for testing).
class RandomLargeObsSpaceEnvContActions(RandomEnv):
    """RandomEnv preset with a large Box obs space and continuous actions."""

    def __init__(self, config=None):
        cfg = config or {}
        # Force the large obs space and a 5-dim continuous action space.
        cfg["observation_space"] = gym.spaces.Box(-1.0, 1.0, (5000,))
        cfg["action_space"] = gym.spaces.Box(-1.0, 1.0, (5,))
        super().__init__(config=cfg)
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/recommender_system_envs_with_recsim.py
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Examples for RecSim envs ready to be used by RLlib Algorithms.
|
| 2 |
+
|
| 3 |
+
RecSim is a configurable recommender systems simulation platform.
|
| 4 |
+
Source: https://github.com/google-research/recsim
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from recsim import choice_model
|
| 8 |
+
from recsim.environments import (
|
| 9 |
+
long_term_satisfaction as lts,
|
| 10 |
+
interest_evolution as iev,
|
| 11 |
+
interest_exploration as iex,
|
| 12 |
+
)
|
| 13 |
+
|
| 14 |
+
from ray.rllib.env.wrappers.recsim import make_recsim_env
|
| 15 |
+
from ray.tune import register_env
|
| 16 |
+
|
| 17 |
+
# Some built-in RecSim envs to test with.
|
| 18 |
+
# ---------------------------------------
|
| 19 |
+
|
| 20 |
+
# Long-term satisfaction env: User has to pick from items that are either
|
| 21 |
+
# a) unhealthy, but taste good, or b) healthy, but have bad taste.
|
| 22 |
+
# Best strategy is to pick a mix of both to ensure long-term
|
| 23 |
+
# engagement.
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def lts_user_model_creator(env_ctx):
    """Build the LTS user model from an env-config dict (needs "slate_size")."""
    slate_size = env_ctx["slate_size"]
    return lts.LTSUserModel(
        slate_size,
        user_state_ctor=lts.LTSUserState,
        response_model_ctor=lts.LTSResponse,
    )
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def lts_document_sampler_creator(env_ctx):
    """Build the document sampler for the long-term-satisfaction env.

    The env config `env_ctx` is accepted for interface uniformity with the
    other creator functions but is not used here.
    """
    sampler = lts.LTSDocumentSampler()
    return sampler
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
# RLlib-ready "long-term satisfaction" RecSim env, assembled from the LTS
# creator functions via `make_recsim_env`; reward = clicked engagement.
LongTermSatisfactionRecSimEnv = make_recsim_env(
    recsim_user_model_creator=lts_user_model_creator,
    recsim_document_sampler_creator=lts_document_sampler_creator,
    reward_aggregator=lts.clicked_engagement_reward,
)
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
# Interest exploration env: Models the problem of active exploration
# of user interests. It is meant to illustrate popularity bias in
# recommender systems, where myopic maximization of engagement leads
# to bias towards documents that have wider appeal,
# whereas niche user interests remain unexplored.
def iex_user_model_creator(env_ctx):
    """Build the IE user model from an env config ("slate_size", "seed")."""
    slate_size = env_ctx["slate_size"]
    seed = env_ctx["seed"]
    return iex.IEUserModel(
        slate_size,
        user_state_ctor=iex.IEUserState,
        response_model_ctor=iex.IEResponse,
        seed=seed,
    )
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
def iex_document_sampler_creator(env_ctx):
    """Build the topic-based document sampler, seeded from the env config."""
    seed = env_ctx["seed"]
    return iex.IETopicDocumentSampler(seed=seed)
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
# RLlib-ready "interest exploration" RecSim env; reward = total clicks.
InterestExplorationRecSimEnv = make_recsim_env(
    recsim_user_model_creator=iex_user_model_creator,
    recsim_document_sampler_creator=iex_document_sampler_creator,
    reward_aggregator=iex.total_clicks_reward,
)
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
# Interest evolution env: See https://github.com/google-research/recsim
# for more information.
def iev_user_model_creator(env_ctx):
    """Build the IEv user model (multinomial-proportional choice model)."""
    slate_size = env_ctx["slate_size"]
    seed = env_ctx["seed"]
    return iev.IEvUserModel(
        slate_size,
        choice_model_ctor=choice_model.MultinomialProportionalChoiceModel,
        response_model_ctor=iev.IEvResponse,
        user_state_ctor=iev.IEvUserState,
        seed=seed,
    )
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
# Extend IEvVideo to fix a bug caused by None cluster_ids.
class SingleClusterIEvVideo(iev.IEvVideo):
    """IEvVideo variant that pins every document to cluster 0.

    Works around the upstream `IEvVideo` issue triggered by `None`
    cluster ids by always passing a fixed, single cluster id.
    """

    def __init__(self, doc_id, features, video_length=None, quality=None):
        # Use the Python-3 zero-argument `super()` form (the explicit
        # `super(SingleClusterIEvVideo, self)` spelling is legacy).
        super().__init__(
            doc_id=doc_id,
            features=features,
            cluster_id=0,  # single cluster.
            video_length=video_length,
            quality=quality,
        )
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
def iev_document_sampler_creator(env_ctx):
    """Build the utility-model video sampler, seeded from the env config."""
    seed = env_ctx["seed"]
    return iev.UtilityModelVideoSampler(doc_ctor=iev.IEvVideo, seed=seed)
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
# RLlib-ready "interest evolution" RecSim env; reward = clicked watch time.
InterestEvolutionRecSimEnv = make_recsim_env(
    recsim_user_model_creator=iev_user_model_creator,
    recsim_document_sampler_creator=iev_document_sampler_creator,
    reward_aggregator=iev.clicked_watchtime_reward,
)
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
# Backward compatibility: keep the old "RecSim-v1" Tune registration name
# pointing at the interest-evolution env.
register_env(
    name="RecSim-v1", env_creator=lambda env_ctx: InterestEvolutionRecSimEnv(env_ctx)
)
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/repeat_after_me_env.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gymnasium as gym
|
| 2 |
+
from gymnasium.spaces import Box, Discrete
|
| 3 |
+
import numpy as np
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class RepeatAfterMeEnv(gym.Env):
    """Env in which the observation at timestep minus n must be repeated.

    The agent sees a token each step and must output the token it saw
    `repeat_delay` steps earlier (0 = repeat the current token).
    """

    def __init__(self, config=None):
        config = config or {}
        # Continuous mode uses a 2-dim Box, otherwise a binary Discrete space.
        self.observation_space = (
            Box(-1.0, 1.0, (2,)) if config.get("continuous") else Discrete(2)
        )
        # Actions mirror observations: the agent answers in the same space.
        self.action_space = self.observation_space
        # Note: Set `repeat_delay` to 0 for simply repeating the seen
        # observation (no delay).
        self.delay = config.get("repeat_delay", 1)
        self.episode_len = config.get("episode_len", 100)
        self.history = []

    def reset(self, *, seed=None, options=None):
        # Pre-fill the history with zeros so a repeat target exists
        # right from the first step.
        self.history = [0] * self.delay
        return self._next_obs(), {}

    def step(self, action):
        # The token the agent was supposed to repeat (`delay` steps back).
        target = self.history[-(1 + self.delay)]

        reward = 0.0
        if isinstance(self.action_space, Box):
            # Box: -abs(diff).
            reward = -np.sum(np.abs(action - target))
        elif isinstance(self.action_space, Discrete):
            # Discrete: +1.0 if exact match, -1.0 otherwise.
            reward = 1.0 if action == target else -1.0

        terminated = truncated = len(self.history) > self.episode_len
        return self._next_obs(), reward, terminated, truncated, {}

    def _next_obs(self):
        """Sample a new token, record it in the history, and return it."""
        if isinstance(self.observation_space, Box):
            new_token = np.random.random(size=(2,))
        else:
            new_token = np.random.choice([0, 1])
        self.history.append(new_token)
        return new_token
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/repeat_initial_obs_env.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gymnasium as gym
|
| 2 |
+
from gymnasium.spaces import Discrete
|
| 3 |
+
import random
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class RepeatInitialObsEnv(gym.Env):
    """Env in which the initial observation has to be repeated all the time.

    Runs for n steps.
    r=1 if action correct, -1 otherwise (max. R=100).
    """

    def __init__(self, episode_len=100):
        self.observation_space = Discrete(2)
        self.action_space = Discrete(2)
        # The token drawn at reset time that must be repeated.
        self.token = None
        self.episode_len = episode_len
        self.num_steps = 0

    def reset(self, *, seed=None, options=None):
        # Draw the token the agent must keep repeating this episode.
        self.token = random.choice([0, 1])
        self.num_steps = 0
        return self.token, {}

    def step(self, action):
        # +1 for matching the initial token, -1 otherwise.
        reward = 1 if action == self.token else -1
        self.num_steps += 1
        terminated = truncated = self.num_steps >= self.episode_len
        # After the reset observation, the agent always observes 0.
        return 0, reward, terminated, truncated, {}
|
.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/simple_corridor.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gymnasium as gym
|
| 2 |
+
from gymnasium.spaces import Box, Discrete
|
| 3 |
+
import numpy as np
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class SimpleCorridor(gym.Env):
    """Example of a custom env in which you have to walk down a corridor.

    You can configure the length of the corridor via the env config.
    Action 0 moves left (clipped at position 0), action 1 moves right;
    reaching `end_pos` terminates the episode with reward 1.0, every
    other step costs -0.01.
    """

    def __init__(self, config=None):
        config = config or {}
        self.action_space = Discrete(2)
        self.observation_space = Box(0.0, 999.0, shape=(1,), dtype=np.float32)
        self.set_corridor_length(config.get("corridor_length", 10))
        self._cur_pos = 0

    def set_corridor_length(self, length):
        """Set the corridor's end (goal) position to `length` (max 999)."""
        self.end_pos = length
        print(f"Set corridor length to {self.end_pos}")
        assert self.end_pos <= 999, "The maximum `corridor_length` allowed is 999!"

    def reset(self, *, seed=None, options=None):
        self._cur_pos = 0.0
        return self._get_obs(), {}

    def step(self, action):
        assert action in [0, 1], action
        # 1 = walk right; 0 = walk left (no-op when already at the start).
        if action == 1:
            self._cur_pos += 1.0
        elif self._cur_pos > 0:
            self._cur_pos -= 1.0
        terminated = self._cur_pos >= self.end_pos
        # Small per-step penalty, +1.0 on reaching the corridor's end.
        reward = 1.0 if terminated else -0.01
        return self._get_obs(), reward, terminated, False, {}

    def _get_obs(self):
        """Return the current position as a 1-dim float32 array."""
        return np.array([self._cur_pos], np.float32)
|