diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..56d9db36f3328edaede6e2d16b70ffe343bb9a1b Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/__pycache__/utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/__pycache__/utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..99f853cceaf3384012edec5959d10dd3748d205b Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/__pycache__/utils.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/__pycache__/vtrace_torch.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/__pycache__/vtrace_torch.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..699fa476eacc561b5e114ef158d0dbb5280fef27 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/__pycache__/vtrace_torch.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/torch/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/torch/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/torch/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/torch/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..4c3270e6ba6fc4c3d8e77f7fbf841774a3b7e230 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/torch/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/torch/__pycache__/impala_torch_learner.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/torch/__pycache__/impala_torch_learner.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0e2a3e60f7a2ebea7f8e97803a4619697b5284cc Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/torch/__pycache__/impala_torch_learner.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/torch/__pycache__/vtrace_torch_v2.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/torch/__pycache__/vtrace_torch_v2.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5e1c120d6fb3471431e7d711b5b628cbde4ce1b7 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/torch/__pycache__/vtrace_torch_v2.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/torch/impala_torch_learner.py b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/torch/impala_torch_learner.py new file mode 100644 index 0000000000000000000000000000000000000000..256e3b48fb79f217ec66a9a9fdf7ec7a9a0ca6ae --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/torch/impala_torch_learner.py @@ -0,0 +1,164 @@ +from typing import Dict + +from ray.rllib.algorithms.impala.impala import IMPALAConfig +from ray.rllib.algorithms.impala.impala_learner import IMPALALearner +from ray.rllib.algorithms.impala.torch.vtrace_torch_v2 import ( + vtrace_torch, + make_time_major, +) +from ray.rllib.core.columns import Columns +from 
ray.rllib.core.learner.learner import ENTROPY_KEY +from ray.rllib.core.learner.torch.torch_learner import TorchLearner +from ray.rllib.utils.annotations import override +from ray.rllib.utils.framework import try_import_torch +from ray.rllib.utils.typing import ModuleID, TensorType + +torch, nn = try_import_torch() + + +class IMPALATorchLearner(IMPALALearner, TorchLearner): + """Implements the IMPALA loss function in torch.""" + + @override(TorchLearner) + def compute_loss_for_module( + self, + *, + module_id: ModuleID, + config: IMPALAConfig, + batch: Dict, + fwd_out: Dict[str, TensorType], + ) -> TensorType: + module = self.module[module_id].unwrapped() + + # TODO (sven): Now that we do the +1ts trick to be less vulnerable about + # bootstrap values at the end of rollouts in the new stack, we might make + # this a more flexible, configurable parameter for users, e.g. + # `v_trace_seq_len` (independent of `rollout_fragment_length`). Separation + # of concerns (sampling vs learning). + rollout_frag_or_episode_len = config.get_rollout_fragment_length() + recurrent_seq_len = batch.get("seq_lens") + + loss_mask = batch[Columns.LOSS_MASK].float() + loss_mask_time_major = make_time_major( + loss_mask, + trajectory_len=rollout_frag_or_episode_len, + recurrent_seq_len=recurrent_seq_len, + ) + size_loss_mask = torch.sum(loss_mask) + + # Behavior actions logp and target actions logp. + behaviour_actions_logp = batch[Columns.ACTION_LOGP] + target_policy_dist = module.get_train_action_dist_cls().from_logits( + fwd_out[Columns.ACTION_DIST_INPUTS] + ) + target_actions_logp = target_policy_dist.logp(batch[Columns.ACTIONS]) + + # Values and bootstrap values. 
+ values = module.compute_values( + batch, embeddings=fwd_out.get(Columns.EMBEDDINGS) + ) + values_time_major = make_time_major( + values, + trajectory_len=rollout_frag_or_episode_len, + recurrent_seq_len=recurrent_seq_len, + ) + assert Columns.VALUES_BOOTSTRAPPED not in batch + # Use as bootstrap values the vf-preds in the next "batch row", except + # for the very last row (which doesn't have a next row), for which the + # bootstrap value does not matter b/c it has a +1ts value at its end + # anyways. So we chose an arbitrary item (for simplicity of not having to + # move new data to the device). + bootstrap_values = torch.cat( + [ + values_time_major[0][1:], # 0th ts values from "next row" + values_time_major[0][0:1], # <- can use any arbitrary value here + ], + dim=0, + ) + + # TODO(Artur): In the old impala code, actions were unsqueezed if they were + # multi_discrete. Find out why and if we need to do the same here. + # actions = actions if is_multidiscrete else torch.unsqueeze(actions, dim=1) + target_actions_logp_time_major = make_time_major( + target_actions_logp, + trajectory_len=rollout_frag_or_episode_len, + recurrent_seq_len=recurrent_seq_len, + ) + behaviour_actions_logp_time_major = make_time_major( + behaviour_actions_logp, + trajectory_len=rollout_frag_or_episode_len, + recurrent_seq_len=recurrent_seq_len, + ) + rewards_time_major = make_time_major( + batch[Columns.REWARDS], + trajectory_len=rollout_frag_or_episode_len, + recurrent_seq_len=recurrent_seq_len, + ) + + # the discount factor that is used should be gamma except for timesteps where + # the episode is terminated. In that case, the discount factor should be 0. + discounts_time_major = ( + 1.0 + - make_time_major( + batch[Columns.TERMINATEDS], + trajectory_len=rollout_frag_or_episode_len, + recurrent_seq_len=recurrent_seq_len, + ).type(dtype=torch.float32) + ) * config.gamma + + # Note that vtrace will compute the main loop on the CPU for better performance. 
+ vtrace_adjusted_target_values, pg_advantages = vtrace_torch( + target_action_log_probs=target_actions_logp_time_major, + behaviour_action_log_probs=behaviour_actions_logp_time_major, + discounts=discounts_time_major, + rewards=rewards_time_major, + values=values_time_major, + bootstrap_values=bootstrap_values, + clip_rho_threshold=config.vtrace_clip_rho_threshold, + clip_pg_rho_threshold=config.vtrace_clip_pg_rho_threshold, + ) + + # The policy gradients loss. + pi_loss = -torch.sum( + target_actions_logp_time_major * pg_advantages * loss_mask_time_major + ) + mean_pi_loss = pi_loss / size_loss_mask + + # The baseline loss. + delta = values_time_major - vtrace_adjusted_target_values + vf_loss = 0.5 * torch.sum(torch.pow(delta, 2.0) * loss_mask_time_major) + mean_vf_loss = vf_loss / size_loss_mask + + # The entropy loss. + entropy_loss = -torch.sum(target_policy_dist.entropy() * loss_mask) + mean_entropy_loss = entropy_loss / size_loss_mask + + # The summed weighted loss. + total_loss = ( + mean_pi_loss + + mean_vf_loss * config.vf_loss_coeff + + ( + mean_entropy_loss + * self.entropy_coeff_schedulers_per_module[ + module_id + ].get_current_value() + ) + ) + + # Log important loss stats. + self.metrics.log_dict( + { + "pi_loss": pi_loss, + "mean_pi_loss": mean_pi_loss, + "vf_loss": vf_loss, + "mean_vf_loss": mean_vf_loss, + ENTROPY_KEY: -mean_entropy_loss, + }, + key=module_id, + window=1, # <- single items (should not be mean/ema-reduced over time). + ) + # Return the total loss. 
+ return total_loss + + +ImpalaTorchLearner = IMPALATorchLearner diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/torch/vtrace_torch_v2.py b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/torch/vtrace_torch_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..2bd40786b3a291e45c139ff93750506ed314ff71 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/algorithms/impala/torch/vtrace_torch_v2.py @@ -0,0 +1,168 @@ +from typing import List, Union +from ray.rllib.utils.framework import try_import_torch + +torch, nn = try_import_torch() + + +def make_time_major( + tensor: Union["torch.Tensor", List["torch.Tensor"]], + *, + trajectory_len: int = None, + recurrent_seq_len: int = None, +): + """Swaps batch and trajectory axis. + + Args: + tensor: A tensor or list of tensors to swap the axis of. + NOTE: Each tensor must have the shape [B * T] where B is the batch size and + T is the trajectory length. + trajectory_len: The length of each trajectory being transformed. + If None then `recurrent_seq_len` must be set. + recurrent_seq_len: Sequence lengths if recurrent. + If None then `trajectory_len` must be set. + + Returns: + res: A tensor with swapped axes or a list of tensors with + swapped axes. + """ + if isinstance(tensor, (list, tuple)): + return [ + make_time_major(_tensor, trajectory_len, recurrent_seq_len) + for _tensor in tensor + ] + + assert ( + trajectory_len is not None or recurrent_seq_len is not None + ), "Either trajectory_len or recurrent_seq_len must be set." + + # Figure out the sizes of the final B and T axes. + if recurrent_seq_len is not None: + assert len(tensor.shape) == 2 + # Swap B and T axes. + tensor = torch.transpose(tensor, 1, 0) + return tensor + else: + T = trajectory_len + # Zero-pad, if necessary. 
+ tensor_0 = tensor.shape[0] + B = tensor_0 // T + if B != (tensor_0 / T): + assert len(tensor.shape) == 1 + tensor = torch.cat( + [ + tensor, + torch.zeros( + trajectory_len - tensor_0 % T, + dtype=tensor.dtype, + device=tensor.device, + ), + ] + ) + B += 1 + + # Reshape tensor (break up B axis into 2 axes: B and T). + tensor = torch.reshape(tensor, [B, T] + list(tensor.shape[1:])) + + # Swap B and T axes. + tensor = torch.transpose(tensor, 1, 0) + + return tensor + + +def vtrace_torch( + *, + target_action_log_probs: "torch.Tensor", + behaviour_action_log_probs: "torch.Tensor", + discounts: "torch.Tensor", + rewards: "torch.Tensor", + values: "torch.Tensor", + bootstrap_values: "torch.Tensor", + clip_rho_threshold: Union[float, "torch.Tensor"] = 1.0, + clip_pg_rho_threshold: Union[float, "torch.Tensor"] = 1.0, +): + """V-trace for softmax policies implemented with torch. + + Calculates V-trace actor critic targets for softmax polices as described in + "IMPALA: Scalable Distributed Deep-RL with Importance Weighted Actor-Learner + Architectures" by Espeholt, Soyer, Munos et al. (https://arxiv.org/abs/1802.01561) + + The V-trace implementation used here closely resembles the one found in the + scalable-agent repository by Google DeepMind, available at + https://github.com/deepmind/scalable_agent. This version has been optimized to + minimize the number of floating-point operations required per V-Trace + calculation, achieved through the use of dynamic programming techniques. It's + important to note that the mathematical expressions used in this implementation + may appear quite different from those presented in the IMPALA paper. + + The following terminology applies: + - `target policy` refers to the policy we are interested in improving. + - `behaviour policy` refers to the policy that generated the given + rewards and actions. + - `T` refers to the time dimension. This is usually either the length of the + trajectory or the length of the sequence if recurrent. 
+ - `B` refers to the batch size. + + Args: + target_action_log_probs: Action log probs from the target policy. A float32 + tensor of shape [T, B]. + behaviour_action_log_probs: Action log probs from the behaviour policy. A + float32 tensor of shape [T, B]. + discounts: A float32 tensor of shape [T, B] with the discount encountered when + following the behaviour policy. This will be 0 for terminal timesteps + (done=True) and gamma (the discount factor) otherwise. + rewards: A float32 tensor of shape [T, B] with the rewards generated by + following the behaviour policy. + values: A float32 tensor of shape [T, B] with the value function estimates + wrt. the target policy. + bootstrap_values: A float32 of shape [B] with the value function estimate at + time T. + clip_rho_threshold: A scalar float32 tensor with the clipping threshold for + importance weights (rho) when calculating the baseline targets (vs). + rho^bar in the paper. + clip_pg_rho_threshold: A scalar float32 tensor with the clipping threshold + on rho_s in \rho_s \delta log \pi(a|x) (r + \gamma v_{s+1} - V(x_s)). + """ + log_rhos = target_action_log_probs - behaviour_action_log_probs + + rhos = torch.exp(log_rhos) + if clip_rho_threshold is not None: + clipped_rhos = torch.clamp(rhos, max=clip_rho_threshold) + else: + clipped_rhos = rhos + + cs = torch.clamp(rhos, max=1.0) + # Append bootstrapped value to get [v1, ..., v_t+1] + values_t_plus_1 = torch.cat( + [values[1:], torch.unsqueeze(bootstrap_values, 0)], axis=0 + ) + + deltas = clipped_rhos * (rewards + discounts * values_t_plus_1 - values) + + # Only move the for-loop to CPU. 
+ discounts_cpu = discounts.to("cpu") + cs_cpu = cs.to("cpu") + deltas_cpu = deltas.to("cpu") + vs_minus_v_xs_cpu = [torch.zeros_like(bootstrap_values, device="cpu")] + for i in reversed(range(len(discounts_cpu))): + discount_t, c_t, delta_t = discounts_cpu[i], cs_cpu[i], deltas_cpu[i] + vs_minus_v_xs_cpu.append(delta_t + discount_t * c_t * vs_minus_v_xs_cpu[-1]) + vs_minus_v_xs_cpu = torch.stack(vs_minus_v_xs_cpu[1:]) + # Move results back to GPU - if applicable. + vs_minus_v_xs = vs_minus_v_xs_cpu.to(deltas.device) + + # Reverse the results back to original order. + vs_minus_v_xs = torch.flip(vs_minus_v_xs, dims=[0]) + + # Add V(x_s) to get v_s. + vs = torch.add(vs_minus_v_xs, values) + + # Advantage for policy gradient. + vs_t_plus_1 = torch.cat([vs[1:], torch.unsqueeze(bootstrap_values, 0)], axis=0) + if clip_pg_rho_threshold is not None: + clipped_pg_rhos = torch.clamp(rhos, max=clip_pg_rho_threshold) + else: + clipped_pg_rhos = rhos + pg_advantages = clipped_pg_rhos * (rewards + discounts * vs_t_plus_1 - values) + + # Make sure no gradients backpropagated through the returned values. 
+ return torch.detach(vs), torch.detach(pg_advantages) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/centralized_critic.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/centralized_critic.py new file mode 100644 index 0000000000000000000000000000000000000000..14380b789908417052a85dc30df2638105f05453 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/centralized_critic.py @@ -0,0 +1,319 @@ +# @OldAPIStack + +# *********************************************************************************** +# IMPORTANT NOTE: This script uses the old API stack and will soon be replaced by +# `ray.rllib.examples.multi_agent.pettingzoo_shared_value_function.py`! +# *********************************************************************************** + +"""An example of customizing PPO to leverage a centralized critic. + +Here the model and policy are hard-coded to implement a centralized critic +for TwoStepGame, but you can adapt this for your own use cases. + +Compared to simply running `rllib/examples/two_step_game.py --run=PPO`, +this centralized critic version reaches vf_explained_variance=1.0 more stably +since it takes into account the opponent actions as well as the policy's. +Note that this is also using two independent policies instead of weight-sharing +with one. + +See also: centralized_critic_2.py for a simpler approach that instead +modifies the environment. 
+""" + +import argparse +from gymnasium.spaces import Discrete +import numpy as np +import os + +import ray +from ray import air, tune +from ray.air.constants import TRAINING_ITERATION +from ray.rllib.algorithms.ppo.ppo import PPO, PPOConfig +from ray.rllib.algorithms.ppo.ppo_tf_policy import ( + PPOTF1Policy, + PPOTF2Policy, +) +from ray.rllib.algorithms.ppo.ppo_torch_policy import PPOTorchPolicy +from ray.rllib.evaluation.postprocessing import compute_advantages, Postprocessing +from ray.rllib.examples.envs.classes.multi_agent.two_step_game import TwoStepGame +from ray.rllib.examples._old_api_stack.models.centralized_critic_models import ( + CentralizedCriticModel, + TorchCentralizedCriticModel, +) +from ray.rllib.models import ModelCatalog +from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.utils.annotations import override +from ray.rllib.utils.framework import try_import_tf, try_import_torch +from ray.rllib.utils.metrics import ( + ENV_RUNNER_RESULTS, + EPISODE_RETURN_MEAN, + NUM_ENV_STEPS_SAMPLED_LIFETIME, +) +from ray.rllib.utils.numpy import convert_to_numpy +from ray.rllib.utils.test_utils import check_learning_achieved +from ray.rllib.utils.tf_utils import explained_variance, make_tf_callable +from ray.rllib.utils.torch_utils import convert_to_torch_tensor + +tf1, tf, tfv = try_import_tf() +torch, nn = try_import_torch() + +OPPONENT_OBS = "opponent_obs" +OPPONENT_ACTION = "opponent_action" + +parser = argparse.ArgumentParser() +parser.add_argument( + "--framework", + choices=["tf", "tf2", "torch"], + default="torch", + help="The DL framework specifier.", +) +parser.add_argument( + "--as-test", + action="store_true", + help="Whether this script should be run as a test: --stop-reward must " + "be achieved within --stop-timesteps AND --stop-iters.", +) +parser.add_argument( + "--stop-iters", type=int, default=100, help="Number of iterations to train." 
+) +parser.add_argument( + "--stop-timesteps", type=int, default=100000, help="Number of timesteps to train." +) +parser.add_argument( + "--stop-reward", type=float, default=7.99, help="Reward at which we stop training." +) + + +class CentralizedValueMixin: + """Add method to evaluate the central value function from the model.""" + + def __init__(self): + if self.config["framework"] != "torch": + self.compute_central_vf = make_tf_callable(self.get_session())( + self.model.central_value_function + ) + else: + self.compute_central_vf = self.model.central_value_function + + +# Grabs the opponent obs/act and includes it in the experience train_batch, +# and computes GAE using the central vf predictions. +def centralized_critic_postprocessing( + policy, sample_batch, other_agent_batches=None, episode=None +): + pytorch = policy.config["framework"] == "torch" + if (pytorch and hasattr(policy, "compute_central_vf")) or ( + not pytorch and policy.loss_initialized() + ): + assert other_agent_batches is not None + [(_, _, opponent_batch)] = list(other_agent_batches.values()) + + # also record the opponent obs and actions in the trajectory + sample_batch[OPPONENT_OBS] = opponent_batch[SampleBatch.CUR_OBS] + sample_batch[OPPONENT_ACTION] = opponent_batch[SampleBatch.ACTIONS] + + # overwrite default VF prediction with the central VF + if args.framework == "torch": + sample_batch[SampleBatch.VF_PREDS] = ( + policy.compute_central_vf( + convert_to_torch_tensor( + sample_batch[SampleBatch.CUR_OBS], policy.device + ), + convert_to_torch_tensor(sample_batch[OPPONENT_OBS], policy.device), + convert_to_torch_tensor( + sample_batch[OPPONENT_ACTION], policy.device + ), + ) + .cpu() + .detach() + .numpy() + ) + else: + sample_batch[SampleBatch.VF_PREDS] = convert_to_numpy( + policy.compute_central_vf( + sample_batch[SampleBatch.CUR_OBS], + sample_batch[OPPONENT_OBS], + sample_batch[OPPONENT_ACTION], + ) + ) + else: + # Policy hasn't been initialized yet, use zeros. 
+ sample_batch[OPPONENT_OBS] = np.zeros_like(sample_batch[SampleBatch.CUR_OBS]) + sample_batch[OPPONENT_ACTION] = np.zeros_like(sample_batch[SampleBatch.ACTIONS]) + sample_batch[SampleBatch.VF_PREDS] = np.zeros_like( + sample_batch[SampleBatch.REWARDS], dtype=np.float32 + ) + + completed = sample_batch[SampleBatch.TERMINATEDS][-1] + if completed: + last_r = 0.0 + else: + last_r = sample_batch[SampleBatch.VF_PREDS][-1] + + train_batch = compute_advantages( + sample_batch, + last_r, + policy.config["gamma"], + policy.config["lambda"], + use_gae=policy.config["use_gae"], + ) + return train_batch + + +# Copied from PPO but optimizing the central value function. +def loss_with_central_critic(policy, base_policy, model, dist_class, train_batch): + # Save original value function. + vf_saved = model.value_function + + # Calculate loss with a custom value function. + model.value_function = lambda: policy.model.central_value_function( + train_batch[SampleBatch.CUR_OBS], + train_batch[OPPONENT_OBS], + train_batch[OPPONENT_ACTION], + ) + policy._central_value_out = model.value_function() + loss = base_policy.loss(model, dist_class, train_batch) + + # Restore original value function. + model.value_function = vf_saved + + return loss + + +def central_vf_stats(policy, train_batch): + # Report the explained variance of the central value function. + return { + "vf_explained_var": explained_variance( + train_batch[Postprocessing.VALUE_TARGETS], policy._central_value_out + ) + } + + +def get_ccppo_policy(base): + class CCPPOTFPolicy(CentralizedValueMixin, base): + def __init__(self, observation_space, action_space, config): + base.__init__(self, observation_space, action_space, config) + CentralizedValueMixin.__init__(self) + + @override(base) + def loss(self, model, dist_class, train_batch): + # Use super() to get to the base PPO policy. + # This special loss function utilizes a shared + # value function defined on self, and the loss function + # defined on PPO policies. 
+ return loss_with_central_critic( + self, super(), model, dist_class, train_batch + ) + + @override(base) + def postprocess_trajectory( + self, sample_batch, other_agent_batches=None, episode=None + ): + return centralized_critic_postprocessing( + self, sample_batch, other_agent_batches, episode + ) + + @override(base) + def stats_fn(self, train_batch: SampleBatch): + stats = super().stats_fn(train_batch) + stats.update(central_vf_stats(self, train_batch)) + return stats + + return CCPPOTFPolicy + + +CCPPOStaticGraphTFPolicy = get_ccppo_policy(PPOTF1Policy) +CCPPOEagerTFPolicy = get_ccppo_policy(PPOTF2Policy) + + +class CCPPOTorchPolicy(CentralizedValueMixin, PPOTorchPolicy): + def __init__(self, observation_space, action_space, config): + PPOTorchPolicy.__init__(self, observation_space, action_space, config) + CentralizedValueMixin.__init__(self) + + @override(PPOTorchPolicy) + def loss(self, model, dist_class, train_batch): + return loss_with_central_critic(self, super(), model, dist_class, train_batch) + + @override(PPOTorchPolicy) + def postprocess_trajectory( + self, sample_batch, other_agent_batches=None, episode=None + ): + return centralized_critic_postprocessing( + self, sample_batch, other_agent_batches, episode + ) + + +class CentralizedCritic(PPO): + @classmethod + @override(PPO) + def get_default_policy_class(cls, config): + if config["framework"] == "torch": + return CCPPOTorchPolicy + elif config["framework"] == "tf": + return CCPPOStaticGraphTFPolicy + else: + return CCPPOEagerTFPolicy + + +if __name__ == "__main__": + ray.init(local_mode=True) + args = parser.parse_args() + + ModelCatalog.register_custom_model( + "cc_model", + TorchCentralizedCriticModel + if args.framework == "torch" + else CentralizedCriticModel, + ) + + config = ( + PPOConfig() + .api_stack( + enable_env_runner_and_connector_v2=False, + enable_rl_module_and_learner=False, + ) + .environment(TwoStepGame) + .framework(args.framework) + .env_runners(batch_mode="complete_episodes", 
num_env_runners=0) + .training(model={"custom_model": "cc_model"}) + .multi_agent( + policies={ + "pol1": ( + None, + Discrete(6), + TwoStepGame.action_space, + # `framework` would also be ok here. + PPOConfig.overrides(framework_str=args.framework), + ), + "pol2": ( + None, + Discrete(6), + TwoStepGame.action_space, + # `framework` would also be ok here. + PPOConfig.overrides(framework_str=args.framework), + ), + }, + policy_mapping_fn=lambda agent_id, episode, worker, **kwargs: "pol1" + if agent_id == 0 + else "pol2", + ) + # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. + .resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0"))) + ) + + stop = { + TRAINING_ITERATION: args.stop_iters, + NUM_ENV_STEPS_SAMPLED_LIFETIME: args.stop_timesteps, + f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": args.stop_reward, + } + + tuner = tune.Tuner( + CentralizedCritic, + param_space=config.to_dict(), + run_config=air.RunConfig(stop=stop, verbose=1), + ) + results = tuner.fit() + + if args.as_test: + check_learning_achieved(results, args.stop_reward) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/compute_adapted_gae_on_postprocess_trajectory.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/compute_adapted_gae_on_postprocess_trajectory.py new file mode 100644 index 0000000000000000000000000000000000000000..44745c8722b84553f14e487c4aa7d3047632ff1b --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/compute_adapted_gae_on_postprocess_trajectory.py @@ -0,0 +1,157 @@ +# @OldAPIStack + +""" +Adapted (time-dependent) GAE for PPO algorithm that you can activate by setting +use_adapted_gae=True in the policy config. Additionally, it's required that +"callbacks" include the custom callback class in the Algorithm's config. +Furthermore, the env must return in its info dictionary a key-value pair of +the form "d_ts": ... where the value is the length (time) of recent agent step. 
+ +This adapted, time-dependent computation of advantages may be useful in cases +where agent's actions take various times and thus time steps are not +equidistant (https://docdro.id/400TvlR) +""" + +from ray.rllib.callbacks.callbacks import RLlibCallback +from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.evaluation.postprocessing import Postprocessing +from ray.rllib.utils.annotations import override +import numpy as np + + +class MyCallbacks(RLlibCallback): + @override(RLlibCallback) + def on_postprocess_trajectory( + self, + *, + worker, + episode, + agent_id, + policy_id, + policies, + postprocessed_batch, + original_batches, + **kwargs + ): + super().on_postprocess_trajectory( + worker=worker, + episode=episode, + agent_id=agent_id, + policy_id=policy_id, + policies=policies, + postprocessed_batch=postprocessed_batch, + original_batches=original_batches, + **kwargs + ) + + if policies[policy_id].config.get("use_adapted_gae", False): + policy = policies[policy_id] + assert policy.config[ + "use_gae" + ], "Can't use adapted gae without use_gae=True!" + + info_dicts = postprocessed_batch[SampleBatch.INFOS] + assert np.all( + ["d_ts" in info_dict for info_dict in info_dicts] + ), "Info dicts in sample batch must contain data 'd_ts' \ + (=ts[i+1]-ts[i] length of time steps)!" + + d_ts = np.array( + [np.float(info_dict.get("d_ts")) for info_dict in info_dicts] + ) + assert np.all( + [e.is_integer() for e in d_ts] + ), "Elements of 'd_ts' (length of time steps) must be integer!" + + # Trajectory is actually complete -> last r=0.0. + if postprocessed_batch[SampleBatch.TERMINATEDS][-1]: + last_r = 0.0 + # Trajectory has been truncated -> last r=VF estimate of last obs. + else: + # Input dict is provided to us automatically via the Model's + # requirements. It's a single-timestep (last one in trajectory) + # input_dict. + # Create an input dict according to the Model's requirements. 
+ input_dict = postprocessed_batch.get_single_step_input_dict( + policy.model.view_requirements, index="last" + ) + last_r = policy._value(**input_dict) + + gamma = policy.config["gamma"] + lambda_ = policy.config["lambda"] + + vpred_t = np.concatenate( + [postprocessed_batch[SampleBatch.VF_PREDS], np.array([last_r])] + ) + delta_t = ( + postprocessed_batch[SampleBatch.REWARDS] + + gamma**d_ts * vpred_t[1:] + - vpred_t[:-1] + ) + # This formula for the advantage is an adaption of + # "Generalized Advantage Estimation" + # (https://arxiv.org/abs/1506.02438) which accounts for time steps + # of irregular length (see proposal here ). + # NOTE: last time step delta is not required + postprocessed_batch[ + Postprocessing.ADVANTAGES + ] = generalized_discount_cumsum(delta_t, d_ts[:-1], gamma * lambda_) + postprocessed_batch[Postprocessing.VALUE_TARGETS] = ( + postprocessed_batch[Postprocessing.ADVANTAGES] + + postprocessed_batch[SampleBatch.VF_PREDS] + ).astype(np.float32) + + postprocessed_batch[Postprocessing.ADVANTAGES] = postprocessed_batch[ + Postprocessing.ADVANTAGES + ].astype(np.float32) + + +def generalized_discount_cumsum( + x: np.ndarray, deltas: np.ndarray, gamma: float +) -> np.ndarray: + """Calculates the 'time-dependent' discounted cumulative sum over a + (reward) sequence `x`. + + Recursive equations: + + y[t] - gamma**deltas[t+1]*y[t+1] = x[t] + + reversed(y)[t] - gamma**reversed(deltas)[t-1]*reversed(y)[t-1] = + reversed(x)[t] + + Args: + x (np.ndarray): A sequence of rewards or one-step TD residuals. + deltas (np.ndarray): A sequence of time step deltas (length of time + steps). + gamma: The discount factor gamma. + + Returns: + np.ndarray: The sequence containing the 'time-dependent' discounted + cumulative sums for each individual element in `x` till the end of + the trajectory. + + .. 
testcode:: + :skipif: True + + x = np.array([0.0, 1.0, 2.0, 3.0]) + deltas = np.array([1.0, 4.0, 15.0]) + gamma = 0.9 + generalized_discount_cumsum(x, deltas, gamma) + + .. testoutput:: + + array([0.0 + 0.9^1.0*1.0 + 0.9^4.0*2.0 + 0.9^15.0*3.0, + 1.0 + 0.9^4.0*2.0 + 0.9^15.0*3.0, + 2.0 + 0.9^15.0*3.0, + 3.0]) + """ + reversed_x = x[::-1] + reversed_deltas = deltas[::-1] + reversed_y = np.empty_like(x) + reversed_y[0] = reversed_x[0] + for i in range(1, x.size): + reversed_y[i] = ( + reversed_x[i] + gamma ** reversed_deltas[i - 1] * reversed_y[i - 1] + ) + + return reversed_y[::-1] diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/agents_act_in_sequence.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/agents_act_in_sequence.py new file mode 100644 index 0000000000000000000000000000000000000000..c2872a6e4aca137da03af2fde2341b1d3f40643a --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/agents_act_in_sequence.py @@ -0,0 +1,87 @@ +"""Example of running a multi-agent experiment w/ agents taking turns (sequence). + +This example: + - demonstrates how to write your own (multi-agent) environment using RLlib's + MultiAgentEnv API. + - shows how to implement the `reset()` and `step()` methods of the env such that + the agents act in a fixed sequence (taking turns). + - shows how to configure and setup this environment class within an RLlib + Algorithm config. + - runs the experiment with the configured algo, trying to solve the environment. 
+ + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack` + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- +You should see results similar to the following in your console output: ++---------------------------+----------+--------+------------------+--------+ +| Trial name | status | iter | total time (s) | ts | +|---------------------------+----------+--------+------------------+--------+ +| PPO_TicTacToe_957aa_00000 | RUNNING | 25 | 96.7452 | 100000 | ++---------------------------+----------+--------+------------------+--------+ ++-------------------+------------------+------------------+ +| combined return | return player2 | return player1 | +|-------------------+------------------+------------------| +| -2 | 1.15 | -0.85 | ++-------------------+------------------+------------------+ + +Note that even though we are playing a zero-sum game, the overall return should start +at some negative values due to the misplacement penalty of our (simplified) TicTacToe +game. 
+""" +from ray.rllib.examples.envs.classes.multi_agent.tic_tac_toe import TicTacToe +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.tune.registry import get_trainable_cls, register_env # noqa + + +parser = add_rllib_example_script_args( + default_reward=-4.0, default_iters=50, default_timesteps=100000 +) +parser.set_defaults( + enable_new_api_stack=True, + num_agents=2, +) + + +if __name__ == "__main__": + args = parser.parse_args() + + assert args.num_agents == 2, "Must set --num-agents=2 when running this script!" + + # You can also register the env creator function explicitly with: + # register_env("tic_tac_toe", lambda cfg: TicTacToe()) + + # Or allow the RLlib user to set more c'tor options via their algo config: + # config.environment(env_config={[c'tor arg name]: [value]}) + # register_env("tic_tac_toe", lambda cfg: TicTacToe(cfg)) + + base_config = ( + get_trainable_cls(args.algo) + .get_default_config() + .environment(TicTacToe) + .multi_agent( + # Define two policies. + policies={"player1", "player2"}, + # Map agent "player1" to policy "player1" and agent "player2" to policy + # "player2". + policy_mapping_fn=lambda agent_id, episode, **kw: agent_id, + ) + ) + + run_rllib_example_script_experiment(base_config, args) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/agents_act_simultaneously.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/agents_act_simultaneously.py new file mode 100644 index 0000000000000000000000000000000000000000..1e9ce55ce29b00c4a53be231083fc2ef645fddf1 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/agents_act_simultaneously.py @@ -0,0 +1,108 @@ +"""Example of running a multi-agent experiment w/ agents always acting simultaneously. + +This example: + - demonstrates how to write your own (multi-agent) environment using RLlib's + MultiAgentEnv API. 
+ - shows how to implement the `reset()` and `step()` methods of the env such that + the agents act simultaneously. + - shows how to configure and setup this environment class within an RLlib + Algorithm config. + - runs the experiment with the configured algo, trying to solve the environment. + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack --sheldon-cooper-mode` + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- +You should see results similar to the following in your console output: + ++-----------------------------------+----------+--------+------------------+-------+ +| Trial name | status | iter | total time (s) | ts | +|-----------------------------------+----------+--------+------------------+-------+ +| PPO_RockPaperScissors_8cef7_00000 | RUNNING | 3 | 16.5348 | 12000 | ++-----------------------------------+----------+--------+------------------+-------+ ++-------------------+------------------+------------------+ +| combined return | return player2 | return player1 | +|-------------------+------------------+------------------| +| 0 | -0.15 | 0.15 | ++-------------------+------------------+------------------+ + +Note that b/c we are playing a zero-sum game, the overall return remains 0.0 at +all times. 
+""" +from ray.rllib.examples.envs.classes.multi_agent.rock_paper_scissors import ( + RockPaperScissors, +) +from ray.rllib.connectors.env_to_module.flatten_observations import FlattenObservations +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.tune.registry import get_trainable_cls, register_env # noqa + + +parser = add_rllib_example_script_args( + default_reward=0.9, default_iters=50, default_timesteps=100000 +) +parser.set_defaults( + enable_new_api_stack=True, + num_agents=2, +) +parser.add_argument( + "--sheldon-cooper-mode", + action="store_true", + help="Whether to add two more actions to the game: Lizard and Spock. " + "Watch here for more details :) https://www.youtube.com/watch?v=x5Q6-wMx-K8", +) + + +if __name__ == "__main__": + args = parser.parse_args() + + assert args.num_agents == 2, "Must set --num-agents=2 when running this script!" + + # You can also register the env creator function explicitly with: + # register_env("env", lambda cfg: RockPaperScissors({"sheldon_cooper_mode": False})) + + # Or you can hard code certain settings into the Env's constructor (`config`). + # register_env( + # "rock-paper-scissors-w-sheldon-mode-activated", + # lambda config: RockPaperScissors({**config, **{"sheldon_cooper_mode": True}}), + # ) + + # Or allow the RLlib user to set more c'tor options via their algo config: + # config.environment(env_config={[c'tor arg name]: [value]}) + # register_env("rock-paper-scissors", lambda cfg: RockPaperScissors(cfg)) + + base_config = ( + get_trainable_cls(args.algo) + .get_default_config() + .environment( + RockPaperScissors, + env_config={"sheldon_cooper_mode": args.sheldon_cooper_mode}, + ) + .env_runners( + env_to_module_connector=lambda env: FlattenObservations(multi_agent=True), + ) + .multi_agent( + # Define two policies. + policies={"player1", "player2"}, + # Map agent "player1" to policy "player1" and agent "player2" to policy + # "player2". 
+ policy_mapping_fn=lambda agent_id, episode, **kw: agent_id, + ) + ) + + run_rllib_example_script_experiment(base_config, args) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/async_gym_env_vectorization.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/async_gym_env_vectorization.py new file mode 100644 index 0000000000000000000000000000000000000000..06a2d7d0982a3961d5a9c8be09e8c0c925cde288 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/async_gym_env_vectorization.py @@ -0,0 +1,142 @@ +"""Example demo'ing async gym vector envs, in which sub-envs have their own process. + +Setting up env vectorization works through setting the `config.num_envs_per_env_runner` +value to > 1. However, by default the n sub-environments are stepped through +sequentially, rather than in parallel. + +This script shows the effect of setting the `config.gym_env_vectorize_mode` from its +default value of "SYNC" (all sub envs are located in the same EnvRunner process) +to "ASYNC" (all sub envs in each EnvRunner get their own process). + +This example: + - shows, which config settings to change in order to switch from sub-envs being + stepped in sequence to each sub-envs owning its own process (and compute resource) + and thus the vector being stepped in parallel. + - shows, how this setup can increase EnvRunner performance significantly, especially + for heavier, slower environments. + - uses an artificially slow CartPole-v1 environment for demonstration purposes. + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack ` + +Use the `--vectorize-mode=BOTH` option to run both modes (SYNC and ASYNC) +through Tune at the same time and get a better comparison of the throughputs +achieved. 
+ +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- +You should see results similar to the following in your console output +when using the + ++--------------------------+------------+------------------------+------+ +| Trial name | status | gym_env_vectorize_mode | iter | +| | | | | +|--------------------------+------------+------------------------+------+ +| PPO_slow-env_6ddf4_00000 | TERMINATED | SYNC | 4 | +| PPO_slow-env_6ddf4_00001 | TERMINATED | ASYNC | 4 | ++--------------------------+------------+------------------------+------+ ++------------------+----------------------+------------------------+ +| total time (s) | episode_return_mean | num_env_steps_sample | +| | | d_lifetime | +|------------------+----------------------+------------------------+ +| 60.8794 | 73.53 | 16040 | +| 19.1203 | 73.86 | 16037 | ++------------------+----------------------+------------------------+ + +You can see that the ASYNC mode, given that the env is sufficiently slow, +achieves much better results when using vectorization. + +You should see no difference, however, when only using +`--num-envs-per-env-runner=1`. 
+""" +import time + +import gymnasium as gym + +from ray.rllib.algorithms.ppo import PPOConfig +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray import tune + +parser = add_rllib_example_script_args(default_reward=60.0) +parser.set_defaults( + enable_new_api_stack=True, + env="CartPole-v1", + num_envs_per_env_runner=6, +) +parser.add_argument( + "--vectorize-mode", + type=str, + default="ASYNC", + help="The value `gym.envs.registration.VectorizeMode` to use for env " + "vectorization. SYNC steps through all sub-envs in sequence. ASYNC (default) " + "parallelizes sub-envs through multiprocessing and can speed up EnvRunners " + "significantly. Use the special value `BOTH` to run both ASYNC and SYNC through a " + "Tune grid-search.", +) + + +class SlowEnv(gym.ObservationWrapper): + def observation(self, observation): + time.sleep(0.005) + return observation + + +if __name__ == "__main__": + args = parser.parse_args() + + if args.no_tune and args.vectorize_mode == "BOTH": + raise ValueError( + "Can't run this script with both --no-tune and --vectorize-mode=BOTH!" + ) + + # Wrap the env with the slowness wrapper. + def _env_creator(cfg): + return SlowEnv(gym.make(args.env, **cfg)) + + tune.register_env("slow-env", _env_creator) + + if args.vectorize_mode == "BOTH" and args.no_tune: + raise ValueError( + "`--vectorize-mode=BOTH` and `--no-tune` not allowed in combination!" + ) + + base_config = ( + PPOConfig() + .environment("slow-env") + .env_runners( + gym_env_vectorize_mode=( + tune.grid_search(["SYNC", "ASYNC"]) + if args.vectorize_mode == "BOTH" + else args.vectorize_mode + ), + ) + ) + + results = run_rllib_example_script_experiment(base_config, args) + + # Compare the throughputs and assert that ASYNC is much faster than SYNC. 
+ if args.vectorize_mode == "BOTH": + throughput_sync = ( + results[0].metrics["num_env_steps_sampled_lifetime"] + / results[0].metrics["time_total_s"] + ) + throughput_async = ( + results[1].metrics["num_env_steps_sampled_lifetime"] + / results[1].metrics["time_total_s"] + ) + assert throughput_async > throughput_sync diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/action_mask_env.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/action_mask_env.py new file mode 100644 index 0000000000000000000000000000000000000000..7c67db342f72805fff52672ccb767b62a92c3167 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/action_mask_env.py @@ -0,0 +1,42 @@ +from gymnasium.spaces import Box, Dict, Discrete +import numpy as np + +from ray.rllib.examples.envs.classes.random_env import RandomEnv + + +class ActionMaskEnv(RandomEnv): + """A randomly acting environment that publishes an action-mask each step.""" + + def __init__(self, config): + super().__init__(config) + # Masking only works for Discrete actions. + assert isinstance(self.action_space, Discrete) + # Add action_mask to observations. + self.observation_space = Dict( + { + "action_mask": Box(0.0, 1.0, shape=(self.action_space.n,)), + "observations": self.observation_space, + } + ) + self.valid_actions = None + + def reset(self, *, seed=None, options=None): + obs, info = super().reset() + self._fix_action_mask(obs) + return obs, info + + def step(self, action): + # Check whether action is valid. + if not self.valid_actions[action]: + raise ValueError( + f"Invalid action ({action}) sent to env! 
" + f"valid_actions={self.valid_actions}" + ) + obs, rew, done, truncated, info = super().step(action) + self._fix_action_mask(obs) + return obs, rew, done, truncated, info + + def _fix_action_mask(self, obs): + # Fix action-mask: Everything larger 0.5 is 1.0, everything else 0.0. + self.valid_actions = np.round(obs["action_mask"]) + obs["action_mask"] = self.valid_actions diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/cartpole_crashing.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/cartpole_crashing.py new file mode 100644 index 0000000000000000000000000000000000000000..fe5e4f14b4f4dd386242d0105e2fd5b52103bcc8 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/cartpole_crashing.py @@ -0,0 +1,182 @@ +import logging +from gymnasium.envs.classic_control import CartPoleEnv +import numpy as np +import time + +from ray.rllib.examples.envs.classes.multi_agent import make_multi_agent +from ray.rllib.utils.annotations import override +from ray.rllib.utils.error import EnvError + +logger = logging.getLogger(__name__) + + +class CartPoleCrashing(CartPoleEnv): + """A CartPole env that crashes (or stalls) from time to time. + + Useful for testing faulty sub-env (within a vectorized env) handling by + EnvRunners. + + After crashing, the env expects a `reset()` call next (calling `step()` will + result in yet another error), which may or may not take a very long time to + complete. This simulates the env having to reinitialize some sub-processes, e.g. + an external connection. + + The env can also be configured to stall (and do nothing during a call to `step()`) + from time to time for a configurable amount of time. + """ + + def __init__(self, config=None): + super().__init__() + + self.config = config if config is not None else {} + + # Crash probability (in each `step()`). + self.p_crash = config.get("p_crash", 0.005) + # Crash probability when `reset()` is called. 
+ self.p_crash_reset = config.get("p_crash_reset", 0.0) + # Crash exactly after every n steps. If a 2-tuple, will uniformly sample + # crash timesteps from in between the two given values. + self.crash_after_n_steps = config.get("crash_after_n_steps") + self._crash_after_n_steps = None + assert ( + self.crash_after_n_steps is None + or isinstance(self.crash_after_n_steps, int) + or ( + isinstance(self.crash_after_n_steps, tuple) + and len(self.crash_after_n_steps) == 2 + ) + ) + # Only ever crash, if on certain worker indices. + faulty_indices = config.get("crash_on_worker_indices", None) + if faulty_indices and config.worker_index not in faulty_indices: + self.p_crash = 0.0 + self.p_crash_reset = 0.0 + self.crash_after_n_steps = None + + # Stall probability (in each `step()`). + self.p_stall = config.get("p_stall", 0.0) + # Stall probability when `reset()` is called. + self.p_stall_reset = config.get("p_stall_reset", 0.0) + # Stall exactly after every n steps. + self.stall_after_n_steps = config.get("stall_after_n_steps") + self._stall_after_n_steps = None + # Amount of time to stall. If a 2-tuple, will uniformly sample from in between + # the two given values. + self.stall_time_sec = config.get("stall_time_sec") + assert ( + self.stall_time_sec is None + or isinstance(self.stall_time_sec, (int, float)) + or ( + isinstance(self.stall_time_sec, tuple) and len(self.stall_time_sec) == 2 + ) + ) + + # Only ever stall, if on certain worker indices. + faulty_indices = config.get("stall_on_worker_indices", None) + if faulty_indices and config.worker_index not in faulty_indices: + self.p_stall = 0.0 + self.p_stall_reset = 0.0 + self.stall_after_n_steps = None + + # Timestep counter for the ongoing episode. + self.timesteps = 0 + + # Time in seconds to initialize (in this c'tor). 
+ sample = 0.0 + if "init_time_s" in config: + sample = ( + config["init_time_s"] + if not isinstance(config["init_time_s"], tuple) + else np.random.uniform( + config["init_time_s"][0], config["init_time_s"][1] + ) + ) + + print(f"Initializing crashing env (with init-delay of {sample}sec) ...") + time.sleep(sample) + + # Make sure envs don't crash at the same time. + self._rng = np.random.RandomState() + + @override(CartPoleEnv) + def reset(self, *, seed=None, options=None): + # Reset timestep counter for the new episode. + self.timesteps = 0 + self._crash_after_n_steps = None + + # Should we crash? + if self._should_crash(p=self.p_crash_reset): + raise EnvError( + f"Simulated env crash on worker={self.config.worker_index} " + f"env-idx={self.config.vector_index} during `reset()`! " + "Feel free to use any other exception type here instead." + ) + # Should we stall for a while? + self._stall_if_necessary(p=self.p_stall_reset) + + return super().reset() + + @override(CartPoleEnv) + def step(self, action): + # Increase timestep counter for the ongoing episode. + self.timesteps += 1 + + # Should we crash? + if self._should_crash(p=self.p_crash): + raise EnvError( + f"Simulated env crash on worker={self.config.worker_index} " + f"env-idx={self.config.vector_index} during `step()`! " + "Feel free to use any other exception type here instead." + ) + # Should we stall for a while? 
+ self._stall_if_necessary(p=self.p_stall) + + return super().step(action) + + def _should_crash(self, p): + rnd = self._rng.rand() + if rnd < p: + print("Crashing due to p(crash)!") + return True + elif self.crash_after_n_steps is not None: + if self._crash_after_n_steps is None: + self._crash_after_n_steps = ( + self.crash_after_n_steps + if not isinstance(self.crash_after_n_steps, tuple) + else np.random.randint( + self.crash_after_n_steps[0], self.crash_after_n_steps[1] + ) + ) + if self._crash_after_n_steps == self.timesteps: + print("Crashing due to n timesteps reached!") + return True + + return False + + def _stall_if_necessary(self, p): + stall = False + if self._rng.rand() < p: + stall = True + elif self.stall_after_n_steps is not None: + if self._stall_after_n_steps is None: + self._stall_after_n_steps = ( + self.stall_after_n_steps + if not isinstance(self.stall_after_n_steps, tuple) + else np.random.randint( + self.stall_after_n_steps[0], self.stall_after_n_steps[1] + ) + ) + if self._stall_after_n_steps == self.timesteps: + stall = True + + if stall: + sec = ( + self.stall_time_sec + if not isinstance(self.stall_time_sec, tuple) + else np.random.uniform(self.stall_time_sec[0], self.stall_time_sec[1]) + ) + print(f" -> will stall for {sec}sec ...") + time.sleep(sec) + + +MultiAgentCartPoleCrashing = make_multi_agent(lambda config: CartPoleCrashing(config)) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/cartpole_sparse_rewards.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/cartpole_sparse_rewards.py new file mode 100644 index 0000000000000000000000000000000000000000..d68f7614c1033ad8c60fe5736e66bdf88f14fc45 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/cartpole_sparse_rewards.py @@ -0,0 +1,51 @@ +from copy import deepcopy + +import gymnasium as gym +import numpy as np +from gymnasium.spaces import Box, Dict, Discrete + + +class 
CartPoleSparseRewards(gym.Env): + """Wrapper for gym CartPole environment where reward is accumulated to the end.""" + + def __init__(self, config=None): + self.env = gym.make("CartPole-v1") + self.action_space = Discrete(2) + self.observation_space = Dict( + { + "obs": self.env.observation_space, + "action_mask": Box( + low=0, high=1, shape=(self.action_space.n,), dtype=np.int8 + ), + } + ) + self.running_reward = 0 + + def reset(self, *, seed=None, options=None): + self.running_reward = 0 + obs, infos = self.env.reset() + return { + "obs": obs, + "action_mask": np.array([1, 1], dtype=np.int8), + }, infos + + def step(self, action): + obs, rew, terminated, truncated, info = self.env.step(action) + self.running_reward += rew + score = self.running_reward if terminated else 0 + return ( + {"obs": obs, "action_mask": np.array([1, 1], dtype=np.int8)}, + score, + terminated, + truncated, + info, + ) + + def set_state(self, state): + self.running_reward = state[1] + self.env = deepcopy(state[0]) + obs = np.array(list(self.env.unwrapped.state)) + return {"obs": obs, "action_mask": np.array([1, 1], dtype=np.int8)} + + def get_state(self): + return deepcopy(self.env), self.running_reward diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/cartpole_with_dict_observation_space.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/cartpole_with_dict_observation_space.py new file mode 100644 index 0000000000000000000000000000000000000000..e334f09296da31ecf96fb044cf52891a13f3d11b --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/cartpole_with_dict_observation_space.py @@ -0,0 +1,74 @@ +import gymnasium as gym +from gymnasium.envs.classic_control import CartPoleEnv +import numpy as np + + +class CartPoleWithDictObservationSpace(CartPoleEnv): + """CartPole gym environment that has a dict observation space. + + However, otherwise, the information content in each observation remains the same. 
+ + https://github.com/Farama-Foundation/Gymnasium/blob/main/gymnasium/envs/classic_control/cartpole.py # noqa + + The new observation space looks as follows (a little quirky, but this is + for testing purposes only): + + gym.spaces.Dict({ + "x-pos": [x-pos], + "angular-pos": gym.spaces.Dict({"test": [angular-pos]}), + "velocs": gym.spaces.Tuple([x-veloc, angular-veloc]), + }) + """ + + def __init__(self, config=None): + super().__init__() + + # Fix our observation-space as described above. + low = self.observation_space.low + high = self.observation_space.high + + # Test as many quirks and oddities as possible: Dict, Dict inside a Dict, + # Tuple inside a Dict, and both (1,)-shapes as well as ()-shapes for Boxes. + # Also add a random discrete variable here. + self.observation_space = gym.spaces.Dict( + { + "x-pos": gym.spaces.Box(low[0], high[0], (1,), dtype=np.float32), + "angular-pos": gym.spaces.Dict( + { + "value": gym.spaces.Box(low[2], high[2], (), dtype=np.float32), + # Add some random non-essential information. 
+ "some_random_stuff": gym.spaces.Discrete(3), + } + ), + "velocs": gym.spaces.Tuple( + [ + # x-veloc + gym.spaces.Box(low[1], high[1], (1,), dtype=np.float32), + # angular-veloc + gym.spaces.Box(low[3], high[3], (), dtype=np.float32), + ] + ), + } + ) + + def step(self, action): + next_obs, reward, done, truncated, info = super().step(action) + return self._compile_current_obs(next_obs), reward, done, truncated, info + + def reset(self, *, seed=None, options=None): + init_obs, init_info = super().reset(seed=seed, options=options) + return self._compile_current_obs(init_obs), init_info + + def _compile_current_obs(self, original_cartpole_obs): + # original_cartpole_obs is [x-pos, x-veloc, angle, angle-veloc] + return { + "x-pos": np.array([original_cartpole_obs[0]], np.float32), + "angular-pos": { + "value": original_cartpole_obs[2], + "some_random_stuff": np.random.randint(3), + }, + "velocs": ( + np.array([original_cartpole_obs[1]], np.float32), + np.array(original_cartpole_obs[3], np.float32), + ), + } diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/cartpole_with_large_observation_space.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/cartpole_with_large_observation_space.py new file mode 100644 index 0000000000000000000000000000000000000000..162db205658bb9bb78426300b23a6d90913fc89f --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/cartpole_with_large_observation_space.py @@ -0,0 +1,69 @@ +import gymnasium as gym +from gymnasium.envs.classic_control import CartPoleEnv +import numpy as np + + +class CartPoleWithLargeObservationSpace(CartPoleEnv): + """CartPole gym environment that has a large dict observation space. + + However, otherwise, the information content in each observation remains the same. 
+ + https://github.com/Farama-Foundation/Gymnasium/blob/main/gymnasium/envs/classic_control/cartpole.py # noqa + + The new observation space looks as follows (a little quirky, but this is + for testing purposes only): + + gym.spaces.Dict({ + "1": gym.spaces.Tuple(( + gym.spaces.Discrete(100), + gym.spaces.Box(0, 256, shape=(30,), dtype=float32), + )), + "2": gym.spaces.Tuple(( + gym.spaces.Discrete(100), + gym.spaces.Box(0, 256, shape=(30,), dtype=float32), + )), + "3": ... + "actual-obs": gym.spaces.Box(-inf, inf, (4,), float32), + }) + """ + + def __init__(self, config=None): + super().__init__() + + # Fix our observation-space as described above. + low = self.observation_space.low + high = self.observation_space.high + + # Test as many quirks and oddities as possible: Dict, Dict inside a Dict, + # Tuple inside a Dict, and both (1,)-shapes as well as ()-shapes for Boxes. + # Also add a random discrete variable here. + spaces = { + str(i): gym.spaces.Tuple( + ( + gym.spaces.Discrete(100), + gym.spaces.Box(0, 256, shape=(30,), dtype=np.float32), + ) + ) + for i in range(100) + } + spaces.update( + { + "actually-useful-stuff": ( + gym.spaces.Box(low[0], high[0], (4,), np.float32) + ) + } + ) + self.observation_space = gym.spaces.Dict(spaces) + + def step(self, action): + next_obs, reward, done, truncated, info = super().step(action) + return self._compile_current_obs(next_obs), reward, done, truncated, info + + def reset(self, *, seed=None, options=None): + init_obs, init_info = super().reset(seed=seed, options=options) + return self._compile_current_obs(init_obs), init_info + + def _compile_current_obs(self, original_cartpole_obs): + return { + str(i): self.observation_space.spaces[str(i)].sample() for i in range(100) + } | {"actually-useful-stuff": original_cartpole_obs} diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/cartpole_with_protobuf_observation_space.py 
b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/cartpole_with_protobuf_observation_space.py new file mode 100644 index 0000000000000000000000000000000000000000..f88b802d37a08ddf2c0972ace4eba8e8315521ea --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/cartpole_with_protobuf_observation_space.py @@ -0,0 +1,79 @@ +import gymnasium as gym +from gymnasium.envs.classic_control import CartPoleEnv +import numpy as np + +from ray.rllib.examples.envs.classes.utils.cartpole_observations_proto import ( + CartPoleObservation, +) + + +class CartPoleWithProtobufObservationSpace(CartPoleEnv): + """CartPole gym environment that has a protobuf observation space. + + Sometimes, it is more performant for an environment to publish its observations + as a protobuf message (instead of a heavily nested Dict). + + The protobuf message used here is originally defined in the + `./utils/cartpole_observations.proto` file. We converted this file into a python + importable module by compiling it with: + + `protoc --python_out=. cartpole_observations.proto` + + .. which yielded the `cartpole_observations_proto.py` file in the same directory + (we import this file's `CartPoleObservation` message here). + + The new observation space is a (binary) Box(0, 255, ([len of protobuf],), uint8). + + A ConnectorV2 pipeline or simpler gym.Wrapper will have to be used to convert this + observation format into an NN-readable (e.g. float32) 1D tensor. 
+ """ + + def __init__(self, config=None): + super().__init__() + dummy_obs = self._convert_observation_to_protobuf( + np.array([1.0, 1.0, 1.0, 1.0]) + ) + bin_length = len(dummy_obs) + self.observation_space = gym.spaces.Box(0, 255, (bin_length,), np.uint8) + + def step(self, action): + observation, reward, terminated, truncated, info = super().step(action) + proto_observation = self._convert_observation_to_protobuf(observation) + return proto_observation, reward, terminated, truncated, info + + def reset(self, **kwargs): + observation, info = super().reset(**kwargs) + proto_observation = self._convert_observation_to_protobuf(observation) + return proto_observation, info + + def _convert_observation_to_protobuf(self, observation): + x_pos, x_veloc, angle_pos, angle_veloc = observation + + # Create the Protobuf message + cartpole_observation = CartPoleObservation() + cartpole_observation.x_pos = x_pos + cartpole_observation.x_veloc = x_veloc + cartpole_observation.angle_pos = angle_pos + cartpole_observation.angle_veloc = angle_veloc + + # Serialize to binary string. + return np.frombuffer(cartpole_observation.SerializeToString(), np.uint8) + + +if __name__ == "__main__": + env = CartPoleWithProtobufObservationSpace() + obs, info = env.reset() + + # Test loading a protobuf object with data from the obs binary string + # (uint8 ndarray). 
+ byte_str = obs.tobytes() + obs_protobuf = CartPoleObservation() + obs_protobuf.ParseFromString(byte_str) + print(obs_protobuf) + + terminated = truncated = False + while not terminated and not truncated: + action = env.action_space.sample() + obs, reward, terminated, truncated, info = env.step(action) + + print(obs) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/cliff_walking_wall_env.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/cliff_walking_wall_env.py new file mode 100644 index 0000000000000000000000000000000000000000..496e86e3f90782a6227c570c157e74c3c03741a6 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/cliff_walking_wall_env.py @@ -0,0 +1,71 @@ +import gymnasium as gym +from gymnasium import spaces + +ACTION_UP = 0 +ACTION_RIGHT = 1 +ACTION_DOWN = 2 +ACTION_LEFT = 3 + + +class CliffWalkingWallEnv(gym.Env): + """Modified version of the CliffWalking environment from Farama-Foundation's + Gymnasium with walls instead of a cliff. + + ### Description + The board is a 4x12 matrix, with (using NumPy matrix indexing): + - [3, 0] or obs==36 as the start at bottom-left + - [3, 11] or obs==47 as the goal at bottom-right + - [3, 1..10] or obs==37...46 as the cliff at bottom-center + + An episode terminates when the agent reaches the goal. + + ### Actions + There are 4 discrete deterministic actions: + - 0: move up + - 1: move right + - 2: move down + - 3: move left + You can also use the constants ACTION_UP, ACTION_RIGHT, ... defined above. + + ### Observations + There are 3x12 + 2 possible states, not including the walls. If an action + would move an agent into one of the walls, it simply stays in the same position. + + ### Reward + Each time step incurs -1 reward, except reaching the goal which gives +10 reward. 
+ """ + + def __init__(self, seed=42) -> None: + self.observation_space = spaces.Discrete(48) + self.action_space = spaces.Discrete(4) + self.observation_space.seed(seed) + self.action_space.seed(seed) + + def reset(self, *, seed=None, options=None): + self.position = 36 + return self.position, {} + + def step(self, action): + x = self.position // 12 + y = self.position % 12 + # UP + if action == ACTION_UP: + x = max(x - 1, 0) + # RIGHT + elif action == ACTION_RIGHT: + if self.position != 36: + y = min(y + 1, 11) + # DOWN + elif action == ACTION_DOWN: + if self.position < 25 or self.position > 34: + x = min(x + 1, 3) + # LEFT + elif action == ACTION_LEFT: + if self.position != 47: + y = max(y - 1, 0) + else: + raise ValueError(f"action {action} not in {self.action_space}") + self.position = x * 12 + y + done = self.position == 47 + reward = -1 if not done else 10 + return self.position, reward, done, False, {} diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/correlated_actions_env.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/correlated_actions_env.py new file mode 100644 index 0000000000000000000000000000000000000000..8b0bdb882fc0bb05fe801ed4999fa8a1a74f5585 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/correlated_actions_env.py @@ -0,0 +1,79 @@ +from typing import Any, Dict, Optional + +import gymnasium as gym +import numpy as np + + +class CorrelatedActionsEnv(gym.Env): + """Environment that can only be solved through an autoregressive action model. + + In each step, the agent observes a random number (between -1 and 1) and has + to choose two actions, a1 (discrete, 0, 1, or 2) and a2 (cont. between -1 and 1). + + The reward is constructed such that actions need to be correlated to succeed. It's + impossible for the network to learn each action head separately. 
+ + There are two reward components: + The first is the negative absolute value of the delta between 1.0 and the sum of + obs + a1. For example, if obs is -0.3 and a1 was sampled to be 1, then the value of + the first reward component is: + r1 = -abs(1.0 - [obs+a1]) = -abs(1.0 - (-0.3 + 1)) = -abs(0.3) = -0.3 + The second reward component is computed as the negative absolute value + of `obs + a1 + a2`. For example, if obs is 0.5, a1 was sampled to be 0, + and a2 was sampled to be -0.7, then the value of the second reward component is: + r2 = -abs(obs + a1 + a2) = -abs(0.5 + 0 - 0.7)) = -abs(-0.2) = -0.2 + + Because of this specific reward function, the agent must learn to optimally sample + a1 based on the observation and to optimally sample a2, based on the observation + AND the sampled value of a1. + + One way to effectively learn this is through correlated action + distributions, e.g., in examples/actions/auto_regressive_actions.py + + The game ends after the first step. + """ + + def __init__(self, config=None): + super().__init__() + # Observation space (single continuous value between -1. and 1.). + self.observation_space = gym.spaces.Box(-1.0, 1.0, shape=(1,), dtype=np.float32) + + # Action space (discrete action a1 and continuous action a2). + self.action_space = gym.spaces.Tuple( + [gym.spaces.Discrete(3), gym.spaces.Box(-2.0, 2.0, (1,), np.float32)] + ) + + # Internal state for the environment (e.g., could represent a factor + # influencing the relationship) + self.obs = None + + def reset( + self, seed: Optional[int] = None, options: Optional[Dict[str, Any]] = None + ): + """Reset the environment to an initial state.""" + super().reset(seed=seed, options=options) + + # Randomly initialize the observation between -1 and 1. + self.obs = np.random.uniform(-1, 1, size=(1,)) + + return self.obs, {} + + def step(self, action): + """Apply the autoregressive action and return step information.""" + + # Extract individual action components, a1 and a2. 
+ a1, a2 = action + a2 = a2[0] # dissolve shape=(1,) + + # r1 depends on how well a1 is aligned to obs: + r1 = -abs(1.0 - (self.obs[0] + a1)) + # r2 depends on how well a2 is aligned to both, obs and a1. + r2 = -abs(self.obs[0] + a1 + a2) + + reward = r1 + r2 + + # Optionally: add some noise or complexity to the reward function + # reward += np.random.normal(0, 0.01) # Small noise can be added + + # Terminate after each step (no episode length in this simple example) + return self.obs, reward, True, False, {} diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/d4rl_env.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/d4rl_env.py new file mode 100644 index 0000000000000000000000000000000000000000..f77434589b92b05635e40907d2686c5bd8e82ee3 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/d4rl_env.py @@ -0,0 +1,46 @@ +""" +8 Environments from D4RL Environment. +Use fully qualified class-path in your configs: +e.g. "env": "ray.rllib.examples.envs.classes.d4rl_env.halfcheetah_random". +""" + +import gymnasium as gym + +try: + import d4rl + + d4rl.__name__ # Fool LINTer. 
def halfcheetah_medium():
    """Return the D4RL `halfcheetah-medium-v0` environment."""
    env_id = "halfcheetah-medium-v0"
    return gym.make(env_id)
+ """ + + def __init__(self, config=None): + config = config or {} + self.action_space = gym.spaces.Discrete(2) + self.observation_space = gym.spaces.Box(0, 100, (1,), dtype=np.float32) + self.start_at_t = int(config.get("start_at_t", 0)) + self.i = self.start_at_t + + def reset(self, *, seed=None, options=None): + self.i = self.start_at_t + return self._get_obs(), {} + + def step(self, action): + self.i += 1 + terminated = False + truncated = self.i >= 15 + self.start_at_t + return self._get_obs(), float(self.i % 3), terminated, truncated, {} + + def _get_obs(self): + return np.array([self.i], dtype=np.float32) + + +class MultiAgentDebugCounterEnv(MultiAgentEnv): + def __init__(self, config): + super().__init__() + self.num_agents = config["num_agents"] + self.base_episode_len = config.get("base_episode_len", 103) + + # Observation dims: + # 0=agent ID. + # 1=episode ID (0.0 for obs after reset). + # 2=env ID (0.0 for obs after reset). + # 3=ts (of the agent). + self.observation_space = gym.spaces.Dict( + { + aid: gym.spaces.Box(float("-inf"), float("inf"), (4,)) + for aid in range(self.num_agents) + } + ) + + # Actions are always: + # (episodeID, envID) as floats. 
+ self.action_space = gym.spaces.Dict( + { + aid: gym.spaces.Box(-float("inf"), float("inf"), shape=(2,)) + for aid in range(self.num_agents) + } + ) + + self.timesteps = [0] * self.num_agents + self.terminateds = set() + self.truncateds = set() + + def reset(self, *, seed=None, options=None): + self.timesteps = [0] * self.num_agents + self.terminateds = set() + self.truncateds = set() + return { + i: np.array([i, 0.0, 0.0, 0.0], dtype=np.float32) + for i in range(self.num_agents) + }, {} + + def step(self, action_dict): + obs, rew, terminated, truncated = {}, {}, {}, {} + for i, action in action_dict.items(): + self.timesteps[i] += 1 + obs[i] = np.array([i, action[0], action[1], self.timesteps[i]]) + rew[i] = self.timesteps[i] % 3 + terminated[i] = False + truncated[i] = ( + True if self.timesteps[i] > self.base_episode_len + i else False + ) + if terminated[i]: + self.terminateds.add(i) + if truncated[i]: + self.truncateds.add(i) + terminated["__all__"] = len(self.terminateds) == self.num_agents + truncated["__all__"] = len(self.truncateds) == self.num_agents + return obs, rew, terminated, truncated, {} diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/deterministic_envs.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/deterministic_envs.py new file mode 100644 index 0000000000000000000000000000000000000000..51f41f29fb3b0981f4d97106b678eec4e230eccc --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/deterministic_envs.py @@ -0,0 +1,13 @@ +import gymnasium as gym + + +def create_cartpole_deterministic(config): + env = gym.make("CartPole-v1") + env.reset(seed=config.get("seed", 0)) + return env + + +def create_pendulum_deterministic(config): + env = gym.make("Pendulum-v1") + env.reset(seed=config.get("seed", 0)) + return env diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/dm_control_suite.py 
def cheetah_run(
    from_pixels=True, height=64, width=64, frame_skip=2, channels_first=True
):
    """Create the DM-Control Suite `cheetah / run` task wrapped as a DMCEnv."""
    render_kwargs = dict(
        from_pixels=from_pixels,
        height=height,
        width=width,
        frame_skip=frame_skip,
        channels_first=channels_first,
    )
    return DMCEnv("cheetah", "run", **render_kwargs)
def pendulum_swingup(
    from_pixels=True, height=64, width=64, frame_skip=2, channels_first=True
):
    """Create the DM-Control Suite `pendulum / swingup` task wrapped as a DMCEnv."""
    render_kwargs = dict(
        from_pixels=from_pixels,
        height=height,
        width=width,
        frame_skip=frame_skip,
        channels_first=channels_first,
    )
    return DMCEnv("pendulum", "swingup", **render_kwargs)
+ self._handler = ray.get_actor(env_config.get("param_server", "param-server")) + self.rng_seed = None + self.np_random, _ = seeding.np_random(self.rng_seed) + + def reset(self, *, seed=None, options=None): + if seed is not None: + self.rng_seed = int(seed) + self.np_random, _ = seeding.np_random(seed) + print( + f"Seeding env (worker={self.env_config.worker_index}) " f"with {seed}" + ) + + # Pass in our RNG to guarantee no race conditions. + # If `self._handler` had its own RNG, this may clash with other + # envs trying to use the same param-server. + params = ray.get(self._handler.get_params.remote(self.np_random)) + + # IMPORTANT: Advance the state of our RNG (self._rng was passed + # above via ray (serialized) and thus not altered locally here!). + # Or create a new RNG from another random number: + # Seed the RNG with a deterministic seed if set, otherwise, create + # a random one. + new_seed = int( + self.np_random.integers(0, 1000000) if not self.rng_seed else self.rng_seed + ) + self.np_random, _ = seeding.np_random(new_seed) + + print( + f"Env worker-idx={self.env_config.worker_index} " + f"mass={params['MASSCART']}" + ) + + self.masscart = params["MASSCART"] + self.total_mass = self.masspole + self.masscart + self.polemass_length = self.masspole * self.length + + return super().reset() diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/env_with_subprocess.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/env_with_subprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..424f1eb095074a834f39d9159581dfddb190996f --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/env_with_subprocess.py @@ -0,0 +1,42 @@ +import atexit +import gymnasium as gym +from gymnasium.spaces import Discrete +import os +import subprocess + + +class EnvWithSubprocess(gym.Env): + """An env that spawns a subprocess.""" + + # Dummy command to run as a subprocess with a unique name 
+ UNIQUE_CMD = "sleep 20" + + def __init__(self, config): + self.UNIQUE_FILE_0 = config["tmp_file1"] + self.UNIQUE_FILE_1 = config["tmp_file2"] + self.UNIQUE_FILE_2 = config["tmp_file3"] + self.UNIQUE_FILE_3 = config["tmp_file4"] + + self.action_space = Discrete(2) + self.observation_space = Discrete(2) + # Subprocess that should be cleaned up. + self.subproc = subprocess.Popen(self.UNIQUE_CMD.split(" "), shell=False) + self.config = config + # Exit handler should be called. + atexit.register(lambda: self.subproc.kill()) + if config.worker_index == 0: + atexit.register(lambda: os.unlink(self.UNIQUE_FILE_0)) + else: + atexit.register(lambda: os.unlink(self.UNIQUE_FILE_1)) + + def close(self): + if self.config.worker_index == 0: + os.unlink(self.UNIQUE_FILE_2) + else: + os.unlink(self.UNIQUE_FILE_3) + + def reset(self, *, seed=None, options=None): + return 0, {} + + def step(self, action): + return 0, 0, True, False, {} diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/fast_image_env.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/fast_image_env.py new file mode 100644 index 0000000000000000000000000000000000000000..1eaad9a8fe81966ad42971b598bfcf6d125c11c7 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/fast_image_env.py @@ -0,0 +1,20 @@ +import gymnasium as gym +from gymnasium.spaces import Box, Discrete +import numpy as np + + +class FastImageEnv(gym.Env): + def __init__(self, config): + self.zeros = np.zeros((84, 84, 4)) + self.action_space = Discrete(2) + self.observation_space = Box(0.0, 1.0, shape=(84, 84, 4), dtype=np.float32) + self.i = 0 + + def reset(self, *, seed=None, options=None): + self.i = 0 + return self.zeros, {} + + def step(self, action): + self.i += 1 + done = truncated = self.i > 1000 + return self.zeros, 1, done, truncated, {} diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/gpu_requiring_env.py 
b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/gpu_requiring_env.py new file mode 100644 index 0000000000000000000000000000000000000000..e8e08f3489a901dd6b4cc3612c0e520c5684462f --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/gpu_requiring_env.py @@ -0,0 +1,37 @@ +import numpy as np + +import ray +from ray.rllib.examples.envs.classes.simple_corridor import SimpleCorridor +from ray.rllib.utils.framework import try_import_torch + +torch, _ = try_import_torch() + + +class GPURequiringEnv(SimpleCorridor): + """A dummy env that requires a GPU in order to work. + + The env here is a simple corridor env that additionally simulates a GPU + check in its constructor via `ray.get_gpu_ids()`. If this returns an + empty list, we raise an error. + + To make this env work, use `num_gpus_per_env_runner > 0` (RolloutWorkers + requesting this many GPUs each) and - maybe - `num_gpus > 0` in case + your local worker/driver must have an env as well. However, this is + only the case if `create_env_on_driver`=True (default is False). + """ + + def __init__(self, config=None): + super().__init__(config) + + # Fake-require some GPUs (at least one). + # If your local worker's env (`create_env_on_driver`=True) does not + # necessarily require a GPU, you can perform the below assertion only + # if `config.worker_index != 0`. + gpus_available = ray.get_gpu_ids() + print(f"{type(self).__name__} can see GPUs={gpus_available}") + + # Create a dummy tensor on the GPU. 
+ if len(gpus_available) > 0 and torch: + self._tensor = torch.from_numpy(np.random.random_sample(size=(42, 42))).to( + f"cuda:{gpus_available[0]}" + ) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/look_and_push.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/look_and_push.py new file mode 100644 index 0000000000000000000000000000000000000000..46c77b620e8a472499bf23a32c8da178c49a929e --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/look_and_push.py @@ -0,0 +1,65 @@ +import gymnasium as gym +import numpy as np + + +class LookAndPush(gym.Env): + """Memory-requiring Env: Best sequence of actions depends on prev. states. + + Optimal behavior: + 0) a=0 -> observe next state (s'), which is the "hidden" state. + If a=1 here, the hidden state is not observed. + 1) a=1 to always jump to s=2 (not matter what the prev. state was). + 2) a=1 to move to s=3. + 3) a=1 to move to s=4. + 4) a=0 OR 1 depending on s' observed after 0): +10 reward and done. + otherwise: -10 reward and done. 
+ """ + + def __init__(self): + self.action_space = gym.spaces.Discrete(2) + self.observation_space = gym.spaces.Discrete(5) + self._state = None + self._case = None + + def reset(self, *, seed=None, options=None): + self._state = 2 + self._case = np.random.choice(2) + return self._state, {} + + def step(self, action): + assert self.action_space.contains(action) + + if self._state == 4: + if action and self._case: + return self._state, 10.0, True, {} + else: + return self._state, -10, True, {} + else: + if action: + if self._state == 0: + self._state = 2 + else: + self._state += 1 + elif self._state == 2: + self._state = self._case + + return self._state, -1, False, False, {} + + +class OneHot(gym.Wrapper): + def __init__(self, env): + super(OneHot, self).__init__(env) + self.observation_space = gym.spaces.Box(0.0, 1.0, (env.observation_space.n,)) + + def reset(self, *, seed=None, options=None): + obs, info = self.env.reset(seed=seed, options=options) + return self._encode_obs(obs), info + + def step(self, action): + obs, reward, terminated, truncated, info = self.env.step(action) + return self._encode_obs(obs), reward, terminated, truncated, info + + def _encode_obs(self, obs): + new_obs = np.ones(self.env.observation_space.n) + new_obs[obs] = 1.0 + return new_obs diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/memory_leaking_env.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/memory_leaking_env.py new file mode 100644 index 0000000000000000000000000000000000000000..c97edb296691f14a9566c7e6645d47049410cd9f --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/memory_leaking_env.py @@ -0,0 +1,35 @@ +import logging +import uuid + +from ray.rllib.examples.envs.classes.random_env import RandomEnv +from ray.rllib.utils.annotations import override + +logger = logging.getLogger(__name__) + + +class MemoryLeakingEnv(RandomEnv): + """An env that leaks very little memory. 
class MockEnv(gym.Env):
    """Mock environment for testing purposes.

    Always observes 0 and yields a reward of 1.0; actions are ignored.
    The episode length is configurable via the constructor.
    """

    def __init__(self, episode_length, config=None):
        self.episode_length = episode_length
        self.config = config
        # Step counter within the current episode.
        self.i = 0
        self.observation_space = gym.spaces.Discrete(1)
        self.action_space = gym.spaces.Discrete(2)

    def reset(self, *, seed=None, options=None):
        self.i = 0
        # Constant observation, empty infos.
        return 0, {}

    def step(self, action):
        self.i += 1
        # Flag both terminated and truncated once the configured length is hit.
        episode_over = self.i >= self.episode_length
        return 0, 1.0, episode_over, episode_over, {}
+ """ + + metadata = { + "render.modes": ["rgb_array"], + } + render_mode: Optional[str] = "rgb_array" + + def __init__(self, episode_length): + self.episode_length = episode_length + self.i = 0 + self.observation_space = gym.spaces.Discrete(self.episode_length + 1) + self.action_space = gym.spaces.Discrete(2) + self.rng_seed = None + + def reset(self, *, seed=None, options=None): + self.i = 0 + if seed is not None: + self.rng_seed = seed + return self.i, {} + + def step(self, action): + self.i += 1 + terminated = truncated = self.i >= self.episode_length + return self.i, 100.0, terminated, truncated, {} + + def render(self): + # Just generate a random image here for demonstration purposes. + # Also see `gym/envs/classic_control/cartpole.py` for + # an example on how to use a Viewer object. + return np.random.randint(0, 256, size=(300, 400, 3), dtype=np.uint8) + + +class MockEnv3(gym.Env): + """Mock environment for testing purposes. + + Observation=ts (discrete space!), reward=100.0, episode-len is + configurable. Actions are ignored. + """ + + def __init__(self, episode_length): + self.episode_length = episode_length + self.i = 0 + self.observation_space = gym.spaces.Discrete(100) + self.action_space = gym.spaces.Discrete(2) + + def reset(self, *, seed=None, options=None): + self.i = 0 + return self.i, {"timestep": 0} + + def step(self, action): + self.i += 1 + terminated = truncated = self.i >= self.episode_length + return self.i, self.i, terminated, truncated, {"timestep": self.i} + + +class VectorizedMockEnv(VectorEnv): + """Vectorized version of the MockEnv. + + Contains `num_envs` MockEnv instances, each one having its own + `episode_length` horizon. 
+ """ + + def __init__(self, episode_length, num_envs): + super().__init__( + observation_space=gym.spaces.Discrete(1), + action_space=gym.spaces.Discrete(2), + num_envs=num_envs, + ) + self.envs = [MockEnv(episode_length) for _ in range(num_envs)] + + @override(VectorEnv) + def vector_reset(self, *, seeds=None, options=None): + seeds = seeds or [None] * self.num_envs + options = options or [None] * self.num_envs + obs_and_infos = [ + e.reset(seed=seeds[i], options=options[i]) for i, e in enumerate(self.envs) + ] + return [oi[0] for oi in obs_and_infos], [oi[1] for oi in obs_and_infos] + + @override(VectorEnv) + def reset_at(self, index, *, seed=None, options=None): + return self.envs[index].reset(seed=seed, options=options) + + @override(VectorEnv) + def vector_step(self, actions): + obs_batch, rew_batch, terminated_batch, truncated_batch, info_batch = ( + [], + [], + [], + [], + [], + ) + for i in range(len(self.envs)): + obs, rew, terminated, truncated, info = self.envs[i].step(actions[i]) + obs_batch.append(obs) + rew_batch.append(rew) + terminated_batch.append(terminated) + truncated_batch.append(truncated) + info_batch.append(info) + return obs_batch, rew_batch, terminated_batch, truncated_batch, info_batch + + @override(VectorEnv) + def get_sub_environments(self): + return self.envs + + +class MockVectorEnv(VectorEnv): + """A custom vector env that uses a single(!) CartPole sub-env. + + However, this env pretends to be a vectorized one to illustrate how one + could create custom VectorEnvs w/o the need for actual vectorizations of + sub-envs under the hood. 
+ """ + + def __init__(self, episode_length, mocked_num_envs): + self.env = gym.make("CartPole-v1") + super().__init__( + observation_space=self.env.observation_space, + action_space=self.env.action_space, + num_envs=mocked_num_envs, + ) + self.episode_len = episode_length + self.ts = 0 + + @override(VectorEnv) + def vector_reset(self, *, seeds=None, options=None): + # Since we only have one underlying sub-environment, just use the first seed + # and the first options dict (the user of this env thinks, there are + # `self.num_envs` sub-environments and sends that many seeds/options). + seeds = seeds or [None] + options = options or [None] + obs, infos = self.env.reset(seed=seeds[0], options=options[0]) + # Simply repeat the single obs/infos to pretend we really have + # `self.num_envs` sub-environments. + return ( + [obs for _ in range(self.num_envs)], + [infos for _ in range(self.num_envs)], + ) + + @override(VectorEnv) + def reset_at(self, index, *, seed=None, options=None): + self.ts = 0 + return self.env.reset(seed=seed, options=options) + + @override(VectorEnv) + def vector_step(self, actions): + self.ts += 1 + # Apply all actions sequentially to the same env. + # Whether this would make a lot of sense is debatable. + obs_batch, rew_batch, terminated_batch, truncated_batch, info_batch = ( + [], + [], + [], + [], + [], + ) + for i in range(self.num_envs): + obs, rew, terminated, truncated, info = self.env.step(actions[i]) + # Artificially truncate once time step limit has been reached. + # Note: Also terminate/truncate, when underlying CartPole is + # terminated/truncated. 
# NOTE(review): the original chunk opens mid-definition of a VectorEnv
# subclass whose header lies outside this view; that fragment is not
# reproduced here. The three bandit envs below are reconstructed in full.
import copy
import random

import gymnasium as gym
from gymnasium.spaces import Box, Discrete
import numpy as np


class SimpleContextualBandit(gym.Env):
    """Simple env w/ 2 states and 3 actions (arms): 0, 1, and 2.

    Episodes last only for one timestep, possible observations are:
    [-1.0, 1.0] and [1.0, -1.0], where the first element is the "current
    context". The highest reward (+10.0) is received for selecting arm 0 for
    context=1.0 and arm 2 for context=-1.0. Action 1 always yields 0.0 reward.
    """

    def __init__(self, config=None):
        self.action_space = Discrete(3)
        self.observation_space = Box(low=-1.0, high=1.0, shape=(2,))
        self.cur_context = None

    def reset(self, *, seed=None, options=None):
        # Pick a random context; the second obs element is its negation.
        self.cur_context = random.choice([-1.0, 1.0])
        return np.array([self.cur_context, -self.cur_context]), {}

    def step(self, action):
        # Reward table per context; arm 1 is always neutral (0.0).
        rewards_for_context = {
            -1.0: [-10, 0, 10],
            1.0: [10, 0, -10],
        }
        reward = rewards_for_context[self.cur_context][action]
        # Single-step episodes: always terminated=True.
        return (
            np.array([-self.cur_context, self.cur_context]),
            reward,
            True,
            False,
            {"regret": 10 - reward},
        )


class LinearDiscreteEnv(gym.Env):
    """Samples data from linearly parameterized arms.

    The reward for context X and arm i is given by X^T * theta_i, for some
    latent set of parameters {theta_i : i = 1, ..., k}.
    The thetas are sampled uniformly at random, the contexts are Gaussian,
    and Gaussian noise is added to the rewards.
    """

    DEFAULT_CONFIG_LINEAR = {
        "feature_dim": 8,
        "num_actions": 4,
        "reward_noise_std": 0.01,
    }

    def __init__(self, config=None):
        self.config = copy.copy(self.DEFAULT_CONFIG_LINEAR)
        # `isinstance` instead of `type(...) is dict` (idiomatic; also
        # accepts dict subclasses such as RLlib's EnvContext).
        if isinstance(config, dict):
            self.config.update(config)

        self.feature_dim = self.config["feature_dim"]
        self.num_actions = self.config["num_actions"]
        self.sigma = self.config["reward_noise_std"]

        self.action_space = Discrete(self.num_actions)
        self.observation_space = Box(low=-10, high=10, shape=(self.feature_dim,))

        # Latent, unit-norm arm parameters.
        self.thetas = np.random.uniform(-1, 1, (self.num_actions, self.feature_dim))
        self.thetas /= np.linalg.norm(self.thetas, axis=1, keepdims=True)

        self._elapsed_steps = 0
        self._current_context = None

    def _sample_context(self):
        return np.random.normal(scale=1 / 3, size=(self.feature_dim,))

    def reset(self, *, seed=None, options=None):
        self._current_context = self._sample_context()
        return self._current_context, {}

    def step(self, action):
        # NOTE(review): `_elapsed_steps` is initialized to 0 in __init__, so
        # this guard can only fire if a caller explicitly sets it to None.
        assert (
            self._elapsed_steps is not None
        ), "Cannot call env.step() before calling reset()"
        assert action < self.num_actions, "Invalid action."

        action = int(action)
        # Fixed: the step counter was never advanced (WheelBanditEnv does).
        self._elapsed_steps += 1
        context = self._current_context
        rewards = self.thetas.dot(context)

        opt_action = rewards.argmax()

        # Regret is measured against the noise-free optimal arm.
        regret = rewards.max() - rewards[action]

        # Add Gaussian noise.
        rewards += np.random.normal(scale=self.sigma, size=rewards.shape)

        reward = rewards[action]
        self._current_context = self._sample_context()
        return (
            self._current_context,
            reward,
            True,
            False,
            {"regret": regret, "opt_action": opt_action},
        )

    def render(self, mode="human"):
        raise NotImplementedError


class WheelBanditEnv(gym.Env):
    """Wheel bandit environment for 2D contexts
    (see https://arxiv.org/abs/1802.09127).
    """

    DEFAULT_CONFIG_WHEEL = {
        "delta": 0.5,
        "mu_1": 1.2,
        "mu_2": 1,
        "mu_3": 50,
        "std": 0.01,
    }

    feature_dim = 2
    num_actions = 5

    def __init__(self, config=None):
        self.config = copy.copy(self.DEFAULT_CONFIG_WHEEL)
        # Same isinstance fix as LinearDiscreteEnv.
        if isinstance(config, dict):
            self.config.update(config)

        self.delta = self.config["delta"]
        self.mu_1 = self.config["mu_1"]
        self.mu_2 = self.config["mu_2"]
        self.mu_3 = self.config["mu_3"]
        self.std = self.config["std"]

        self.action_space = Discrete(self.num_actions)
        self.observation_space = Box(low=-1, high=1, shape=(self.feature_dim,))

        # Mean reward of arm 0 is mu_1, of arms 1-4 is mu_2 (unless boosted).
        self.means = [self.mu_1] + 4 * [self.mu_2]
        self._elapsed_steps = 0
        self._current_context = None

    def _sample_context(self):
        # Rejection-sample a point from the unit disk.
        while True:
            state = np.random.uniform(-1, 1, self.feature_dim)
            if np.linalg.norm(state) <= 1:
                return state

    def reset(self, *, seed=None, options=None):
        self._current_context = self._sample_context()
        return self._current_context, {}

    def step(self, action):
        assert (
            self._elapsed_steps is not None
        ), "Cannot call env.step() before calling reset()"

        action = int(action)
        self._elapsed_steps += 1
        rewards = [
            np.random.normal(self.means[j], self.std)
            for j in range(self.num_actions)
        ]
        context = self._current_context
        r_big = np.random.normal(self.mu_3, self.std)

        if np.linalg.norm(context) >= self.delta:
            # Outer ring: the optimal arm depends on the quadrant.
            if context[0] > 0:
                if context[1] > 0:
                    rewards[1] = r_big  # first quadrant
                    opt_action = 1
                else:
                    rewards[4] = r_big  # fourth quadrant
                    opt_action = 4
            else:
                if context[1] > 0:
                    rewards[2] = r_big  # second quadrant
                    opt_action = 2
                else:
                    rewards[3] = r_big  # third quadrant
                    opt_action = 3
        else:
            # Inner disk (norm < delta): action 0 is optimal.
            opt_action = 0

        reward = rewards[action]
        regret = rewards[opt_action] - reward

        self._current_context = self._sample_context()
        return (
            self._current_context,
            reward,
            True,
            False,
            {"regret": regret, "opt_action": opt_action},
        )

    def render(self, mode="human"):
        raise NotImplementedError
import gymnasium as gym

from ray.rllib.env.multi_agent_env import MultiAgentEnv


class GuessTheNumberGame(MultiAgentEnv):
    """Two-player number guessing game.

    Agent 0 picks a number between 0 and MAX-1 at reset time. Agent 1 then
    has to find it by asking up to N questions via actions from
    MultiDiscrete([3, MAX]): the first component is the question type
    (0="is lower", 1="is higher", 2="is equal") and the second is the number
    compared against. The env answers yes (1) or no (0) through agent 1's
    reward. Every step agent 1 wastes gives agent 0 a reward of 1. If agent 1
    guesses the number exactly, it receives 100 and agent 0 receives -100;
    if the step budget runs out first, agent 0 receives 100 instead. The
    optimal policy for agent 1 converges to a binary search strategy.
    """

    MAX_NUMBER = 3
    MAX_STEPS = 20

    def __init__(self, config=None):
        super().__init__()
        # Fixed: the default `config=None` used to crash on `.get()` below.
        config = config or {}
        self._agent_ids = {0, 1}

        self.max_number = config.get("max_number", self.MAX_NUMBER)
        self.max_steps = config.get("max_steps", self.MAX_STEPS)

        self._number = None
        self.observation_space = gym.spaces.Discrete(2)
        self.action_space = gym.spaces.MultiDiscrete([3, self.max_number])

    def reset(self, *, seed=None, options=None):
        self._step = 0
        self._number = None
        # Agent 0 has to pick a number, so the returned obs does not matter.
        return {0: 0}, {}

    def step(self, action_dict):
        # Phase 1: agent 0 picks the number.
        agent_0_action = action_dict.get(0)
        if agent_0_action is not None:
            # Ignore the question-type part of the action; only the number
            # matters here.
            self._number = agent_0_action[1]
            # Next obs tells agent 1 to start guessing; rewards/dones are
            # reported for agent 0, who just acted.
            return (
                {1: 0},
                {0: 0},
                {0: False, "__all__": False},
                {0: False, "__all__": False},
                {},
            )

        if self._number is None:
            raise ValueError(
                "No number is selected by agent 0. Have you restarted "
                "the environment?"
            )

        # Phase 2: agent 1 guesses.
        direction, number = action_dict.get(1)
        info = {}
        # Agent 0 never needs to act again; agent 1 keeps guessing.
        obs = {1: 0}
        guessed_correctly = False
        terminated = {1: False, "__all__": False}
        truncated = {1: False, "__all__": False}
        # Every step agent 1 does not guess correctly, agent 0 gets reward 1.
        if direction == 0:  # lower
            reward = {1: int(number > self._number), 0: 1}
        elif direction == 1:  # higher
            reward = {1: int(number < self._number), 0: 1}
        else:  # equal
            # `bool()` guards against a numpy bool leaking into the dicts.
            guessed_correctly = bool(number == self._number)
            reward = {1: guessed_correctly * 100, 0: guessed_correctly * -100}
            terminated = {1: guessed_correctly, "__all__": guessed_correctly}

        self._step += 1
        if self._step >= self.max_steps:
            # Step budget exhausted: truncate; agent 0 wins if the number was
            # never guessed.
            truncated["__all__"] = True
            if not guessed_correctly:
                reward[0] = 100
        return obs, reward, terminated, truncated, info
import copy
from typing import Any, Dict

import chess as ch
import numpy as np
from pettingzoo import AECEnv
from pettingzoo.classic.chess.chess import raw_env as chess_v5

from ray.rllib.env.multi_agent_env import MultiAgentEnv


class MultiAgentChess(MultiAgentEnv):
    """MultiAgentEnv wrapper around the PettingZoo chess AEC environment.

    Exposes an AEC (actor-environment-cycle) game from the PettingZoo project
    (https://github.com/Farama-Foundation/PettingZoo) via the MultiAgentEnv
    public API. Important limitations:
    1. All agents must share identical action- and observation spaces
       (SuperSuit's padding wrappers can help otherwise).
    2. Environments are assumed to be positive-sum games (agents cooperate to
       maximize reward); standard algorithms aren't expected to work well in
       highly competitive games.

    `reset()` and `step()` only return entries for the agent whose turn it is
    next. Optionally, `config["random_start"]` (default 4) random half-moves
    are played onto a fresh board at every reset.
    """

    def __init__(
        self,
        config: Dict[Any, Any] = None,
        env: AECEnv = None,
    ):
        super().__init__()
        self.env = chess_v5() if env is None else env
        self.env.reset()

        # `setdefault` replaces the original try/except-KeyError dance.
        self.config = {} if config is None else config
        self.config.setdefault("random_start", 4)

        # Get first observation/action space, assuming all agents share them.
        self.observation_space = self.env.observation_space(self.env.agents[0])
        self.action_space = self.env.action_space(self.env.agents[0])

        assert all(
            self.env.observation_space(agent) == self.observation_space
            for agent in self.env.agents
        ), (
            "Observation spaces for all agents must be identical. Perhaps "
            "SuperSuit's pad_observations wrapper can help (usage: "
            "`supersuit.aec_wrappers.pad_observations(env)`"
        )

        assert all(
            self.env.action_space(agent) == self.action_space
            for agent in self.env.agents
        ), (
            "Action spaces for all agents must be identical. Perhaps "
            "SuperSuit's pad_action_space wrapper can help (usage: "
            "`supersuit.aec_wrappers.pad_action_space(env)`"
        )
        self._agent_ids = set(self.env.agents)

    def random_start(self, random_moves):
        """Reset the board and play `random_moves` uniformly random moves."""
        self.env.board = ch.Board()
        for _ in range(random_moves):
            self.env.board.push(np.random.choice(list(self.env.board.legal_moves)))
        return self.env.board

    def observe(self):
        # Observation of the agent to act next, plus a deep-copied env state.
        return {
            self.env.agent_selection: self.env.observe(self.env.agent_selection),
            "state": self.get_state(),
        }

    def reset(self, *args, **kwargs):
        self.env.reset()
        if self.config["random_start"] > 0:
            self.random_start(self.config["random_start"])
        return (
            {self.env.agent_selection: self.env.observe(self.env.agent_selection)},
            {self.env.agent_selection: {}},
        )

    def step(self, action):
        # Accept either a {agent_id: action} dict or a bare action.
        try:
            self.env.step(action[self.env.agent_selection])
        except (KeyError, IndexError):
            self.env.step(action)
        except AssertionError as e:
            # PettingZoo raises AssertionError on illegal moves; re-raise with
            # the offending action attached, keeping the original traceback
            # (the original printed the action and raised a bare error).
            raise AssertionError(f"Illegal action: {action}") from e

        obs_d = {}
        rew_d = {}
        done_d = {}
        truncated_d = {}
        info_d = {}
        while self.env.agents:
            obs, rew, done, trunc, info = self.env.last()
            agent = self.env.agent_selection
            obs_d[agent] = obs
            rew_d[agent] = rew
            done_d[agent] = done
            truncated_d[agent] = trunc
            info_d[agent] = info
            if self.env.terminations[self.env.agent_selection]:
                # Current agent is terminated: advance past it and flag the
                # whole episode as done.
                self.env.step(None)
                done_d["__all__"] = True
                truncated_d["__all__"] = True
            else:
                done_d["__all__"] = False
                truncated_d["__all__"] = False
                break

        return obs_d, rew_d, done_d, truncated_d, info_d

    def close(self):
        self.env.close()

    def seed(self, seed=None):
        self.env.seed(seed)

    def render(self, mode="human"):
        return self.env.render(mode)

    @property
    def agent_selection(self):
        return self.env.agent_selection

    @property
    def get_sub_environments(self):
        return self.env.unwrapped

    def get_state(self):
        # Deep copy so the caller gets an independent snapshot.
        return copy.deepcopy(self.env)

    def set_state(self, state):
        self.env = copy.deepcopy(state)
        return self.env.observe(self.env.agent_selection)
import copy
from typing import Any, Dict

from pettingzoo import AECEnv
from pettingzoo.classic.connect_four_v3 import raw_env as connect_four_v3

from ray.rllib.env.multi_agent_env import MultiAgentEnv


class MultiAgentConnect4(MultiAgentEnv):
    """MultiAgentEnv wrapper around the PettingZoo Connect-Four AEC env.

    Exposes an AEC (actor-environment-cycle) game from the PettingZoo project
    (https://github.com/Farama-Foundation/PettingZoo) via the MultiAgentEnv
    public API. Important limitations:
    1. All agents must share identical action- and observation spaces
       (SuperSuit's padding wrappers can help otherwise).
    2. Environments are assumed to be positive-sum games (agents cooperate to
       maximize reward); standard algorithms aren't expected to work well in
       highly competitive games.

    `reset()` and `step()` only return entries for the agent whose turn it is
    next.
    """

    def __init__(
        self,
        config: Dict[Any, Any] = None,
        env: AECEnv = None,
    ):
        super().__init__()
        self.env = connect_four_v3() if env is None else env
        self.env.reset()

        # Kept for API parity with MultiAgentChess; currently unused here.
        self.config = config
        # Get first observation/action space, assuming all agents share them.
        self.observation_space = self.env.observation_space(self.env.agents[0])
        self.action_space = self.env.action_space(self.env.agents[0])

        assert all(
            self.env.observation_space(agent) == self.observation_space
            for agent in self.env.agents
        ), (
            "Observation spaces for all agents must be identical. Perhaps "
            "SuperSuit's pad_observations wrapper can help (usage: "
            "`supersuit.aec_wrappers.pad_observations(env)`"
        )

        assert all(
            self.env.action_space(agent) == self.action_space
            for agent in self.env.agents
        ), (
            "Action spaces for all agents must be identical. Perhaps "
            "SuperSuit's pad_action_space wrapper can help (usage: "
            "`supersuit.aec_wrappers.pad_action_space(env)`"
        )
        self._agent_ids = set(self.env.agents)

    def observe(self):
        # Observation of the agent to act next, plus a deep-copied env state.
        return {
            self.env.agent_selection: self.env.observe(self.env.agent_selection),
            "state": self.get_state(),
        }

    def reset(self, *args, **kwargs):
        self.env.reset()
        return (
            {self.env.agent_selection: self.env.observe(self.env.agent_selection)},
            {self.env.agent_selection: {}},
        )

    def step(self, action):
        # Accept either a {agent_id: action} dict or a bare action.
        try:
            self.env.step(action[self.env.agent_selection])
        except (KeyError, IndexError):
            self.env.step(action)
        except AssertionError as e:
            # PettingZoo raises AssertionError on illegal moves; re-raise with
            # the offending action attached, keeping the original traceback.
            raise AssertionError(f"Illegal action: {action}") from e

        obs_d = {}
        rew_d = {}
        done_d = {}
        trunc_d = {}
        info_d = {}
        while self.env.agents:
            obs, rew, done, trunc, info = self.env.last()
            agent = self.env.agent_selection
            obs_d[agent] = obs
            rew_d[agent] = rew
            done_d[agent] = done
            trunc_d[agent] = trunc
            info_d[agent] = info
            if self.env.terminations[self.env.agent_selection]:
                # Current agent is terminated: advance past it and flag the
                # whole episode as done.
                self.env.step(None)
                done_d["__all__"] = True
                trunc_d["__all__"] = True
            else:
                done_d["__all__"] = False
                trunc_d["__all__"] = False
                break

        return obs_d, rew_d, done_d, trunc_d, info_d

    def close(self):
        self.env.close()

    def seed(self, seed=None):
        self.env.seed(seed)

    def render(self, mode="human"):
        return self.env.render(mode)

    @property
    def agent_selection(self):
        return self.env.agent_selection

    @property
    def get_sub_environments(self):
        return self.env.unwrapped

    def get_state(self):
        # Deep copy so the caller gets an independent snapshot.
        return copy.deepcopy(self.env)

    def set_state(self, state):
        self.env = copy.deepcopy(state)
        return self.env.observe(self.env.agent_selection)
# __sphinx_doc_1_begin__
import gymnasium as gym

from ray.rllib.env.multi_agent_env import MultiAgentEnv


class RockPaperScissors(MultiAgentEnv):
    """Two-player environment for the famous rock paper scissors game.

    # __sphinx_doc_1_end__
    Optionally, the "Sheldon Cooper extension" can be activated by passing
    `sheldon_cooper_mode=True` into the constructor, in which case two more moves
    are allowed: Spock and Lizard. Spock is poisoned by Lizard, disproven by Paper, but
    crushes Rock and smashes Scissors. Lizard poisons Spock and eats Paper, but is
    decapitated by Scissors and crushed by Rock.

    # __sphinx_doc_2_begin__
    Both players always move simultaneously over a course of 10 timesteps in total.
    The winner of each timestep receives reward of +1, the losing player -1.0.

    The observation of each player is the last opponent action.
    """

    ROCK = 0
    PAPER = 1
    SCISSORS = 2
    LIZARD = 3
    SPOCK = 4

    # (my_move, opponent_move) -> (my_reward, opponent_reward).
    WIN_MATRIX = {
        (ROCK, ROCK): (0, 0),
        (ROCK, PAPER): (-1, 1),
        (ROCK, SCISSORS): (1, -1),
        (PAPER, ROCK): (1, -1),
        (PAPER, PAPER): (0, 0),
        (PAPER, SCISSORS): (-1, 1),
        (SCISSORS, ROCK): (-1, 1),
        (SCISSORS, PAPER): (1, -1),
        (SCISSORS, SCISSORS): (0, 0),
    }
    # __sphinx_doc_2_end__

    WIN_MATRIX.update(
        {
            # Sheldon Cooper mode:
            (LIZARD, LIZARD): (0, 0),
            (LIZARD, SPOCK): (1, -1),  # Lizard poisons Spock
            (LIZARD, ROCK): (-1, 1),  # Rock crushes lizard
            (LIZARD, PAPER): (1, -1),  # Lizard eats paper
            (LIZARD, SCISSORS): (-1, 1),  # Scissors decapitate lizard
            (ROCK, LIZARD): (1, -1),  # Rock crushes lizard
            (PAPER, LIZARD): (-1, 1),  # Lizard eats paper
            (SCISSORS, LIZARD): (1, -1),  # Scissors decapitate lizard
            (SPOCK, SPOCK): (0, 0),
            (SPOCK, LIZARD): (-1, 1),  # Lizard poisons Spock
            (SPOCK, ROCK): (1, -1),  # Spock vaporizes rock
            (SPOCK, PAPER): (-1, 1),  # Paper disproves Spock
            (SPOCK, SCISSORS): (1, -1),  # Spock smashes scissors
            (ROCK, SPOCK): (-1, 1),  # Spock vaporizes rock
            (PAPER, SPOCK): (1, -1),  # Paper disproves Spock
            (SCISSORS, SPOCK): (-1, 1),  # Spock smashes scissors
        }
    )

    # __sphinx_doc_3_begin__
    def __init__(self, config=None):
        super().__init__()
        # Fixed: the default `config=None` used to crash on `.get()` below.
        config = config or {}

        self.agents = self.possible_agents = ["player1", "player2"]

        # The observations are always the last taken actions. Hence observation- and
        # action spaces are identical.
        self.observation_spaces = self.action_spaces = {
            "player1": gym.spaces.Discrete(3),
            "player2": gym.spaces.Discrete(3),
        }
        self.last_move = None
        self.num_moves = 0
        # __sphinx_doc_3_end__

        self.sheldon_cooper_mode = False
        if config.get("sheldon_cooper_mode"):
            self.sheldon_cooper_mode = True
            # Five moves (incl. Lizard and Spock) instead of three.
            self.action_spaces = self.observation_spaces = {
                "player1": gym.spaces.Discrete(5),
                "player2": gym.spaces.Discrete(5),
            }

    # __sphinx_doc_4_begin__
    def reset(self, *, seed=None, options=None):
        self.num_moves = 0

        # The first observation should not matter (none of the agents has moved yet).
        # Set them to 0.
        return {
            "player1": 0,
            "player2": 0,
        }, {}  # <- empty infos dict

    # __sphinx_doc_4_end__

    # __sphinx_doc_5_begin__
    def step(self, action_dict):
        self.num_moves += 1

        move1 = action_dict["player1"]
        move2 = action_dict["player2"]

        # Set the next observations (simply use the other player's action).
        # Note that because we are publishing both players in the observations dict,
        # we expect both players to act in the next `step()` (simultaneous stepping).
        observations = {"player1": move2, "player2": move1}

        # Compute rewards for each player based on the win-matrix.
        r1, r2 = self.WIN_MATRIX[move1, move2]
        rewards = {"player1": r1, "player2": r2}

        # Terminate the entire episode (for all agents) once 10 moves have been made.
        terminateds = {"__all__": self.num_moves >= 10}

        # Leave truncateds and infos empty.
        return observations, rewards, terminateds, {}, {}


# __sphinx_doc_5_end__
# __sphinx_doc_1_begin__
import gymnasium as gym
import numpy as np

from ray.rllib.env.multi_agent_env import MultiAgentEnv


class TicTacToe(MultiAgentEnv):
    """A two-player game in which any player tries to complete one row in a 3x3 field.

    The observation space is Box(-1.0, 1.0, (9,)), where each index represents
    a distinct field on a 3x3 board and values of 0.0 mean the field is empty,
    -1.0 means the opponent owns the field, and 1.0 means we occupy the field:
    ----------
    | 0| 1| 2|
    ----------
    | 3| 4| 5|
    ----------
    | 6| 7| 8|
    ----------

    The action space is Discrete(9). Placing a piece on an already occupied
    field leaves the board unchanged and costs the acting player -5.0 reward.

    Once a player completes a row, they receive +5.0 reward and the losing
    player receives -5.0. In all other cases, both players receive 0.0 reward.
    """

    # __sphinx_doc_1_end__

    # __sphinx_doc_2_begin__
    def __init__(self, config=None):
        super().__init__()

        # Define the agents in the game.
        self.agents = self.possible_agents = ["player1", "player2"]

        # Each agent observes a 9D tensor, representing the 3x3 fields of the board.
        # A 0 means an empty field, a 1 represents a piece of player 1, a -1 a piece of
        # player 2.
        self.observation_spaces = {
            "player1": gym.spaces.Box(-1.0, 1.0, (9,), np.float32),
            "player2": gym.spaces.Box(-1.0, 1.0, (9,), np.float32),
        }
        # Each player has 9 actions, encoding the 9 fields each player can place a piece
        # on during their turn.
        self.action_spaces = {
            "player1": gym.spaces.Discrete(9),
            "player2": gym.spaces.Discrete(9),
        }

        self.board = None
        self.current_player = None

    # __sphinx_doc_2_end__

    # __sphinx_doc_3_begin__
    def reset(self, *, seed=None, options=None):
        self.board = [
            0,
            0,
            0,
            0,
            0,
            0,
            0,
            0,
            0,
        ]
        # Pick a random player to start the game.
        self.current_player = np.random.choice(["player1", "player2"])
        # Return observations dict (only with the starting player, which is the one
        # we expect to act next).
        return {
            self.current_player: np.array(self.board, np.float32),
        }, {}

    # __sphinx_doc_3_end__

    # __sphinx_doc_4_begin__
    def step(self, action_dict):
        action = action_dict[self.current_player]

        # Create a rewards-dict (containing the rewards of the agent that just acted).
        rewards = {self.current_player: 0.0}
        # Create a terminateds-dict with the special `__all__` agent ID, indicating that
        # if True, the episode ends for all agents.
        terminateds = {"__all__": False}

        opponent = "player1" if self.current_player == "player2" else "player2"

        # Penalize trying to place a piece on an already occupied field.
        if self.board[action] != 0:
            rewards[self.current_player] -= 5.0
        # Change the board according to the (valid) action taken.
        else:
            self.board[action] = 1 if self.current_player == "player1" else -1

            # After having placed a new piece, figure out whether the current player
            # won or not.
            if self.current_player == "player1":
                win_val = [1, 1, 1]
            else:
                win_val = [-1, -1, -1]
            if (
                # Horizontal win.
                self.board[:3] == win_val
                or self.board[3:6] == win_val
                or self.board[6:] == win_val
                # Vertical win.
                or self.board[0:7:3] == win_val
                or self.board[1:8:3] == win_val
                or self.board[2:9:3] == win_val
                # Diagonal win.
                or self.board[::3] == win_val
                or self.board[2:7:2] == win_val
            ):
                # Final reward is +5 for victory and -5 for a loss.
                rewards[self.current_player] += 5.0
                rewards[opponent] = -5.0

                # Episode is done and needs to be reset for a new game.
                terminateds["__all__"] = True

            # The board might also be full w/o any player having won/lost.
            # In this case, we simply end the episode and none of the players receives
            # +5 or -5 reward.
            elif 0 not in self.board:
                terminateds["__all__"] = True

        # Flip players and return an observations dict with only the next player to
        # make a move in it.
        self.current_player = opponent

        return (
            {self.current_player: np.array(self.board, np.float32)},
            rewards,
            terminateds,
            {},
            {},
        )


# __sphinx_doc_4_end__
from gymnasium.spaces import Dict, Discrete, MultiDiscrete, Tuple
import numpy as np

from ray.rllib.env.multi_agent_env import MultiAgentEnv, ENV_STATE


class TwoStepGame(MultiAgentEnv):
    """Two-agent, three-state cooperative matrix game (QMIX paper).

    State 0: agent 1's action selects the payoff matrix (state 1 or 2).
    State 1: both agents receive a shared reward of 7.
    State 2: shared reward of 8 if both agents pick action 1, 0 if both pick
    action 0, else 1. The shared reward is split evenly between the agents.
    """

    action_space = Discrete(2)

    def __init__(self, env_config):
        super().__init__()
        self.action_space = Discrete(2)
        self.state = None
        self.agent_1 = 0
        self.agent_2 = 1
        # MADDPG emits action logits instead of actual discrete actions.
        self.actions_are_logits = env_config.get("actions_are_logits", False)
        self.one_hot_state_encoding = env_config.get("one_hot_state_encoding", False)
        self.with_state = env_config.get("separate_state_space", False)
        self._agent_ids = {0, 1}
        if not self.one_hot_state_encoding:
            self.observation_space = Discrete(6)
            self.with_state = False
        else:
            # Each agent gets the full state (one-hot encoding of which of the
            # three states are active) as input with the receiving agent's
            # ID (1 or 2) concatenated onto the end.
            if self.with_state:
                self.observation_space = Dict(
                    {
                        "obs": MultiDiscrete([2, 2, 2, 3]),
                        ENV_STATE: MultiDiscrete([2, 2, 2]),
                    }
                )
            else:
                self.observation_space = MultiDiscrete([2, 2, 2, 3])

    def reset(self, *, seed=None, options=None):
        if seed is not None:
            # NOTE: seeds the global numpy RNG (legacy behavior).
            np.random.seed(seed)
        self.state = np.array([1, 0, 0])
        return self._obs(), {}

    def step(self, action_dict):
        if self.actions_are_logits:
            # Sample discrete actions from the provided action distributions.
            action_dict = {
                k: np.random.choice([0, 1], p=v) for k, v in action_dict.items()
            }

        # Fixed: index to a scalar; comparing the size-1 ndarray returned by
        # `flatnonzero` against ints relied on deprecated truthiness behavior.
        state_index = np.flatnonzero(self.state)[0]
        if state_index == 0:
            action = action_dict[self.agent_1]
            assert action in [0, 1], action
            if action == 0:
                self.state = np.array([0, 1, 0])
            else:
                self.state = np.array([0, 0, 1])
            global_rew = 0
            terminated = False
        elif state_index == 1:
            global_rew = 7
            terminated = True
        else:
            if action_dict[self.agent_1] == 0 and action_dict[self.agent_2] == 0:
                global_rew = 0
            elif action_dict[self.agent_1] == 1 and action_dict[self.agent_2] == 1:
                global_rew = 8
            else:
                global_rew = 1
            terminated = True

        # Split the shared reward evenly between the two agents.
        rewards = {self.agent_1: global_rew / 2.0, self.agent_2: global_rew / 2.0}
        obs = self._obs()
        terminateds = {"__all__": terminated}
        truncateds = {"__all__": False}
        infos = {
            self.agent_1: {"done": terminateds["__all__"]},
            self.agent_2: {"done": terminateds["__all__"]},
        }
        return obs, rewards, terminateds, truncateds, infos

    def _obs(self):
        if self.with_state:
            return {
                self.agent_1: {"obs": self.agent_1_obs(), ENV_STATE: self.state},
                self.agent_2: {"obs": self.agent_2_obs(), ENV_STATE: self.state},
            }
        else:
            return {self.agent_1: self.agent_1_obs(), self.agent_2: self.agent_2_obs()}

    def agent_1_obs(self):
        if self.one_hot_state_encoding:
            return np.concatenate([self.state, [1]])
        else:
            return np.flatnonzero(self.state)[0]

    def agent_2_obs(self):
        if self.one_hot_state_encoding:
            return np.concatenate([self.state, [2]])
        else:
            # Agent 2's discrete obs is offset by 3 from agent 1's.
            return np.flatnonzero(self.state)[0] + 3


class TwoStepGameWithGroupedAgents(MultiAgentEnv):
    """TwoStepGame with both agents grouped into a single 'agents' unit."""

    def __init__(self, env_config):
        super().__init__()
        env = TwoStepGame(env_config)
        tuple_obs_space = Tuple([env.observation_space, env.observation_space])
        tuple_act_space = Tuple([env.action_space, env.action_space])
        self._agent_ids = {"agents"}
        # Group both agents under one ID with tuple obs/action spaces.
        self.env = env.with_agent_groups(
            groups={"agents": [0, 1]},
            obs_space=tuple_obs_space,
            act_space=tuple_act_space,
        )
        self.observation_space = Dict({"agents": self.env.observation_space})
        self.action_space = Dict({"agents": self.env.action_space})

    def reset(self, *, seed=None, options=None):
        return self.env.reset(seed=seed, options=options)

    def step(self, actions):
        return self.env.step(actions)
import gymnasium as gym
from gymnasium.spaces import Box, Dict, Discrete, Tuple
import numpy as np
import tree  # pip install dm_tree

from ray.rllib.utils.spaces.space_utils import flatten_space


class NestedSpaceRepeatAfterMeEnv(gym.Env):
    """Env in which the policy must repeat the (possibly complex) observation.

    Action- and observation space are always identical and may be arbitrarily
    nested Dict/Tuple spaces. Rewards are given for exactly matching Discrete
    sub-actions and for being as close as possible on Box sub-actions.
    """

    def __init__(self, config=None):
        cfg = config or {}
        self.observation_space = cfg.get(
            "space", Tuple([Discrete(2), Dict({"a": Box(-1.0, 1.0, (2,))})])
        )
        # The agent has to echo the observation, so both spaces coincide.
        self.action_space = self.observation_space
        self.flattened_action_space = flatten_space(self.action_space)
        self.episode_len = cfg.get("episode_len", 100)

    def reset(self, *, seed=None, options=None):
        self.steps = 0
        return self._next_obs(), {}

    def step(self, action):
        self.steps += 1
        flat_action = tree.flatten(action)
        reward = 0.0
        for act, target, subspace in zip(
            flat_action, self.current_obs_flattened, self.flattened_action_space
        ):
            # Box: negative absolute distance to the target observation.
            if isinstance(subspace, gym.spaces.Box):
                reward -= np.sum(np.abs(act - target))
            # Discrete: +1.0 only on an exact match.
            if isinstance(subspace, gym.spaces.Discrete):
                reward += 1.0 if act == target else 0.0
        truncated = self.steps >= self.episode_len
        return self._next_obs(), reward, False, truncated, {}

    def _next_obs(self):
        # Sample a fresh observation and cache its flattened form for the
        # reward computation in `step()`.
        self.current_obs = self.observation_space.sample()
        self.current_obs_flattened = tree.flatten(self.current_obs)
        return self.current_obs
+ + In this env there are only ever two valid actions, but we pretend there are + actually up to `max_avail_actions` actions that can be taken, and the two + valid actions are randomly hidden among this set. + + At each step, we emit a dict of: + - the actual cart observation + - a mask of valid actions (e.g., [0, 0, 1, 0, 0, 1] for 6 max avail) + - the list of action embeddings (w/ zeroes for invalid actions) (e.g., + [[0, 0], + [0, 0], + [-0.2322, -0.2569], + [0, 0], + [0, 0], + [0.7878, 1.2297]] for max_avail_actions=6) + + In a real environment, the actions embeddings would be larger than two + units of course, and also there would be a variable number of valid actions + per step instead of always [LEFT, RIGHT]. + """ + + def __init__(self, max_avail_actions): + # Use simple random 2-unit action embeddings for [LEFT, RIGHT] + self.left_action_embed = np.random.randn(2) + self.right_action_embed = np.random.randn(2) + self.action_space = Discrete(max_avail_actions) + self.wrapped = gym.make("CartPole-v1") + self.observation_space = Dict( + { + "action_mask": Box(0, 1, shape=(max_avail_actions,), dtype=np.int8), + "avail_actions": Box(-10, 10, shape=(max_avail_actions, 2)), + "cart": self.wrapped.observation_space, + } + ) + + def update_avail_actions(self): + self.action_assignments = np.array( + [[0.0, 0.0]] * self.action_space.n, dtype=np.float32 + ) + self.action_mask = np.array([0.0] * self.action_space.n, dtype=np.int8) + self.left_idx, self.right_idx = random.sample(range(self.action_space.n), 2) + self.action_assignments[self.left_idx] = self.left_action_embed + self.action_assignments[self.right_idx] = self.right_action_embed + self.action_mask[self.left_idx] = 1 + self.action_mask[self.right_idx] = 1 + + def reset(self, *, seed=None, options=None): + self.update_avail_actions() + obs, infos = self.wrapped.reset() + return { + "action_mask": self.action_mask, + "avail_actions": self.action_assignments, + "cart": obs, + }, infos + + def step(self, 
action): + if action == self.left_idx: + actual_action = 0 + elif action == self.right_idx: + actual_action = 1 + else: + raise ValueError( + "Chosen action was not one of the non-zero action embeddings", + action, + self.action_assignments, + self.action_mask, + self.left_idx, + self.right_idx, + ) + orig_obs, rew, done, truncated, info = self.wrapped.step(actual_action) + self.update_avail_actions() + self.action_mask = self.action_mask.astype(np.int8) + obs = { + "action_mask": self.action_mask, + "avail_actions": self.action_assignments, + "cart": orig_obs, + } + return obs, rew, done, truncated, info + + +class ParametricActionsCartPoleNoEmbeddings(gym.Env): + """Same as the above ParametricActionsCartPole. + + However, action embeddings are not published inside observations, + but will be learnt by the model. + + At each step, we emit a dict of: + - the actual cart observation + - a mask of valid actions (e.g., [0, 0, 1, 0, 0, 1] for 6 max avail) + - action embeddings (w/ "dummy embedding" for invalid actions) are + outsourced in the model and will be learned. + """ + + def __init__(self, max_avail_actions): + # Randomly set which two actions are valid and available. 
+ self.left_idx, self.right_idx = random.sample(range(max_avail_actions), 2) + self.valid_avail_actions_mask = np.array( + [0.0] * max_avail_actions, dtype=np.int8 + ) + self.valid_avail_actions_mask[self.left_idx] = 1 + self.valid_avail_actions_mask[self.right_idx] = 1 + self.action_space = Discrete(max_avail_actions) + self.wrapped = gym.make("CartPole-v1") + self.observation_space = Dict( + { + "valid_avail_actions_mask": Box(0, 1, shape=(max_avail_actions,)), + "cart": self.wrapped.observation_space, + } + ) + + def reset(self, *, seed=None, options=None): + obs, infos = self.wrapped.reset() + return { + "valid_avail_actions_mask": self.valid_avail_actions_mask, + "cart": obs, + }, infos + + def step(self, action): + if action == self.left_idx: + actual_action = 0 + elif action == self.right_idx: + actual_action = 1 + else: + raise ValueError( + "Chosen action was not one of the non-zero action embeddings", + action, + self.valid_avail_actions_mask, + self.left_idx, + self.right_idx, + ) + orig_obs, rew, done, truncated, info = self.wrapped.step(actual_action) + obs = { + "valid_avail_actions_mask": self.valid_avail_actions_mask, + "cart": orig_obs, + } + return obs, rew, done, truncated, info diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/random_env.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/random_env.py new file mode 100644 index 0000000000000000000000000000000000000000..5f413a597c9abc17180aa5900bd49f5077f45d93 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/random_env.py @@ -0,0 +1,125 @@ +import copy +import gymnasium as gym +from gymnasium.spaces import Discrete, Tuple +import numpy as np + +from ray.rllib.examples.envs.classes.multi_agent import make_multi_agent + + +class RandomEnv(gym.Env): + """A randomly acting environment. + + Can be instantiated with arbitrary action-, observation-, and reward + spaces. 
Observations and rewards are generated by simply sampling from the + observation/reward spaces. The probability of a `terminated=True` after each + action can be configured, as well as the max episode length. + """ + + def __init__(self, config=None): + config = config or {} + + # Action space. + self.action_space = config.get("action_space", Discrete(2)) + # Observation space from which to sample. + self.observation_space = config.get("observation_space", Discrete(2)) + # Reward space from which to sample. + self.reward_space = config.get( + "reward_space", + gym.spaces.Box(low=-1.0, high=1.0, shape=(), dtype=np.float32), + ) + self.static_samples = config.get("static_samples", False) + if self.static_samples: + self.observation_sample = self.observation_space.sample() + self.reward_sample = self.reward_space.sample() + + # Chance that an episode ends at any step. + # Note that a max episode length can be specified via + # `max_episode_len`. + self.p_terminated = config.get("p_terminated") + if self.p_terminated is None: + self.p_terminated = config.get("p_done", 0.1) + # A max episode length. Even if the `p_terminated` sampling does not lead + # to a terminus, the episode will end after at most this many + # timesteps. + # Set to 0 or None for using no limit on the episode length. + self.max_episode_len = config.get("max_episode_len", None) + # Whether to check action bounds. + self.check_action_bounds = config.get("check_action_bounds", False) + # Steps taken so far (after last reset). 
+ self.steps = 0 + + def reset(self, *, seed=None, options=None): + self.steps = 0 + if not self.static_samples: + return self.observation_space.sample(), {} + else: + return copy.deepcopy(self.observation_sample), {} + + def step(self, action): + if self.check_action_bounds and not self.action_space.contains(action): + raise ValueError( + "Illegal action for {}: {}".format(self.action_space, action) + ) + if isinstance(self.action_space, Tuple) and len(action) != len( + self.action_space.spaces + ): + raise ValueError( + "Illegal action for {}: {}".format(self.action_space, action) + ) + + self.steps += 1 + terminated = False + truncated = False + # We are `truncated` as per our max-episode-len. + if self.max_episode_len and self.steps >= self.max_episode_len: + truncated = True + # Max episode length not reached yet -> Sample `terminated` via `p_terminated`. + elif self.p_terminated > 0.0: + terminated = bool( + np.random.choice( + [True, False], p=[self.p_terminated, 1.0 - self.p_terminated] + ) + ) + + if not self.static_samples: + return ( + self.observation_space.sample(), + self.reward_space.sample(), + terminated, + truncated, + {}, + ) + else: + return ( + copy.deepcopy(self.observation_sample), + copy.deepcopy(self.reward_sample), + terminated, + truncated, + {}, + ) + + +# Multi-agent version of the RandomEnv. +RandomMultiAgentEnv = make_multi_agent(lambda c: RandomEnv(c)) + + +# Large observation space "pre-compiled" random env (for testing). +class RandomLargeObsSpaceEnv(RandomEnv): + def __init__(self, config=None): + config = config or {} + config.update({"observation_space": gym.spaces.Box(-1.0, 1.0, (5000,))}) + super().__init__(config=config) + + +# Large observation space + cont. actions "pre-compiled" random env +# (for testing). 
+class RandomLargeObsSpaceEnvContActions(RandomEnv): + def __init__(self, config=None): + config = config or {} + config.update( + { + "observation_space": gym.spaces.Box(-1.0, 1.0, (5000,)), + "action_space": gym.spaces.Box(-1.0, 1.0, (5,)), + } + ) + super().__init__(config=config) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/recommender_system_envs_with_recsim.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/recommender_system_envs_with_recsim.py new file mode 100644 index 0000000000000000000000000000000000000000..f2f7a28e4b39ca6587dffdd89a065ce9226187ea --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/recommender_system_envs_with_recsim.py @@ -0,0 +1,108 @@ +"""Examples for RecSim envs ready to be used by RLlib Algorithms. + +RecSim is a configurable recommender systems simulation platform. +Source: https://github.com/google-research/recsim +""" + +from recsim import choice_model +from recsim.environments import ( + long_term_satisfaction as lts, + interest_evolution as iev, + interest_exploration as iex, +) + +from ray.rllib.env.wrappers.recsim import make_recsim_env +from ray.tune import register_env + +# Some built-in RecSim envs to test with. +# --------------------------------------- + +# Long-term satisfaction env: User has to pick from items that are either +# a) unhealthy, but taste good, or b) healthy, but have bad taste. +# Best strategy is to pick a mix of both to ensure long-term +# engagement. 
+ + +def lts_user_model_creator(env_ctx): + return lts.LTSUserModel( + env_ctx["slate_size"], + user_state_ctor=lts.LTSUserState, + response_model_ctor=lts.LTSResponse, + ) + + +def lts_document_sampler_creator(env_ctx): + return lts.LTSDocumentSampler() + + +LongTermSatisfactionRecSimEnv = make_recsim_env( + recsim_user_model_creator=lts_user_model_creator, + recsim_document_sampler_creator=lts_document_sampler_creator, + reward_aggregator=lts.clicked_engagement_reward, +) + + +# Interest exploration env: Models the problem of active exploration +# of user interests. It is meant to illustrate popularity bias in +# recommender systems, where myopic maximization of engagement leads +# to bias towards documents that have wider appeal, +# whereas niche user interests remain unexplored. +def iex_user_model_creator(env_ctx): + return iex.IEUserModel( + env_ctx["slate_size"], + user_state_ctor=iex.IEUserState, + response_model_ctor=iex.IEResponse, + seed=env_ctx["seed"], + ) + + +def iex_document_sampler_creator(env_ctx): + return iex.IETopicDocumentSampler(seed=env_ctx["seed"]) + + +InterestExplorationRecSimEnv = make_recsim_env( + recsim_user_model_creator=iex_user_model_creator, + recsim_document_sampler_creator=iex_document_sampler_creator, + reward_aggregator=iex.total_clicks_reward, +) + + +# Interest evolution env: See https://github.com/google-research/recsim +# for more information. +def iev_user_model_creator(env_ctx): + return iev.IEvUserModel( + env_ctx["slate_size"], + choice_model_ctor=choice_model.MultinomialProportionalChoiceModel, + response_model_ctor=iev.IEvResponse, + user_state_ctor=iev.IEvUserState, + seed=env_ctx["seed"], + ) + + +# Extend IEvVideo to fix a bug caused by None cluster_ids. +class SingleClusterIEvVideo(iev.IEvVideo): + def __init__(self, doc_id, features, video_length=None, quality=None): + super(SingleClusterIEvVideo, self).__init__( + doc_id=doc_id, + features=features, + cluster_id=0, # single cluster. 
+ video_length=video_length, + quality=quality, + ) + + +def iev_document_sampler_creator(env_ctx): + return iev.UtilityModelVideoSampler(doc_ctor=iev.IEvVideo, seed=env_ctx["seed"]) + + +InterestEvolutionRecSimEnv = make_recsim_env( + recsim_user_model_creator=iev_user_model_creator, + recsim_document_sampler_creator=iev_document_sampler_creator, + reward_aggregator=iev.clicked_watchtime_reward, +) + + +# Backward compatibility. +register_env( + name="RecSim-v1", env_creator=lambda env_ctx: InterestEvolutionRecSimEnv(env_ctx) +) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/repeat_after_me_env.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/repeat_after_me_env.py new file mode 100644 index 0000000000000000000000000000000000000000..0a87f60ac6c55c6783205e3b9e9ce7392b5f9396 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/repeat_after_me_env.py @@ -0,0 +1,47 @@ +import gymnasium as gym +from gymnasium.spaces import Box, Discrete +import numpy as np + + +class RepeatAfterMeEnv(gym.Env): + """Env in which the observation at timestep minus n must be repeated.""" + + def __init__(self, config=None): + config = config or {} + if config.get("continuous"): + self.observation_space = Box(-1.0, 1.0, (2,)) + else: + self.observation_space = Discrete(2) + + self.action_space = self.observation_space + # Note: Set `repeat_delay` to 0 for simply repeating the seen + # observation (no delay). + self.delay = config.get("repeat_delay", 1) + self.episode_len = config.get("episode_len", 100) + self.history = [] + + def reset(self, *, seed=None, options=None): + self.history = [0] * self.delay + return self._next_obs(), {} + + def step(self, action): + obs = self.history[-(1 + self.delay)] + + reward = 0.0 + # Box: -abs(diff). + if isinstance(self.action_space, Box): + reward = -np.sum(np.abs(action - obs)) + # Discrete: +1.0 if exact match, -1.0 otherwise. 
+ if isinstance(self.action_space, Discrete): + reward = 1.0 if action == obs else -1.0 + + done = truncated = len(self.history) > self.episode_len + return self._next_obs(), reward, done, truncated, {} + + def _next_obs(self): + if isinstance(self.observation_space, Box): + token = np.random.random(size=(2,)) + else: + token = np.random.choice([0, 1]) + self.history.append(token) + return token diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/repeat_initial_obs_env.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/repeat_initial_obs_env.py new file mode 100644 index 0000000000000000000000000000000000000000..d1d43c560424caaa8fac5b433f536a72d5f4adc5 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/repeat_initial_obs_env.py @@ -0,0 +1,32 @@ +import gymnasium as gym +from gymnasium.spaces import Discrete +import random + + +class RepeatInitialObsEnv(gym.Env): + """Env in which the initial observation has to be repeated all the time. + + Runs for n steps. + r=1 if action correct, -1 otherwise (max. R=100). 
+ """ + + def __init__(self, episode_len=100): + self.observation_space = Discrete(2) + self.action_space = Discrete(2) + self.token = None + self.episode_len = episode_len + self.num_steps = 0 + + def reset(self, *, seed=None, options=None): + self.token = random.choice([0, 1]) + self.num_steps = 0 + return self.token, {} + + def step(self, action): + if action == self.token: + reward = 1 + else: + reward = -1 + self.num_steps += 1 + done = truncated = self.num_steps >= self.episode_len + return 0, reward, done, truncated, {} diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/simple_corridor.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/simple_corridor.py new file mode 100644 index 0000000000000000000000000000000000000000..9088f73dbd374da7f7d1312e6ed68c1d5c25444e --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/simple_corridor.py @@ -0,0 +1,42 @@ +import gymnasium as gym +from gymnasium.spaces import Box, Discrete +import numpy as np + + +class SimpleCorridor(gym.Env): + """Example of a custom env in which you have to walk down a corridor. + + You can configure the length of the corridor via the env config.""" + + def __init__(self, config=None): + config = config or {} + + self.action_space = Discrete(2) + self.observation_space = Box(0.0, 999.0, shape=(1,), dtype=np.float32) + + self.set_corridor_length(config.get("corridor_length", 10)) + + self._cur_pos = 0 + + def set_corridor_length(self, length): + self.end_pos = length + print(f"Set corridor length to {self.end_pos}") + assert self.end_pos <= 999, "The maximum `corridor_length` allowed is 999!" 
+ + def reset(self, *, seed=None, options=None): + self._cur_pos = 0.0 + return self._get_obs(), {} + + def step(self, action): + assert action in [0, 1], action + if action == 0 and self._cur_pos > 0: + self._cur_pos -= 1.0 + elif action == 1: + self._cur_pos += 1.0 + terminated = self._cur_pos >= self.end_pos + truncated = False + reward = 1.0 if terminated else -0.01 + return self._get_obs(), reward, terminated, truncated, {} + + def _get_obs(self): + return np.array([self._cur_pos], np.float32) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/simple_rpg.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/simple_rpg.py new file mode 100644 index 0000000000000000000000000000000000000000..7de7390bd96dd72e00c3871885ecc600a359e0b5 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/simple_rpg.py @@ -0,0 +1,49 @@ +import gymnasium as gym +from gymnasium.spaces import Discrete, Box, Dict + +from ray.rllib.utils.spaces.repeated import Repeated + +# Constraints on the Repeated space. +MAX_PLAYERS = 4 +MAX_ITEMS = 7 +MAX_EFFECTS = 2 + + +class SimpleRPG(gym.Env): + """Example of a custom env with a complex, structured observation. + + The observation is a list of players, each of which is a Dict of + attributes, and may further hold a list of items (categorical space). + + Note that the env doesn't train, it's just a dummy example to show how to + use spaces.Repeated in a custom model (see CustomRPGModel below). + """ + + def __init__(self, config): + self.cur_pos = 0 + self.action_space = Discrete(4) + + # Represents an item. + self.item_space = Discrete(5) + + # Represents an effect on the player. + self.effect_space = Box(9000, 9999, shape=(4,)) + + # Represents a player. 
+ self.player_space = Dict( + { + "location": Box(-100, 100, shape=(2,)), + "status": Box(-1, 1, shape=(10,)), + "items": Repeated(self.item_space, max_len=MAX_ITEMS), + "effects": Repeated(self.effect_space, max_len=MAX_EFFECTS), + } + ) + + # Observation is a list of players. + self.observation_space = Repeated(self.player_space, max_len=MAX_PLAYERS) + + def reset(self, *, seed=None, options=None): + return self.observation_space.sample(), {} + + def step(self, action): + return self.observation_space.sample(), 1, True, False, {} diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/six_room_env.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/six_room_env.py new file mode 100644 index 0000000000000000000000000000000000000000..2a4b1a2a41d51084e44e6b6e267ddb309bc1c22d --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/six_room_env.py @@ -0,0 +1,315 @@ +import gymnasium as gym + +from ray.rllib.env.multi_agent_env import MultiAgentEnv + + +# Map representation: Always six rooms (as the name suggests) with doors in between. +MAPS = { + "small": [ + "WWWWWWWWWWWWW", + "W W W W", + "W W W", + "W W W", + "W WWWW WWWW W", + "W W W W", + "W W W", + "W W GW", + "WWWWWWWWWWWWW", + ], + "medium": [ + "WWWWWWWWWWWWWWWWWWW", + "W W W W", + "W W W", + "W W W", + "W WWWWWWW WWWWWWW W", + "W W W W", + "W W W", + "W W GW", + "WWWWWWWWWWWWWWWWWWW", + ], + "large": [ + "WWWWWWWWWWWWWWWWWWWWWWWWW", + "W W W W", + "W W W W", + "W W W", + "W W W", + "W W W W", + "WW WWWWWWWWW WWWWWWWWWW W", + "W W W W", + "W W W", + "W W W W", + "W W W", + "W W W GW", + "WWWWWWWWWWWWWWWWWWWWWWWWW", + ], +} + + +class SixRoomEnv(gym.Env): + """A grid-world with six rooms (arranged as 2x3), which are connected by doors. + + The agent starts in the upper left room and has to reach a designated goal state + in one of the rooms using primitive actions up, left, down, and right. 
+ + The agent receives a small penalty of -0.01 on each step and a reward of +10.0 when + reaching the goal state. + """ + + def __init__(self, config=None): + super().__init__() + + # User can provide a custom map or a recognized map name (small, medium, large). + self.map = config.get("custom_map", MAPS.get(config.get("map"), MAPS["small"])) + self.time_limit = config.get("time_limit", 50) + + # Define observation space: Discrete, index fields. + self.observation_space = gym.spaces.Discrete(len(self.map) * len(self.map[0])) + # Primitive actions: up, down, left, right. + self.action_space = gym.spaces.Discrete(4) + + # Initialize environment state. + self.reset() + + def reset(self, *, seed=None, options=None): + self._agent_pos = (1, 1) + self._ts = 0 + # Return high-level observation. + return self._agent_discrete_pos, {} + + def step(self, action): + next_pos = _get_next_pos(action, self._agent_pos) + + self._ts += 1 + + # Check if the move ends up in a wall. If so -> Ignore the move and stay + # where we are right now. + if self.map[next_pos[0]][next_pos[1]] != "W": + self._agent_pos = next_pos + + # Check if the agent has reached the global goal state. + if self.map[self._agent_pos[0]][self._agent_pos[1]] == "G": + return self._agent_discrete_pos, 10.0, True, False, {} + + # Small step penalty. + return self._agent_discrete_pos, -0.01, False, self._ts >= self.time_limit, {} + + @property + def _agent_discrete_pos(self): + x = self._agent_pos[0] + y = self._agent_pos[1] + # discrete position = row idx * columns + col idx + return x * len(self.map[0]) + y + + +class HierarchicalSixRoomEnv(MultiAgentEnv): + def __init__(self, config=None): + super().__init__() + + # User can provide a custom map or a recognized map name (small, medium, large). 
+ self.map = config.get("custom_map", MAPS.get(config.get("map"), MAPS["small"])) + self.max_steps_low_level = config.get("max_steps_low_level", 15) + self.time_limit = config.get("time_limit", 50) + self.num_low_level_agents = config.get("num_low_level_agents", 3) + + self.agents = self.possible_agents = ["high_level_agent"] + [ + f"low_level_agent_{i}" for i in range(self.num_low_level_agents) + ] + + # Define basic observation space: Discrete, index fields. + observation_space = gym.spaces.Discrete(len(self.map) * len(self.map[0])) + # Low level agents always see where they are right now and what the target + # state should be. + low_level_observation_space = gym.spaces.Tuple( + (observation_space, observation_space) + ) + # Primitive actions: up, down, left, right. + low_level_action_space = gym.spaces.Discrete(4) + + self.observation_spaces = {"high_level_agent": observation_space} + self.observation_spaces.update( + { + f"low_level_agent_{i}": low_level_observation_space + for i in range(self.num_low_level_agents) + } + ) + self.action_spaces = { + "high_level_agent": gym.spaces.Tuple( + ( + # The new target observation. + observation_space, + # Low-level policy that should get us to the new target observation. + gym.spaces.Discrete(self.num_low_level_agents), + ) + ) + } + self.action_spaces.update( + { + f"low_level_agent_{i}": low_level_action_space + for i in range(self.num_low_level_agents) + } + ) + + # Initialize environment state. + self.reset() + + def reset(self, *, seed=None, options=None): + self._agent_pos = (1, 1) + self._low_level_steps = 0 + self._high_level_action = None + # Number of times the low-level agent reached the given target (by the high + # level agent). + self._num_targets_reached = 0 + + self._ts = 0 + + # Return high-level observation. 
+ return { + "high_level_agent": self._agent_discrete_pos, + }, {} + + def step(self, action_dict): + self._ts += 1 + + terminateds = {"__all__": self._ts >= self.time_limit} + truncateds = {"__all__": False} + + # High-level agent acted: Set next goal and next low-level policy to use. + # Note that the agent does not move in this case and stays at its current + # location. + if "high_level_agent" in action_dict: + self._high_level_action = action_dict["high_level_agent"] + low_level_agent = f"low_level_agent_{self._high_level_action[1]}" + self._low_level_steps = 0 + # Return next low-level observation for the now-active agent. + # We want this agent to act next. + return ( + { + low_level_agent: ( + self._agent_discrete_pos, # current + self._high_level_action[0], # target + ) + }, + # Penalty for a target state that's close to the current state. + { + "high_level_agent": ( + self.eucl_dist( + self._agent_discrete_pos, + self._high_level_action[0], + self.map, + ) + / (len(self.map) ** 2 + len(self.map[0]) ** 2) ** 0.5 + ) + - 1.0, + }, + terminateds, + truncateds, + {}, + ) + # Low-level agent made a move (primitive action). + else: + assert len(action_dict) == 1 + + # Increment low-level step counter. + self._low_level_steps += 1 + + target_discrete_pos, low_level_agent = self._high_level_action + low_level_agent = f"low_level_agent_{low_level_agent}" + next_pos = _get_next_pos(action_dict[low_level_agent], self._agent_pos) + + # Check if the move ends up in a wall. If so -> Ignore the move and stay + # where we are right now. + if self.map[next_pos[0]][next_pos[1]] != "W": + self._agent_pos = next_pos + + # Check if the agent has reached the global goal state. + if self.map[self._agent_pos[0]][self._agent_pos[1]] == "G": + rewards = { + "high_level_agent": 10.0, + # +1.0 if the goal position was also the target position for the + # low level agent. 
+ low_level_agent: float( + self._agent_discrete_pos == target_discrete_pos + ), + } + terminateds["__all__"] = True + return ( + {"high_level_agent": self._agent_discrete_pos}, + rewards, + terminateds, + truncateds, + {}, + ) + + # Low-level agent has reached its target location (given by the high-level): + # - Hand back control to high-level agent. + # - Reward low level agent and high-level agent with small rewards. + elif self._agent_discrete_pos == target_discrete_pos: + self._num_targets_reached += 1 + rewards = { + "high_level_agent": 1.0, + low_level_agent: 1.0, + } + return ( + {"high_level_agent": self._agent_discrete_pos}, + rewards, + terminateds, + truncateds, + {}, + ) + + # Low-level agent has not reached anything. + else: + # Small step penalty for low-level agent. + rewards = {low_level_agent: -0.01} + # Reached time budget -> Hand back control to high level agent. + if self._low_level_steps >= self.max_steps_low_level: + rewards["high_level_agent"] = -0.01 + return ( + {"high_level_agent": self._agent_discrete_pos}, + rewards, + terminateds, + truncateds, + {}, + ) + else: + return ( + { + low_level_agent: ( + self._agent_discrete_pos, # current + target_discrete_pos, # target + ), + }, + rewards, + terminateds, + truncateds, + {}, + ) + + @property + def _agent_discrete_pos(self): + x = self._agent_pos[0] + y = self._agent_pos[1] + # discrete position = row idx * columns + col idx + return x * len(self.map[0]) + y + + @staticmethod + def eucl_dist(pos1, pos2, map): + x1, y1 = pos1 % len(map[0]), pos1 // len(map) + x2, y2 = pos2 % len(map[0]), pos2 // len(map) + return ((x1 - x2) ** 2 + (y1 - y2) ** 2) ** 0.5 + + +def _get_next_pos(action, pos): + x, y = pos + # Up. + if action == 0: + return x - 1, y + # Down. + elif action == 1: + return x + 1, y + # Left. + elif action == 2: + return x, y - 1 + # Right. 
+ else: + return x, y + 1 diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/stateless_cartpole.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/stateless_cartpole.py new file mode 100644 index 0000000000000000000000000000000000000000..cacc95bd7057120324e3c5460884db7028b6306c --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/stateless_cartpole.py @@ -0,0 +1,39 @@ +from gymnasium.spaces import Box +import numpy as np + +from gymnasium.envs.classic_control import CartPoleEnv + + +class StatelessCartPole(CartPoleEnv): + """Partially observable variant of the CartPole gym environment. + + https://github.com/openai/gym/blob/master/gym/envs/classic_control/ + cartpole.py + + We delete the x- and angular velocity components of the state, so that it + can only be solved by a memory enhanced model (policy). + """ + + def __init__(self, config=None): + super().__init__() + + # Fix our observation-space (remove 2 velocity components). 
+ high = np.array( + [ + self.x_threshold * 2, + self.theta_threshold_radians * 2, + ], + dtype=np.float32, + ) + + self.observation_space = Box(low=-high, high=high, dtype=np.float32) + + def step(self, action): + next_obs, reward, done, truncated, info = super().step(action) + # next_obs is [x-pos, x-veloc, angle, angle-veloc] + return np.array([next_obs[0], next_obs[2]]), reward, done, truncated, info + + def reset(self, *, seed=None, options=None): + init_obs, init_info = super().reset(seed=seed, options=options) + # init_obs is [x-pos, x-veloc, angle, angle-veloc] + return np.array([init_obs[0], init_obs[2]]), init_info diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/stateless_pendulum.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/stateless_pendulum.py new file mode 100644 index 0000000000000000000000000000000000000000..36c6018229a5b510baeb9ea11bf0f68b1a7946ac --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/stateless_pendulum.py @@ -0,0 +1,35 @@ +from gymnasium.spaces import Box +import numpy as np + +from gymnasium.envs.classic_control import PendulumEnv + + +class StatelessPendulum(PendulumEnv): + """Partially observable variant of the Pendulum gym environment. + + https://github.com/Farama-Foundation/Gymnasium/blob/main/gymnasium/envs/ + classic_control/pendulum.py + + We delete the angular velocity component of the state, so that it + can only be solved by a memory enhanced model (policy). + """ + + def __init__(self, config=None): + config = config or {} + g = config.get("g", 10.0) + + super().__init__(g=g) + + # Fix our observation-space (remove angular velocity component). 
+ high = np.array([1.0, 1.0], dtype=np.float32) + self.observation_space = Box(low=-high, high=high, dtype=np.float32) + + def step(self, action): + next_obs, reward, done, truncated, info = super().step(action) + # next_obs is [cos(theta), sin(theta), theta-dot (angular velocity)] + return next_obs[:-1], reward, done, truncated, info + + def reset(self, *, seed=None, options=None): + init_obs, init_info = super().reset(seed=seed, options=options) + # init_obs is [cos(theta), sin(theta), theta-dot (angular velocity)] + return init_obs[:-1], init_info diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/transformed_action_space_env.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/transformed_action_space_env.py new file mode 100644 index 0000000000000000000000000000000000000000..1dce1051cbf30861fb196e6c8fbc0cf1522c871a --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/transformed_action_space_env.py @@ -0,0 +1,61 @@ +import gymnasium as gym +from typing import Type + + +class ActionTransform(gym.ActionWrapper): + def __init__(self, env, low, high): + super().__init__(env) + self._low = low + self._high = high + self.action_space = type(env.action_space)( + self._low, self._high, env.action_space.shape, env.action_space.dtype + ) + + def action(self, action): + return (action - self._low) / (self._high - self._low) * ( + self.env.action_space.high - self.env.action_space.low + ) + self.env.action_space.low + + +def transform_action_space(env_name_or_creator) -> Type[gym.Env]: + """Wrapper for gym.Envs to have their action space transformed. + + Args: + env_name_or_creator (Union[str, Callable[]]: String specifier or + env_maker function. + + Returns: + New transformed_action_space_env function that returns an environment + wrapped by the ActionTransform wrapper. 
The constructor takes a + config dict with `low` and `high` keys specifying the new action + range (default -1.0 to 1.0). The rest of the config dict will be + passed on to the underlying/wrapped env's constructor. + + .. testcode:: + :skipif: True + + # By gym string: + pendulum_300_to_500_cls = transform_action_space("Pendulum-v1") + # Create a transformed pendulum env. + pendulum_300_to_500 = pendulum_300_to_500_cls({"low": -15.0}) + pendulum_300_to_500.action_space + + .. testoutput:: + + gym.spaces.Box(-15.0, 1.0, (1, ), "float32") + """ + + def transformed_action_space_env(config): + if isinstance(env_name_or_creator, str): + inner_env = gym.make(env_name_or_creator) + else: + inner_env = env_name_or_creator(config) + _low = config.pop("low", -1.0) + _high = config.pop("high", 1.0) + env = ActionTransform(inner_env, _low, _high) + return env + + return transformed_action_space_env + + +TransformedActionPendulum = transform_action_space("Pendulum-v1") diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/windy_maze_env.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/windy_maze_env.py new file mode 100644 index 0000000000000000000000000000000000000000..0a86fe4f9069a50ffb338b7ad3e7779a3260fb1f --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/classes/windy_maze_env.py @@ -0,0 +1,159 @@ +import gymnasium as gym +from gymnasium.spaces import Box, Discrete, Tuple +import logging +import random + +from ray.rllib.env import MultiAgentEnv + +logger = logging.getLogger(__name__) + +# Agent has to traverse the maze from the starting position S -> F +# Observation space [x_pos, y_pos, wind_direction] +# Action space: stay still OR move in current wind direction +MAP_DATA = """ +######### +#S # +####### # + # # + # # +####### # +#F # +#########""" + + +class WindyMazeEnv(gym.Env): + def __init__(self, env_config): + self.map = [m for m in MAP_DATA.split("\n") if m] + self.x_dim =
len(self.map) + self.y_dim = len(self.map[0]) + logger.info("Loaded map {} {}".format(self.x_dim, self.y_dim)) + for x in range(self.x_dim): + for y in range(self.y_dim): + if self.map[x][y] == "S": + self.start_pos = (x, y) + elif self.map[x][y] == "F": + self.end_pos = (x, y) + logger.info("Start pos {} end pos {}".format(self.start_pos, self.end_pos)) + self.observation_space = Tuple( + [ + Box(0, 100, shape=(2,)), # (x, y) + Discrete(4), # wind direction (N, E, S, W) + ] + ) + self.action_space = Discrete(2) # whether to move or not + + def reset(self, *, seed=None, options=None): + self.wind_direction = random.choice([0, 1, 2, 3]) + self.pos = self.start_pos + self.num_steps = 0 + return [[self.pos[0], self.pos[1]], self.wind_direction], {} + + def step(self, action): + if action == 1: + self.pos = self._get_new_pos(self.pos, self.wind_direction) + self.num_steps += 1 + self.wind_direction = random.choice([0, 1, 2, 3]) + at_goal = self.pos == self.end_pos + truncated = self.num_steps >= 200 + done = at_goal or truncated + return ( + [[self.pos[0], self.pos[1]], self.wind_direction], + 100 * int(at_goal), + done, + truncated, + {}, + ) + + def _get_new_pos(self, pos, direction): + if direction == 0: + new_pos = (pos[0] - 1, pos[1]) + elif direction == 1: + new_pos = (pos[0], pos[1] + 1) + elif direction == 2: + new_pos = (pos[0] + 1, pos[1]) + elif direction == 3: + new_pos = (pos[0], pos[1] - 1) + if ( + new_pos[0] >= 0 + and new_pos[0] < self.x_dim + and new_pos[1] >= 0 + and new_pos[1] < self.y_dim + and self.map[new_pos[0]][new_pos[1]] != "#" + ): + return new_pos + else: + return pos # did not move + + +class HierarchicalWindyMazeEnv(MultiAgentEnv): + def __init__(self, env_config): + super().__init__() + self.flat_env = WindyMazeEnv(env_config) + + def reset(self, *, seed=None, options=None): + self.cur_obs, infos = self.flat_env.reset() + self.current_goal = None + self.steps_remaining_at_level = None + self.num_high_level_steps = 0 + # current low level 
agent id. This must be unique for each high level + # step since agent ids cannot be reused. + self.low_level_agent_id = "low_level_{}".format(self.num_high_level_steps) + return { + "high_level_agent": self.cur_obs, + }, {"high_level_agent": infos} + + def step(self, action_dict): + assert len(action_dict) == 1, action_dict + if "high_level_agent" in action_dict: + return self._high_level_step(action_dict["high_level_agent"]) + else: + return self._low_level_step(list(action_dict.values())[0]) + + def _high_level_step(self, action): + logger.debug("High level agent sets goal") + self.current_goal = action + self.steps_remaining_at_level = 25 + self.num_high_level_steps += 1 + self.low_level_agent_id = "low_level_{}".format(self.num_high_level_steps) + obs = {self.low_level_agent_id: [self.cur_obs, self.current_goal]} + rew = {self.low_level_agent_id: 0} + done = truncated = {"__all__": False} + return obs, rew, done, truncated, {} + + def _low_level_step(self, action): + logger.debug("Low level agent step {}".format(action)) + self.steps_remaining_at_level -= 1 + cur_pos = tuple(self.cur_obs[0]) + goal_pos = self.flat_env._get_new_pos(cur_pos, self.current_goal) + + # Step in the actual env + f_obs, f_rew, f_terminated, f_truncated, info = self.flat_env.step(action) + new_pos = tuple(f_obs[0]) + self.cur_obs = f_obs + + # Calculate low-level agent observation and reward + obs = {self.low_level_agent_id: [f_obs, self.current_goal]} + if new_pos != cur_pos: + if new_pos == goal_pos: + rew = {self.low_level_agent_id: 1} + else: + rew = {self.low_level_agent_id: -1} + else: + rew = {self.low_level_agent_id: 0} + + # Handle env termination & transitions back to higher level. 
+ terminated = {"__all__": False} + truncated = {"__all__": False} + if f_terminated or f_truncated: + terminated["__all__"] = f_terminated + truncated["__all__"] = f_truncated + logger.debug("high level final reward {}".format(f_rew)) + rew["high_level_agent"] = f_rew + obs["high_level_agent"] = f_obs + elif self.steps_remaining_at_level == 0: + terminated[self.low_level_agent_id] = True + truncated[self.low_level_agent_id] = False + rew["high_level_agent"] = 0 + obs["high_level_agent"] = f_obs + + return obs, rew, terminated, truncated, {self.low_level_agent_id: info} diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/custom_env_render_method.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/custom_env_render_method.py new file mode 100644 index 0000000000000000000000000000000000000000..77216ea179cc77bf270016f7960ebda616faafa7 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/custom_env_render_method.py @@ -0,0 +1,200 @@ +"""Example of implementing a custom `render()` method for your gymnasium RL environment. + +This example: + - shows how to write a simple gym.Env class yourself, in this case a corridor env, + in which the agent starts at the left side of the corridor and has to reach the + goal state all the way at the right. + - in particular, the new class overrides the Env's `render()` method to show, how + you can write your own rendering logic. + - furthermore, we use the RLlib callbacks class introduced in this example here: + https://github.com/ray-project/ray/blob/master/rllib/examples/envs/env_rendering_and_recording.py # noqa + in order to compile videos of the worst and best performing episodes in each + iteration and log these videos to your WandB account, so you can view them. 
+ + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack +--wandb-key=[your WandB API key] --wandb-project=[some WandB project name] +--wandb-run-name=[optional: WandB run name within --wandb-project]` + +In order to see the actual videos, you need to have a WandB account and provide your +API key and a project name on the command line (see above). + +Use the `--num-agents` argument to set up the env as a multi-agent env. If +`--num-agents` > 0, RLlib will simply run as many of the defined single-agent +environments in parallel and with different policies to be trained for each agent. + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + + +Results to expect +----------------- +After the first training iteration, you should see the videos in your WandB account +under the provided `--wandb-project` name. Filter for "videos_best" or "videos_worst". + +Note that the default Tune TensorboardX (TBX) logger might complain about the videos +being logged. This is ok, the TBX logger will simply ignore these. The WandB logger, +however, will recognize the video tensors shaped +(1 [batch], T [video len], 3 [rgb], [height], [width]) and properly create a WandB video +object to be sent to their server. 
+ +Your terminal output should look similar to this (the following is for a +`--num-agents=2` run; expect similar results for the other `--num-agents` +settings): ++---------------------+------------+----------------+--------+------------------+ +| Trial name | status | loc | iter | total time (s) | +|---------------------+------------+----------------+--------+------------------+ +| PPO_env_fb1c0_00000 | TERMINATED | 127.0.0.1:8592 | 3 | 21.1876 | ++---------------------+------------+----------------+--------+------------------+ ++-------+-------------------+-------------+-------------+ +| ts | combined return | return p1 | return p0 | +|-------+-------------------+-------------+-------------| +| 12000 | 12.7655 | 7.3605 | 5.4095 | ++-------+-------------------+-------------+-------------+ +""" + +import gymnasium as gym +import numpy as np +from gymnasium.spaces import Box, Discrete +from PIL import Image, ImageDraw + +from ray.rllib.algorithms.ppo import PPOConfig +from ray.rllib.env.multi_agent_env import make_multi_agent +from ray.rllib.examples.envs.env_rendering_and_recording import EnvRenderCallback +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray import tune + +parser = add_rllib_example_script_args( + default_iters=10, + default_reward=9.0, + default_timesteps=10000, +) +parser.set_defaults(enable_new_api_stack=True) + + +class CustomRenderedCorridorEnv(gym.Env): + """Example of a custom env, for which we specify rendering behavior.""" + + def __init__(self, config): + self.end_pos = config.get("corridor_length", 10) + self.max_steps = config.get("max_steps", 100) + self.cur_pos = 0 + self.steps = 0 + self.action_space = Discrete(2) + self.observation_space = Box(0.0, 999.0, shape=(1,), dtype=np.float32) + + def reset(self, *, seed=None, options=None): + self.cur_pos = 0.0 + self.steps = 0 + return np.array([self.cur_pos], np.float32), {} + + def step(self, action): + self.steps 
+= 1 + assert action in [0, 1], action + if action == 0 and self.cur_pos > 0: + self.cur_pos -= 1.0 + elif action == 1: + self.cur_pos += 1.0 + truncated = self.steps >= self.max_steps + terminated = self.cur_pos >= self.end_pos + return ( + np.array([self.cur_pos], np.float32), + 10.0 if terminated else -0.1, + terminated, + truncated, + {}, + ) + + def render(self) -> np._typing.NDArray[np.uint8]: + """Implements rendering logic for this env (given the current observation). + + You should return a numpy RGB image like so: + np.array([height, width, 3], dtype=np.uint8). + + Returns: + np.ndarray: A numpy uint8 3D array (image) to render. + """ + # Image dimensions. + # Each position in the corridor is 50 pixels wide. + width = (self.end_pos + 2) * 50 + # Fixed height of the image. + height = 100 + + # Create a new image with white background + image = Image.new("RGB", (width, height), "white") + draw = ImageDraw.Draw(image) + + # Draw the corridor walls + # Grey rectangle for the corridor. + draw.rectangle([50, 30, width - 50, 70], fill="grey") + + # Draw the agent. + # Calculate the x coordinate of the agent. + agent_x = (self.cur_pos + 1) * 50 + # Blue rectangle for the agent. + draw.rectangle([agent_x + 10, 40, agent_x + 40, 60], fill="blue") + + # Draw the goal state. + # Calculate the x coordinate of the goal. + goal_x = self.end_pos * 50 + # Green rectangle for the goal state. + draw.rectangle([goal_x + 10, 40, goal_x + 40, 60], fill="green") + + # Convert the image to a uint8 numpy array. + return np.array(image, dtype=np.uint8) + + +# Create a simple multi-agent version of the above Env by duplicating the single-agent +# env n (n=num agents) times and having the agents act independently, each one in a +# different corridor. 
+MultiAgentCustomRenderedCorridorEnv = make_multi_agent( + lambda config: CustomRenderedCorridorEnv(config) +) + + +if __name__ == "__main__": + args = parser.parse_args() + + # The `config` arg passed into our Env's constructor (see the class' __init__ method + # above). Feel free to change these. + env_options = { + "corridor_length": 10, + "max_steps": 100, + "num_agents": args.num_agents, # <- only used by the multi-agent version. + } + + env_cls_to_use = ( + CustomRenderedCorridorEnv + if args.num_agents == 0 + else MultiAgentCustomRenderedCorridorEnv + ) + + tune.register_env("env", lambda _: env_cls_to_use(env_options)) + + # Example config switching on rendering. + base_config = ( + PPOConfig() + # Configure our env to be the above-registered one. + .environment("env") + # Plug in our env-rendering (and logging) callback. This callback class allows + # you to fully customize your rendering behavior (which workers should render, + # which episodes, which (vector) env indices, etc..). We refer to this example + # script here for further details: + # https://github.com/ray-project/ray/blob/master/rllib/examples/envs/env_rendering_and_recording.py # noqa + .callbacks(EnvRenderCallback) + ) + + if args.num_agents > 0: + base_config.multi_agent( + policies={f"p{i}" for i in range(args.num_agents)}, + policy_mapping_fn=lambda aid, eps, **kw: f"p{aid}", + ) + + run_rllib_example_script_experiment(base_config, args) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/custom_gym_env.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/custom_gym_env.py new file mode 100644 index 0000000000000000000000000000000000000000..2612575adb63749e536db0e3e6b79a5fbdb39247 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/custom_gym_env.py @@ -0,0 +1,162 @@ +"""Example of defining a custom gymnasium Env to be learned by an RLlib Algorithm.
+ +This example: + - demonstrates how to write your own (single-agent) gymnasium Env class, define its + physics and mechanics, the reward function used, the allowed actions (action space), + and the type of observations (observation space), etc.. + - shows how to configure and setup this environment class within an RLlib + Algorithm config. + - runs the experiment with the configured algo, trying to solve the environment. + +To see more details on which env we are building for this example, take a look at the +`SimpleCorridor` class defined below. + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack` + +Use the `--corridor-length` option to set a custom length for the corridor. Note that +for extremely long corridors, the algorithm should take longer to learn. + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. 
+ +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- +You should see results similar to the following in your console output: + ++--------------------------------+------------+-----------------+--------+ +| Trial name | status | loc | iter | +|--------------------------------+------------+-----------------+--------+ +| PPO_SimpleCorridor_78714_00000 | TERMINATED | 127.0.0.1:85794 | 7 | ++--------------------------------+------------+-----------------+--------+ + ++------------------+-------+----------+--------------------+ +| total time (s) | ts | reward | episode_len_mean | +|------------------+-------+----------+--------------------| +| 18.3034 | 28000 | 0.908918 | 12.9676 | ++------------------+-------+----------+--------------------+ +""" +# These tags allow extracting portions of this script on Anyscale. +# ws-template-imports-start +import gymnasium as gym +from gymnasium.spaces import Discrete, Box +import numpy as np +import random + +from typing import Optional + +# ws-template-imports-end + +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.tune.registry import get_trainable_cls, register_env # noqa + + +parser = add_rllib_example_script_args( + default_reward=0.9, default_iters=50, default_timesteps=100000 +) +parser.add_argument( + "--corridor-length", + type=int, + default=10, + help="The length of the corridor in fields. Note that this number includes the " + "starting- and goal states.", +) + + +# These tags allow extracting portions of this script on Anyscale. +# ws-template-code-start +class SimpleCorridor(gym.Env): + """Example of a custom env in which the agent has to walk down a corridor. 
+ + ------------ + |S........G| + ------------ + , where S is the starting position, G is the goal position, and fields with '.' + mark free spaces, over which the agent may step. The length of the above example + corridor is 10. + Allowed actions are left (0) and right (1). + The reward function is -0.01 per step taken and a uniform random value between + 0.5 and 1.5 when reaching the goal state. + + You can configure the length of the corridor via the env's config. Thus, in your + AlgorithmConfig, you can do: + `config.environment(env_config={"corridor_length": ..})`. + """ + + def __init__(self, config: Optional[dict] = None): + config = config or {} + self.end_pos = config.get("corridor_length", 7) + self.cur_pos = 0 + self.action_space = Discrete(2) + self.observation_space = Box(0.0, self.end_pos, shape=(1,), dtype=np.float32) + + def reset(self, *, seed=None, options=None): + random.seed(seed) + self.cur_pos = 0 + # Return obs and (empty) info dict. + return np.array([self.cur_pos], np.float32), {"env_state": "reset"} + + def step(self, action): + assert action in [0, 1], action + # Move left. + if action == 0 and self.cur_pos > 0: + self.cur_pos -= 1 + # Move right. + elif action == 1: + self.cur_pos += 1 + + # The environment only ever terminates when we reach the goal state. + terminated = self.cur_pos >= self.end_pos + truncated = False + # Produce a random reward from [0.5, 1.5] when we reach the goal. + reward = random.uniform(0.5, 1.5) if terminated else -0.01 + infos = {} + return ( + np.array([self.cur_pos], np.float32), + reward, + terminated, + truncated, + infos, + ) + + +# ws-template-code-end + +if __name__ == "__main__": + args = parser.parse_args() + + # Can also register the env creator function explicitly with: + # register_env("corridor-env", lambda config: SimpleCorridor()) + + # Or you can hard code certain settings into the Env's constructor (`config`). 
+ # register_env( + # "corridor-env-w-len-100", + # lambda config: SimpleCorridor({**config, **{"corridor_length": 100}}), + # ) + + # Or allow the RLlib user to set more c'tor options via their algo config: + # config.environment(env_config={[c'tor arg name]: [value]}) + # register_env("corridor-env", lambda config: SimpleCorridor(config)) + + base_config = ( + get_trainable_cls(args.algo) + .get_default_config() + .environment( + SimpleCorridor, # or provide the registered string: "corridor-env" + env_config={"corridor_length": args.corridor_length}, + ) + ) + + run_rllib_example_script_experiment(base_config, args) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/env_connecting_to_rllib_w_tcp_client.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/env_connecting_to_rllib_w_tcp_client.py new file mode 100644 index 0000000000000000000000000000000000000000..7d71ad95573f052a9ff03258086c6be787f22fbc --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/env_connecting_to_rllib_w_tcp_client.py @@ -0,0 +1,130 @@ +"""Example of running against a TCP-connected external env performing its own inference. + +The example uses a custom EnvRunner (TcpClientInferenceEnvRunner) to allow +connections from one or more TCP clients to RLlib's EnvRunner actors, which act as +RL servers. +In this example, action inference for stepping the env is performed on the client's +side, meaning the client computes all actions itself, applies them to the env logic, +collects episodes of experiences, and sends these (in bulk) back to RLlib for training. +Also, from time to time, the updated model weights have to be sent from RLlib (server) +back to the connected clients. +Note that RLlib's new API stack does not yet support individual action requests, where +action computations happen on the RLlib (server) side. 
+ +This example: + - demonstrates how RLlib can be hooked up to an externally running complex simulator + through TCP connections. + - shows how a custom EnvRunner subclass can be configured allowing users to + implement their own logic of connecting to external processes and customize the + messaging protocols. + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack --port 5555 + +Use the `--port` option to change the default port (5555) to some other value. +Make sure that you do the same on the client side. + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- +You should see something like this on your terminal. Note that the dummy CartPole +client (which runs in a thread for the purpose of this example here) might throw +a disconnection error at the end, b/c RLlib closes the server socket when done training. + ++----------------------+------------+--------+------------------+ +| Trial name | status | iter | total time (s) | +| | | | | +|----------------------+------------+--------+------------------+ +| PPO_None_3358e_00000 | TERMINATED | 40 | 32.2649 | ++----------------------+------------+--------+------------------+ ++------------------------+------------------------+ +| episode_return_mean | num_env_steps_sample | +| | d_lifetime | +|-----------------------+------------------------| +| 458.68 | 160000 | ++-----------------------+------------------------+ + +From the dummy client (thread), you should see at the end: +``` +ConnectionError: Error receiving message from peer on socket ... 
+``` +""" +from functools import partial +import threading + +import gymnasium as gym +import numpy as np + +from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig +from ray.rllib.env.tcp_client_inference_env_runner import ( + _dummy_client, + TcpClientInferenceEnvRunner, +) +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.tune.registry import get_trainable_cls + +parser = add_rllib_example_script_args( + default_reward=450.0, default_iters=200, default_timesteps=2000000 +) +parser.set_defaults( + enable_new_api_stack=True, + num_env_runners=1, +) +parser.add_argument( + "--port", + type=int, + default=5555, + help="The port for RLlib's EnvRunner to listen to for incoming UE5 connections. " + "You need to specify the same port inside your UE5 `RLlibClient` plugin.", +) + + +if __name__ == "__main__": + args = parser.parse_args() + + # Start the dummy CartPole client in a thread (and do its thing in parallel). + client_thread = threading.Thread( + target=partial( + _dummy_client, + port=args.port + + (args.num_env_runners if args.num_env_runners is not None else 1), + ), + ) + client_thread.start() + + # Define the RLlib (server) config. + base_config = ( + get_trainable_cls(args.algo) + .get_default_config() + .environment( + observation_space=gym.spaces.Box(-1.0, 1.0, (4,), np.float32), + action_space=gym.spaces.Discrete(2), + # EnvRunners listen on `port` + their worker index. + env_config={"port": args.port}, + ) + .env_runners( + # Point RLlib to the custom EnvRunner to be used here. 
+ env_runner_cls=TcpClientInferenceEnvRunner, + ) + .training( + num_epochs=10, + vf_loss_coeff=0.01, + ) + .rl_module(model_config=DefaultModelConfig(vf_share_layers=True)) + ) + + run_rllib_example_script_experiment(base_config, args) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/env_rendering_and_recording.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/env_rendering_and_recording.py new file mode 100644 index 0000000000000000000000000000000000000000..a74687ea92a5936f4f775e650b07e96b2183e152 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/env_rendering_and_recording.py @@ -0,0 +1,300 @@ +"""Example of using a custom Callback to render and log episode videos from a gym.Env. + +This example: + - shows how to set up your (Atari) gym.Env for human-friendly rendering inside the + `AlgorithmConfig.environment()` method. + - demonstrates how to write an RLlib custom callback class that renders all envs on + all timesteps, stores the individual images temporarily in the Episode + objects, and compiles a video from these images once the Episode terminates. + - furthermore, in each sampling cycle (iteration), the callback uses the unified + `MetricsLogger` facility - available in all RLlib components - to log the video of + the best performing and worst performing episode and sends these videos to WandB. + - configures the above callbacks class within the AlgorithmConfig. + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack --env [env name e.g. 'ALE/Pong-v5'] +--wandb-key=[your WandB API key] --wandb-project=[some WandB project name] +--wandb-run-name=[optional: WandB run name within --wandb-project]` + +In order to see the actual videos, you need to have a WandB account and provide your +API key and a project name on the command line (see above). 
To log the videos in WandB +you need to have the `wandb` and `moviepy` packages installed (`pip install wandb +moviepy`). + +Use the `--env` flag to control, which Atari env is used. Note that this example +only works with Atari envs. + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + + +Results to expect +----------------- +After the first training iteration, you should see the videos in your WandB account +under the provided `--wandb-project` name. Filter for "videos_best" or "videos_worst". + +Note that the default Tune TensorboardX (TBX) logger might complain about the videos +being logged. This is ok, the TBX logger will simply ignore these. The WandB logger, +however, will recognize the video tensors shaped +(1 [batch], T [video len], 3 [rgb], [height], [width]) and properly create a WandB video +object to be sent to their server. 
+ +Your terminal output should look similar to this: ++---------------------+----------+-----------------+--------+------------------+ +| Trial name | status | loc | iter | total time (s) | +| | | | | | +|---------------------+----------+-----------------+--------+------------------+ +| PPO_env_8d3f3_00000 | RUNNING | 127.0.0.1:89991 | 1 | 239.633 | ++---------------------+----------+-----------------+--------+------------------+ ++------------------------+------------------------+------------------------+ +| num_env_steps_sample | num_env_steps_traine | num_episodes_lifetim | +| d_lifetime | d_lifetime | e | ++------------------------+------------------------+------------------------| +| 4000 | 4000 | 24 | ++------------------------+------------------------+------------------------+ +""" +import gymnasium as gym +import numpy as np +from typing import Optional, Sequence + +from ray.rllib.callbacks.callbacks import RLlibCallback +from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig +from ray.rllib.env.wrappers.atari_wrappers import wrap_atari_for_new_api_stack +from ray.rllib.utils.images import resize +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.tune.registry import get_trainable_cls, register_env +from ray import tune + +parser = add_rllib_example_script_args(default_reward=20.0) +parser.set_defaults( + enable_new_api_stack=True, + env="ale_py:ALE/Pong-v5", +) + + +class EnvRenderCallback(RLlibCallback): + """A custom callback to render the environment. + + This can be used to create videos of the episodes for some or all EnvRunners + and some or all env indices (in a vectorized env). These videos can then + be sent to e.g. WandB as shown in this example script here. + + We override the `on_episode_step` method to create a single ts render image + and temporarily store it in the Episode object. 
+ """ + + def __init__(self, env_runner_indices: Optional[Sequence[int]] = None): + """Initializes an EnvRenderCallback instance. + + Args: + env_runner_indices: The (optional) EnvRunner indices, for this callback + should be active. If None, activates the rendering for all EnvRunners. + If a Sequence type, only renders, if the EnvRunner index is found in + `env_runner_indices`. + """ + super().__init__() + # Only render and record on certain EnvRunner indices? + self._env_runner_indices = env_runner_indices + # Per sample round (on this EnvRunner), we want to only log the best- and + # worst performing episode's videos in the custom metrics. Otherwise, too much + # data would be sent to WandB. + self.best_episode_and_return = (None, float("-inf")) + self.worst_episode_and_return = (None, float("inf")) + + def on_episode_step( + self, + *, + episode, + env_runner, + metrics_logger, + env, + env_index, + rl_module, + **kwargs, + ) -> None: + """Adds current render image to episode's temporary data. + + Note that this would work with MultiAgentEpisodes as well. + """ + # Skip, if this EnvRunner's index is not in `self._env_runner_indices`. + if ( + self._env_runner_indices is not None + and env_runner.worker_index not in self._env_runner_indices + ): + return + + # If we have a vector env, only render the sub-env at index 0. + if isinstance(env.unwrapped, gym.vector.VectorEnv): + image = env.unwrapped.envs[0].render() + # Render the gym.Env. + else: + image = env.unwrapped.render() + + # Original render images for CartPole are 400x600 (hxw). We'll downsize here to + # a very small dimension (to save space and bandwidth). + image = resize(image, 64, 96) + # For WandB videos, we need to put channels first. + image = np.transpose(image, axes=[2, 0, 1]) + # Add the compiled single-step image as temp. data to our Episode object. 
+ # Once the episode is done, we'll compile the video from all logged images + # and log the video with the EnvRunner's `MetricsLogger.log_...()` APIs. + # See below: + # `on_episode_end()`: We compile the video and maybe store it). + # `on_sample_end()` We log the best and worst video to the `MetricsLogger`. + episode.add_temporary_timestep_data("render_images", image) + + def on_episode_end( + self, + *, + episode, + env_runner, + metrics_logger, + env, + env_index, + rl_module, + **kwargs, + ) -> None: + """Computes episode's return and compiles a video, iff best/worst in this iter. + + Note that the actual logging to the EnvRunner's MetricsLogger only happens + at the very env of sampling (when we know, which episode was the best and + worst). See `on_sample_end` for the implemented logging logic. + """ + if ( + self._env_runner_indices is not None + and env_runner.worker_index not in self._env_runner_indices + ): + return + + # Get the episode's return. + episode_return = episode.get_return() + + # Better than the best or worse than worst Episode thus far? + if ( + episode_return > self.best_episode_and_return[1] + or episode_return < self.worst_episode_and_return[1] + ): + # Pull all images from the temp. data of the episode. + images = episode.get_temporary_timestep_data("render_images") + # `images` is now a list of 3D ndarrays + + # Create a video from the images by simply stacking them AND + # adding an extra B=1 dimension. Note that Tune's WandB logger currently + # knows how to log the different data types by the following rules: + # array is shape=3D -> An image (c, h, w). + # array is shape=4D -> A batch of images (B, c, h, w). + # array is shape=5D -> A video (1, L, c, h, w), where L is the length of the + # video. + # -> Make our video ndarray a 5D one. + video = np.expand_dims(np.stack(images, axis=0), axis=0) + + # `video` is from the best episode in this cycle (iteration). 
+ if episode_return > self.best_episode_and_return[1]: + self.best_episode_and_return = (video, episode_return) + # `video` is worst in this cycle (iteration). + else: + self.worst_episode_and_return = (video, episode_return) + + def on_sample_end( + self, + *, + env_runner, + metrics_logger, + samples, + **kwargs, + ) -> None: + """Logs the best and worst video to this EnvRunner's MetricsLogger.""" + # Best video. + if self.best_episode_and_return[0] is not None: + metrics_logger.log_value( + "episode_videos_best", + self.best_episode_and_return[0], + # Do not reduce the videos (across the various parallel EnvRunners). + # This would not make sense (mean over the pixels?). Instead, we want to + # log all best videos of all EnvRunners per iteration. + reduce=None, + # B/c we do NOT reduce over the video data (mean/min/max), we need to + # make sure the list of videos in our MetricsLogger does not grow + # infinitely and gets cleared after each `reduce()` operation, meaning + # every time, the EnvRunner is asked to send its logged metrics. + clear_on_reduce=True, + ) + self.best_episode_and_return = (None, float("-inf")) + # Worst video. + if self.worst_episode_and_return[0] is not None: + metrics_logger.log_value( + "episode_videos_worst", + self.worst_episode_and_return[0], + # Same logging options as above. + reduce=None, + clear_on_reduce=True, + ) + self.worst_episode_and_return = (None, float("inf")) + + +if __name__ == "__main__": + args = parser.parse_args() + + # Register our environment with tune. + def _env_creator(cfg): + cfg.update({"render_mode": "rgb_array"}) + if args.env.startswith("ale_py:ALE/"): + cfg.update( + { + # Make analogous to old v4 + NoFrameskip. 
+ "frameskip": 1, + "full_action_space": False, + "repeat_action_probability": 0.0, + } + ) + return wrap_atari_for_new_api_stack(gym.make(args.env, **cfg), framestack=4) + else: + return gym.make(args.env, **cfg) + + register_env("env", _env_creator) + + base_config = ( + get_trainable_cls(args.algo).get_default_config() + # Use the above-registered environment. + .environment("env") + # Plug in our custom callback that controls, which videos are created (best, + # and worst per sampling cycle per EnvRunner) and then logged via the + # `MetricsLogger` API. + .callbacks(EnvRenderCallback) + # Switch off RLlib's logging to avoid having the large videos show up in any log + # files. + .debugging(logger_config={"type": tune.logger.NoopLogger}) + # The following settings are beneficial for Atari-type environments. Feel free + # to adjust these when providing a non-Atari `--env` option. + .training( + lambda_=0.95, + kl_coeff=0.5, + clip_param=0.1, + vf_clip_param=10.0, + entropy_coeff=0.01, + num_epochs=10, + # Linearly adjust learning rate based on number of GPUs. + lr=0.00015 * (args.num_learners or 1), + grad_clip=100.0, + grad_clip_by="global_norm", + ) + ) + + if base_config.is_atari: + base_config.rl_module( + model_config=DefaultModelConfig( + conv_filters=[[16, 4, 2], [32, 4, 2], [64, 4, 2], [128, 4, 2]], + conv_activation="relu", + head_fcnet_hiddens=[256], + vf_share_layers=True, + ), + ) + + run_rllib_example_script_experiment(base_config, args) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/env_w_protobuf_observations.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/env_w_protobuf_observations.py new file mode 100644 index 0000000000000000000000000000000000000000..c1fec7c753161f9e7a89a5f1d78ce4560d47f13d --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/env_w_protobuf_observations.py @@ -0,0 +1,78 @@ +"""Example of handling an Env that outputs protobuf observations. 
+ +This example: + - demonstrates how a custom Env can use protobufs to compress its observation into + a binary format to save space and gain performance. + - shows how to use a very simple ConnectorV2 piece that translates these protobuf + binary observation strings into proper more NN-readable observations (like a 1D + float32 tensor). + +To see more details on which env we are building for this example, take a look at the +`CartPoleWithProtobufObservationSpace` class imported below. +To see more details on which ConnectorV2 piece we are plugging into the config +below, take a look at the `ProtobufCartPoleObservationDecoder` class imported below. + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack` + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- +You should see results similar to the following in your console output: + ++------------------------------------------------------+------------+-----------------+ +| Trial name | status | loc | +| | | | +|------------------------------------------------------+------------+-----------------+ +| PPO_CartPoleWithProtobufObservationSpace_47dd2_00000 | TERMINATED | 127.0.0.1:67325 | ++------------------------------------------------------+------------+-----------------+ ++--------+------------------+------------------------+------------------------+ +| iter | total time (s) | episode_return_mean | num_episodes_lifetim | +| | | | e | ++--------+------------------+------------------------+------------------------+ +| 17 | 39.9011 | 513.29 | 465 | 
++--------+------------------+------------------------+------------------------+ +""" +from ray.rllib.examples.connectors.classes.protobuf_cartpole_observation_decoder import ( # noqa + ProtobufCartPoleObservationDecoder, +) +from ray.rllib.examples.envs.classes.cartpole_with_protobuf_observation_space import ( + CartPoleWithProtobufObservationSpace, +) +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.tune.registry import get_trainable_cls + + +parser = add_rllib_example_script_args(default_timesteps=200000, default_reward=400.0) +parser.set_defaults(enable_new_api_stack=True) + + +if __name__ == "__main__": + args = parser.parse_args() + + base_config = ( + get_trainable_cls(args.algo).get_default_config() + # Set up the env to be CartPole-v1, but with protobuf observations. + .environment(CartPoleWithProtobufObservationSpace) + # Plugin our custom ConnectorV2 piece to translate protobuf observations + # (box of dtype uint8) into NN-readible ones (1D tensor of dtype flaot32). + .env_runners( + env_to_module_connector=lambda env: ProtobufCartPoleObservationDecoder(), + ) + ) + + run_rllib_example_script_experiment(base_config, args) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/greyscale_env.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/greyscale_env.py new file mode 100644 index 0000000000000000000000000000000000000000..2f0e5ffc956002a9cc2fbef3569aab2911c9cafe --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/greyscale_env.py @@ -0,0 +1,127 @@ +# @OldAPIStack +""" +Example of interfacing with an environment that produces 2D observations. + +This example shows how turning 2D observations with shape (A, B) into a 3D +observations with shape (C, D, 1) can enable usage of RLlib's default models. +RLlib's default Catalog class does not provide default models for 2D observation +spaces, but it does so for 3D observations. 
+Therefore, one can either write a custom model or transform the 2D observations into 3D +observations. This enables RLlib to use one of the default CNN filters, even though the +original observation space of the environment does not fit them. + +This simple example should reach rewards of 50 within 150k timesteps. +""" + +from numpy import float32 +import argparse +from pettingzoo.butterfly import pistonball_v6 +from supersuit import ( + normalize_obs_v0, + dtype_v0, + color_reduction_v0, + reshape_v0, + resize_v1, +) + +from ray.air.constants import TRAINING_ITERATION +from ray.rllib.algorithms.ppo import PPOConfig +from ray.rllib.env import PettingZooEnv +from ray.rllib.utils.metrics import ( + ENV_RUNNER_RESULTS, + EPISODE_RETURN_MEAN, + NUM_ENV_STEPS_SAMPLED_LIFETIME, +) +from ray.tune.registry import register_env +from ray import tune +from ray import air + + +parser = argparse.ArgumentParser() +parser.add_argument( + "--framework", + choices=["tf2", "torch"], + default="torch", + help="The DL framework specifier.", +) +parser.add_argument( + "--as-test", + action="store_true", + help="Whether this script should be run as a compilation test.", +) +parser.add_argument( + "--stop-iters", type=int, default=150, help="Number of iterations to train." +) +parser.add_argument( + "--stop-timesteps", type=int, default=1000000, help="Number of timesteps to train." +) +parser.add_argument( + "--stop-reward", type=float, default=50, help="Reward at which we stop training." +) + +args = parser.parse_args() + + +# The space we down-sample and transform the greyscale pistonball images to. +# Other spaces supported by RLlib can be chosen here. 
+TRANSFORMED_OBS_SPACE = (42, 42, 1) + + +def env_creator(config): + env = pistonball_v6.env(n_pistons=5) + env = dtype_v0(env, dtype=float32) + # This gives us greyscale images for the color red + env = color_reduction_v0(env, mode="R") + env = normalize_obs_v0(env) + # This gives us images that are upsampled to the number of pixels in the + # default CNN filter + env = resize_v1( + env, x_size=TRANSFORMED_OBS_SPACE[0], y_size=TRANSFORMED_OBS_SPACE[1] + ) + # This gives us 3D images for which we have default filters + env = reshape_v0(env, shape=TRANSFORMED_OBS_SPACE) + return env + + +# Register env +register_env("pistonball", lambda config: PettingZooEnv(env_creator(config))) + +config = ( + PPOConfig() + .environment("pistonball", env_config={"local_ratio": 0.5}, clip_rewards=True) + .env_runners( + num_env_runners=15 if not args.as_test else 2, + num_envs_per_env_runner=1, + observation_filter="NoFilter", + rollout_fragment_length="auto", + ) + .framework("torch") + .training( + entropy_coeff=0.01, + vf_loss_coeff=0.1, + clip_param=0.1, + vf_clip_param=10.0, + num_epochs=10, + kl_coeff=0.5, + lr=0.0001, + grad_clip=100, + minibatch_size=500, + train_batch_size=5000 if not args.as_test else 1000, + model={"vf_share_layers": True}, + ) + .resources(num_gpus=1 if not args.as_test else 0) + .reporting(min_time_s_per_iteration=30) +) + +tune.Tuner( + "PPO", + param_space=config.to_dict(), + run_config=air.RunConfig( + stop={ + TRAINING_ITERATION: args.stop_iters, + NUM_ENV_STEPS_SAMPLED_LIFETIME: args.stop_timesteps, + f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": args.stop_reward, + }, + verbose=2, + ), +).fit() diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/unity3d_env_local.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/unity3d_env_local.py new file mode 100644 index 0000000000000000000000000000000000000000..d334125ee4e81d0021ee5a167d591ce0ed991944 --- /dev/null +++ 
b/.venv/lib/python3.11/site-packages/ray/rllib/examples/envs/unity3d_env_local.py @@ -0,0 +1,213 @@ +# @OldAPIStack + +""" +Example of running an RLlib Algorithm against a locally running Unity3D editor +instance (available as Unity3DEnv inside RLlib). +For a distributed cloud setup example with Unity, +see `examples/envs/external_envs/unity3d_[server|client].py` + +To run this script against a local Unity3D engine: +1) Install Unity3D and `pip install mlagents`. + +2) Open the Unity3D Editor and load an example scene from the following + ml-agents pip package location: + `.../ml-agents/Project/Assets/ML-Agents/Examples/` + This script supports the `3DBall`, `3DBallHard`, `SoccerStrikersVsGoalie`, + `Tennis`, and `Walker` examples. + Specify the game you chose on your command line via e.g. `--env 3DBall`. + Feel free to add more supported examples here. + +3) Then run this script (you will have to press Play in your Unity editor + at some point to start the game and the learning process): +$ python unity3d_env_local.py --env 3DBall --stop-reward [..] + [--framework=torch]? 
+""" + +import argparse +import os + +import ray +from ray import air, tune +from ray.air.constants import TRAINING_ITERATION +from ray.rllib.algorithms.ppo import PPOConfig +from ray.rllib.env.wrappers.unity3d_env import Unity3DEnv +from ray.rllib.utils.metrics import ( + ENV_RUNNER_RESULTS, + EPISODE_RETURN_MEAN, + NUM_ENV_STEPS_SAMPLED_LIFETIME, +) +from ray.rllib.utils.test_utils import check_learning_achieved + +parser = argparse.ArgumentParser() +parser.add_argument( + "--env", + type=str, + default="3DBall", + choices=[ + "3DBall", + "3DBallHard", + "GridFoodCollector", + "Pyramids", + "SoccerStrikersVsGoalie", + "SoccerTwos", + "Sorter", + "Tennis", + "VisualHallway", + "Walker", + ], + help="The name of the Env to run in the Unity3D editor: `3DBall(Hard)?|" + "Pyramids|GridFoodCollector|SoccerStrikersVsGoalie|Sorter|Tennis|" + "VisualHallway|Walker` (feel free to add more and PR!)", +) +parser.add_argument( + "--file-name", + type=str, + default=None, + help="The Unity3d binary (compiled) game, e.g. " + "'/home/ubuntu/soccer_strikers_vs_goalie_linux.x86_64'. Use `None` for " + "a currently running Unity3D editor.", +) +parser.add_argument( + "--from-checkpoint", + type=str, + default=None, + help="Full path to a checkpoint file for restoring a previously saved " + "Algorithm state.", +) +parser.add_argument("--num-workers", type=int, default=0) +parser.add_argument( + "--as-test", + action="store_true", + help="Whether this script should be run as a test: --stop-reward must " + "be achieved within --stop-timesteps AND --stop-iters.", +) +parser.add_argument( + "--stop-iters", type=int, default=9999, help="Number of iterations to train." +) +parser.add_argument( + "--stop-timesteps", type=int, default=10000000, help="Number of timesteps to train." +) +parser.add_argument( + "--stop-reward", + type=float, + default=9999.0, + help="Reward at which we stop training.", +) +parser.add_argument( + "--horizon", + type=int, + default=3000, + help="The max. 
number of `step()`s for any episode (per agent) before " + "it'll be reset again automatically.", +) +parser.add_argument( + "--framework", + choices=["tf", "tf2", "torch"], + default="torch", + help="The DL framework specifier.", +) + +if __name__ == "__main__": + ray.init() + + args = parser.parse_args() + + tune.register_env( + "unity3d", + lambda c: Unity3DEnv( + file_name=c["file_name"], + no_graphics=(args.env != "VisualHallway" and c["file_name"] is not None), + episode_horizon=c["episode_horizon"], + ), + ) + + # Get policies (different agent types; "behaviors" in MLAgents) and + # the mappings from individual agents to Policies. + policies, policy_mapping_fn = Unity3DEnv.get_policy_configs_for_game(args.env) + + config = ( + PPOConfig() + .environment( + "unity3d", + env_config={ + "file_name": args.file_name, + "episode_horizon": args.horizon, + }, + ) + .framework("tf" if args.env != "Pyramids" else "torch") + # For running in editor, force to use just one Worker (we only have + # one Unity running)! + .env_runners( + num_env_runners=args.num_workers if args.file_name else 0, + rollout_fragment_length=200, + ) + .training( + lr=0.0003, + lambda_=0.95, + gamma=0.99, + minibatch_size=256, + train_batch_size=4000, + num_epochs=20, + clip_param=0.2, + model={"fcnet_hiddens": [512, 512]}, + ) + .multi_agent(policies=policies, policy_mapping_fn=policy_mapping_fn) + # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. + .resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0"))) + ) + + # Switch on Curiosity based exploration for Pyramids env + # (not solvable otherwise). + if args.env == "Pyramids": + config.env_runners( + exploration_config={ + "type": "Curiosity", + "eta": 0.1, + "lr": 0.001, + # No actual feature net: map directly from observations to feature + # vector (linearly). 
+ "feature_net_config": { + "fcnet_hiddens": [], + "fcnet_activation": "relu", + }, + "sub_exploration": { + "type": "StochasticSampling", + }, + "forward_net_activation": "relu", + "inverse_net_activation": "relu", + } + ) + elif args.env == "GridFoodCollector": + config.training( + model={ + "conv_filters": [[16, [4, 4], 2], [32, [4, 4], 2], [256, [10, 10], 1]], + } + ) + elif args.env == "Sorter": + config.training(model={"use_attention": True}) + + stop = { + TRAINING_ITERATION: args.stop_iters, + NUM_ENV_STEPS_SAMPLED_LIFETIME: args.stop_timesteps, + f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": args.stop_reward, + } + + # Run the experiment. + results = tune.Tuner( + "PPO", + param_space=config.to_dict(), + run_config=air.RunConfig( + stop=stop, + verbose=1, + checkpoint_config=air.CheckpointConfig( + checkpoint_frequency=5, + checkpoint_at_end=True, + ), + ), + ).fit() + + # And check the results. + if args.as_test: + check_learning_achieved(results, args.stop_reward) + + ray.shutdown() diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/inference/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/inference/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/inference/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/inference/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..93e13b085d4244d66aa7a768b29c94a580c4ce72 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/inference/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/inference/__pycache__/policy_inference_after_training.cpython-311.pyc 
b/.venv/lib/python3.11/site-packages/ray/rllib/examples/inference/__pycache__/policy_inference_after_training.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ce5f1d0a9331ed13b9e92b9759ab4dcd3ddf2763 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/inference/__pycache__/policy_inference_after_training.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/inference/__pycache__/policy_inference_after_training_w_connector.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/inference/__pycache__/policy_inference_after_training_w_connector.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..33cf5aeb9e0be4e53391219cf7315554ca9e75db Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/inference/__pycache__/policy_inference_after_training_w_connector.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/inference/__pycache__/policy_inference_after_training_with_attention.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/inference/__pycache__/policy_inference_after_training_with_attention.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..91927a2cf3353c0d03c695ed048e768b5eff431e Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/inference/__pycache__/policy_inference_after_training_with_attention.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/inference/__pycache__/policy_inference_after_training_with_lstm.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/inference/__pycache__/policy_inference_after_training_with_lstm.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..03980cf40af8effca84d2a49ac10f0ab2dc4a7b8 Binary files /dev/null and 
b/.venv/lib/python3.11/site-packages/ray/rllib/examples/inference/__pycache__/policy_inference_after_training_with_lstm.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/inference/policy_inference_after_training.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/inference/policy_inference_after_training.py new file mode 100644 index 0000000000000000000000000000000000000000..4ece833c3c53b0c19e7e8f9cef0ddadf35659044 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/inference/policy_inference_after_training.py @@ -0,0 +1,188 @@ +"""Example on how to compute actions in production on an already trained policy. + +This example uses the simplest setup possible: An RLModule (policy net) recovered +from a checkpoint and a manual env-loop (CartPole-v1). No ConnectorV2s or EnvRunners are +used in this example. + +This example: + - shows how to use an already existing checkpoint to extract a single-agent RLModule + from (our policy network). + - shows how to setup this recovered policy net for action computations (with or + without using exploration). + - shows have the policy run through a very simple gymnasium based env-loop, w/o + using RLlib's ConnectorV2s or EnvRunners. + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack --stop-reward=200.0` + +Use the `--explore-during-inference` option to switch on exploratory behavior +during inference. Normally, you should not explore during inference, though, +unless your environment has a stochastic optimal solution. +Use the `--num-episodes-during-inference=[int]` option to set the number of +episodes to run through during the inference phase using the restored RLModule. + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. 
+ +Note that the shown GPU settings in this script also work in case you are not +running via tune, but instead are using the `--no-tune` command line option. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + +You can visualize experiment results in ~/ray_results using TensorBoard. + + +Results to expect +----------------- + +For the training step - depending on your `--stop-reward` setting, you should see +something similar to this: + +Number of trials: 1/1 (1 TERMINATED) ++-----------------------------+------------+-----------------+--------+ +| Trial name | status | loc | iter | +| | | | | +|-----------------------------+------------+-----------------+--------+ +| PPO_CartPole-v1_6660c_00000 | TERMINATED | 127.0.0.1:43566 | 8 | ++-----------------------------+------------+-----------------+--------+ ++------------------+------------------------+------------------------+ +| total time (s) | num_env_steps_sample | num_env_steps_traine | +| | d_lifetime | d_lifetime | ++------------------+------------------------+------------------------+ +| 21.0283 | 32000 | 32000 | ++------------------+------------------------+------------------------+ + +Then, after restoring the RLModule for the inference phase, your output should +look similar to: + +Training completed. Restoring new RLModule for action inference. 
+Episode done: Total reward = 500.0 +Episode done: Total reward = 500.0 +Episode done: Total reward = 500.0 +Episode done: Total reward = 500.0 +Episode done: Total reward = 500.0 +Episode done: Total reward = 500.0 +Episode done: Total reward = 500.0 +Episode done: Total reward = 500.0 +Episode done: Total reward = 500.0 +Episode done: Total reward = 500.0 +Done performing action inference through 10 Episodes +""" +import gymnasium as gym +import numpy as np +import os + +from ray.rllib.core import DEFAULT_MODULE_ID +from ray.rllib.core.columns import Columns +from ray.rllib.core.rl_module.rl_module import RLModule +from ray.rllib.utils.framework import try_import_torch +from ray.rllib.utils.numpy import convert_to_numpy, softmax +from ray.rllib.utils.metrics import ( + ENV_RUNNER_RESULTS, + EPISODE_RETURN_MEAN, +) +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.tune.registry import get_trainable_cls + +torch, _ = try_import_torch() + +parser = add_rllib_example_script_args(default_reward=200.0) +parser.set_defaults( + # Make sure that - by default - we produce checkpoints during training. + checkpoint_freq=1, + checkpoint_at_end=True, + # Use CartPole-v1 by default. + env="CartPole-v1", + # Script only runs on new API stack. + enable_new_api_stack=True, +) +parser.add_argument( + "--explore-during-inference", + action="store_true", + help="Whether the trained policy should use exploration during action " + "inference.", +) +parser.add_argument( + "--num-episodes-during-inference", + type=int, + default=10, + help="Number of episodes to do inference over (after restoring from a checkpoint).", +) + + +if __name__ == "__main__": + args = parser.parse_args() + + assert ( + args.enable_new_api_stack + ), "Must set --enable-new-api-stack when running this script!" 
+ + base_config = get_trainable_cls(args.algo).get_default_config() + + print("Training policy until desired reward/timesteps/iterations. ...") + results = run_rllib_example_script_experiment(base_config, args) + + print("Training completed. Restoring new RLModule for action inference.") + # Get the last checkpoint from the above training run. + best_result = results.get_best_result( + metric=f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}", mode="max" + ) + # Create new RLModule and restore its state from the last algo checkpoint. + # Note that the checkpoint for the RLModule can be found deeper inside the algo + # checkpoint's subdirectories ([algo dir] -> "learner/" -> "module_state/" -> + # "[module ID]): + rl_module = RLModule.from_checkpoint( + os.path.join( + best_result.checkpoint.path, + "learner_group", + "learner", + "rl_module", + DEFAULT_MODULE_ID, + ) + ) + + # Create an env to do inference in. + env = gym.make(args.env) + obs, info = env.reset() + + num_episodes = 0 + episode_return = 0.0 + + while num_episodes < args.num_episodes_during_inference: + # Compute an action using a B=1 observation "batch". + input_dict = {Columns.OBS: torch.from_numpy(obs).unsqueeze(0)} + # No exploration. + if not args.explore_during_inference: + rl_module_out = rl_module.forward_inference(input_dict) + # Using exploration. + else: + rl_module_out = rl_module.forward_exploration(input_dict) + + # For discrete action spaces used here, normally, an RLModule "only" + # produces action logits, from which we then have to sample. + # However, you can also write custom RLModules that output actions + # directly, performing the sampling step already inside their + # `forward_...()` methods. + logits = convert_to_numpy(rl_module_out[Columns.ACTION_DIST_INPUTS]) + # Perform the sampling step in numpy for simplicity. + action = np.random.choice(env.action_space.n, p=softmax(logits[0])) + # Send the computed action `a` to the env. 
+ obs, reward, terminated, truncated, _ = env.step(action) + episode_return += reward + # Is the episode `done`? -> Reset. + if terminated or truncated: + print(f"Episode done: Total reward = {episode_return}") + obs, info = env.reset() + num_episodes += 1 + episode_return = 0.0 + + print(f"Done performing action inference through {num_episodes} Episodes") diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/inference/policy_inference_after_training_w_connector.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/inference/policy_inference_after_training_w_connector.py new file mode 100644 index 0000000000000000000000000000000000000000..f19505e22dd4963a55157489bb3c26909501675f --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/inference/policy_inference_after_training_w_connector.py @@ -0,0 +1,274 @@ +"""Example on how to compute actions in production on an already trained policy. + +This example uses a more complex setup including a gymnasium environment, an +RLModule (one or more neural networks/policies), an env-to-module/module-to-env +ConnectorV2 pair, and an Episode object to store the ongoing episode in. +The RLModule contains an LSTM that requires its own previous STATE_OUT as new input +at every episode step to compute a new action. + +This example: + - shows how to use an already existing checkpoint to extract a single-agent RLModule + from (our policy network). + - shows how to setup this recovered policy net for action computations (with or + without using exploration). + - shows how to create a more complex env-loop in which the action-computing RLModule + requires its own previous state outputs as new input and how to use RLlib's Episode + APIs to achieve this. + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack --stop-reward=200.0` + +Use the `--explore-during-inference` option to switch on exploratory behavior +during inference. 
Normally, you should not explore during inference, though, +unless your environment has a stochastic optimal solution. +Use the `--num-episodes-during-inference=[int]` option to set the number of +episodes to run through during the inference phase using the restored RLModule. + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + +Note that the shown GPU settings in this script also work in case you are not +running via tune, but instead are using the `--no-tune` command line option. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + +You can visualize experiment results in ~/ray_results using TensorBoard. + + +Results to expect +----------------- + +For the training step - depending on your `--stop-reward` setting, you should see +something similar to this: + +Number of trials: 1/1 (1 TERMINATED) ++--------------------------------+------------+-----------------+--------+ +| Trial name | status | loc | iter | +| | | | | +|--------------------------------+------------+-----------------+--------+ +| PPO_stateless-cart_cc890_00000 | TERMINATED | 127.0.0.1:72238 | 7 | ++--------------------------------+------------+-----------------+--------+ ++------------------+------------------------+------------------------+ +| total time (s) | num_env_steps_sample | num_env_steps_traine | +| | d_lifetime | d_lifetime | ++------------------+------------------------+------------------------+ +| 31.9655 | 28000 | 28000 | ++------------------+------------------------+------------------------+ + +Then, after restoring the RLModule for the inference phase, your output should +look similar to: + +Training completed. Creating an env-loop for inference ... +Env ... 
+Env-to-module ConnectorV2 ... +RLModule restored ... +Module-to-env ConnectorV2 ... +Episode done: Total reward = 103.0 +Episode done: Total reward = 90.0 +Episode done: Total reward = 100.0 +Episode done: Total reward = 111.0 +Episode done: Total reward = 85.0 +Episode done: Total reward = 90.0 +Episode done: Total reward = 100.0 +Episode done: Total reward = 102.0 +Episode done: Total reward = 97.0 +Episode done: Total reward = 81.0 +Done performing action inference through 10 Episodes +""" +import os + +from ray.rllib.connectors.env_to_module import EnvToModulePipeline +from ray.rllib.connectors.module_to_env import ModuleToEnvPipeline +from ray.rllib.core import ( + COMPONENT_ENV_RUNNER, + COMPONENT_ENV_TO_MODULE_CONNECTOR, + COMPONENT_MODULE_TO_ENV_CONNECTOR, + COMPONENT_LEARNER_GROUP, + COMPONENT_LEARNER, + COMPONENT_RL_MODULE, + DEFAULT_MODULE_ID, +) +from ray.rllib.core.columns import Columns +from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig +from ray.rllib.core.rl_module.rl_module import RLModule +from ray.rllib.env.single_agent_episode import SingleAgentEpisode +from ray.rllib.examples.envs.classes.stateless_cartpole import StatelessCartPole +from ray.rllib.utils.framework import try_import_torch +from ray.rllib.utils.metrics import ( + ENV_RUNNER_RESULTS, + EPISODE_RETURN_MEAN, +) +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.tune.registry import get_trainable_cls, register_env + +torch, _ = try_import_torch() + + +def _env_creator(cfg): + return StatelessCartPole(cfg) + + +register_env("stateless-cart", _env_creator) + + +parser = add_rllib_example_script_args(default_reward=200.0) +parser.set_defaults( + # Script only runs on new API stack. + enable_new_api_stack=True, + # Make sure that - by default - we produce checkpoints during training. + checkpoint_freq=1, + checkpoint_at_end=True, + # Use StatelessCartPole by default. 
+ env="stateless-cart", +) +parser.add_argument( + "--explore-during-inference", + action="store_true", + help="Whether the trained policy should use exploration during action " + "inference.", +) +parser.add_argument( + "--num-episodes-during-inference", + type=int, + default=10, + help="Number of episodes to do inference over (after restoring from a checkpoint).", +) + + +if __name__ == "__main__": + args = parser.parse_args() + + base_config = ( + get_trainable_cls(args.algo) + .get_default_config() + .training( + num_epochs=6, + lr=0.0003, + vf_loss_coeff=0.01, + ) + # Add an LSTM setup to the default RLModule used. + .rl_module(model_config=DefaultModelConfig(use_lstm=True)) + ) + + print("Training LSTM-policy until desired reward/timesteps/iterations. ...") + results = run_rllib_example_script_experiment(base_config, args) + + # Get the last checkpoint from the above training run. + metric_key = metric = f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}" + best_result = results.get_best_result(metric=metric_key, mode="max") + + print( + "Training completed (R=" + f"{best_result.metrics[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN]}). " + "Creating an env-loop for inference ..." + ) + + print("Env ...", end="") + env = _env_creator(base_config.env_config) + print(" ok") + + # Create the env-to-module pipeline from the checkpoint. + print("Restore env-to-module connector from checkpoint ...", end="") + env_to_module = EnvToModulePipeline.from_checkpoint( + os.path.join( + best_result.checkpoint.path, + COMPONENT_ENV_RUNNER, + COMPONENT_ENV_TO_MODULE_CONNECTOR, + ) + ) + print(" ok") + + print("Restore RLModule from checkpoint ...", end="") + # Create RLModule from a checkpoint. + rl_module = RLModule.from_checkpoint( + os.path.join( + best_result.checkpoint.path, + COMPONENT_LEARNER_GROUP, + COMPONENT_LEARNER, + COMPONENT_RL_MODULE, + DEFAULT_MODULE_ID, + ) + ) + print(" ok") + + # For the module-to-env pipeline, we will use the convenient config utility. 
+ print("Restore module-to-env connector from checkpoint ...", end="") + module_to_env = ModuleToEnvPipeline.from_checkpoint( + os.path.join( + best_result.checkpoint.path, + COMPONENT_ENV_RUNNER, + COMPONENT_MODULE_TO_ENV_CONNECTOR, + ) + ) + print(" ok") + + # Now our setup is complete: + # [gym.Env] -> env-to-module -> [RLModule] -> module-to-env -> [gym.Env] ... repeat + num_episodes = 0 + + obs, _ = env.reset() + episode = SingleAgentEpisode( + observations=[obs], + observation_space=env.observation_space, + action_space=env.action_space, + ) + + while num_episodes < args.num_episodes_during_inference: + shared_data = {} + input_dict = env_to_module( + episodes=[episode], # ConnectorV2 pipelines operate on lists of episodes. + rl_module=rl_module, + explore=args.explore_during_inference, + shared_data=shared_data, + ) + # No exploration. + if not args.explore_during_inference: + rl_module_out = rl_module.forward_inference(input_dict) + # Using exploration. + else: + rl_module_out = rl_module.forward_exploration(input_dict) + + to_env = module_to_env( + batch=rl_module_out, + episodes=[episode], # ConnectorV2 pipelines operate on lists of episodes. + rl_module=rl_module, + explore=args.explore_during_inference, + shared_data=shared_data, + ) + # Send the computed action to the env. Note that the RLModule and the + # connector pipelines work on batched data (B=1 in this case), whereas the Env + # is not vectorized here, so we need to use `action[0]`. + action = to_env.pop(Columns.ACTIONS)[0] + obs, reward, terminated, truncated, _ = env.step(action) + # Keep our `SingleAgentEpisode` instance updated at all times. + episode.add_env_step( + obs, + action, + reward, + terminated=terminated, + truncated=truncated, + # Same here: [0] b/c RLModule output is batched (w/ B=1). + extra_model_outputs={k: v[0] for k, v in to_env.items()}, + ) + + # Is the episode `done`? -> Reset. 
+ if episode.is_done: + print(f"Episode done: Total reward = {episode.get_return()}") + obs, info = env.reset() + episode = SingleAgentEpisode( + observations=[obs], + observation_space=env.observation_space, + action_space=env.action_space, + ) + num_episodes += 1 + + print(f"Done performing action inference through {num_episodes} Episodes") diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/inference/policy_inference_after_training_with_attention.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/inference/policy_inference_after_training_with_attention.py new file mode 100644 index 0000000000000000000000000000000000000000..1e594066d18f583219f665f081cd293dfd9825e0 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/inference/policy_inference_after_training_with_attention.py @@ -0,0 +1,196 @@ +# @OldAPIStack +""" +Example showing how you can use your trained policy for inference +(computing actions) in an environment. + +Includes options for LSTM-based models (--use-lstm), attention-net models +(--use-attention), and plain (non-recurrent) models. +""" +import argparse +import gymnasium as gym +import numpy as np +import os + +import ray +from ray import air, tune +from ray.air.constants import TRAINING_ITERATION +from ray.rllib.algorithms.algorithm import Algorithm +from ray.rllib.utils.metrics import ( + ENV_RUNNER_RESULTS, + EPISODE_RETURN_MEAN, + NUM_ENV_STEPS_SAMPLED_LIFETIME, +) +from ray.tune.registry import get_trainable_cls + +parser = argparse.ArgumentParser() +parser.add_argument( + "--run", type=str, default="PPO", help="The RLlib-registered algorithm to use." 
+) +parser.add_argument("--num-cpus", type=int, default=0) +parser.add_argument( + "--framework", + choices=["tf", "tf2", "torch"], + default="torch", + help="The DL framework specifier.", +) +parser.add_argument( + "--prev-n-actions", + type=int, + default=0, + help="Feed n most recent actions to the attention net as part of its input.", +) +parser.add_argument( + "--prev-n-rewards", + type=int, + default=0, + help="Feed n most recent rewards to the attention net as part of its input.", +) +parser.add_argument( + "--stop-iters", + type=int, + default=200, + help="Number of iterations to train before we do inference.", +) +parser.add_argument( + "--stop-timesteps", + type=int, + default=100000, + help="Number of timesteps to train before we do inference.", +) +parser.add_argument( + "--stop-reward", + type=float, + default=150.0, + help="Reward at which we stop training before we do inference.", +) +parser.add_argument( + "--explore-during-inference", + action="store_true", + help="Whether the trained policy should use exploration during action " + "inference.", +) +parser.add_argument( + "--num-episodes-during-inference", + type=int, + default=10, + help="Number of episodes to do inference over after training.", +) + +if __name__ == "__main__": + args = parser.parse_args() + + ray.init(num_cpus=args.num_cpus or None) + + config = ( + get_trainable_cls(args.run) + .get_default_config() + .api_stack( + enable_env_runner_and_connector_v2=False, + enable_rl_module_and_learner=False, + ) + .environment("FrozenLake-v1") + # Run with tracing enabled for tf2? + .framework(args.framework) + .training( + model={ + "use_attention": True, + "attention_num_transformer_units": 1, + "attention_use_n_prev_actions": args.prev_n_actions, + "attention_use_n_prev_rewards": args.prev_n_rewards, + "attention_dim": 32, + "attention_memory_inference": 10, + "attention_memory_training": 10, + }, + ) + # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. 
+ .resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0"))) + ) + + stop = { + TRAINING_ITERATION: args.stop_iters, + NUM_ENV_STEPS_SAMPLED_LIFETIME: args.stop_timesteps, + f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": args.stop_reward, + } + + print("Training policy until desired reward/timesteps/iterations. ...") + tuner = tune.Tuner( + args.run, + param_space=config, + run_config=air.RunConfig( + stop=stop, + verbose=2, + checkpoint_config=air.CheckpointConfig( + checkpoint_frequency=1, + checkpoint_at_end=True, + ), + ), + ) + results = tuner.fit() + + print("Training completed. Restoring new Algorithm for action inference.") + # Get the last checkpoint from the above training run. + checkpoint = results.get_best_result().checkpoint + # Create new Algorithm and restore its state from the last checkpoint. + algo = Algorithm.from_checkpoint(checkpoint) + + # Create the env to do inference in. + env = gym.make("FrozenLake-v1") + obs, info = env.reset() + + # In case the model needs previous-reward/action inputs, keep track of + # these via these variables here (we'll have to pass them into the + # compute_actions methods below). + init_prev_a = prev_a = None + init_prev_r = prev_r = None + + # Set attention net's initial internal state. + num_transformers = config["model"]["attention_num_transformer_units"] + memory_inference = config["model"]["attention_memory_inference"] + attention_dim = config["model"]["attention_dim"] + init_state = state = [ + np.zeros([memory_inference, attention_dim], np.float32) + for _ in range(num_transformers) + ] + # Do we need prev-action/reward as part of the input? + if args.prev_n_actions: + init_prev_a = prev_a = np.array([0] * args.prev_n_actions) + if args.prev_n_rewards: + init_prev_r = prev_r = np.array([0.0] * args.prev_n_rewards) + + num_episodes = 0 + + while num_episodes < args.num_episodes_during_inference: + # Compute an action (`a`). 
+ a, state_out, _ = algo.compute_single_action( + observation=obs, + state=state, + prev_action=prev_a, + prev_reward=prev_r, + explore=args.explore_during_inference, + policy_id="default_policy", # <- default value + ) + # Send the computed action `a` to the env. + obs, reward, done, truncated, _ = env.step(a) + # Is the episode `done`? -> Reset. + if done: + obs, info = env.reset() + num_episodes += 1 + state = init_state + prev_a = init_prev_a + prev_r = init_prev_r + # Episode is still ongoing -> Continue. + else: + # Append the just received state-out (most recent timestep) to the + # cascade (memory) of our state-ins and drop the oldest state-in. + state = [ + np.concatenate([state[i], [state_out[i]]], axis=0)[1:] + for i in range(num_transformers) + ] + if init_prev_a is not None: + prev_a = a + if init_prev_r is not None: + prev_r = reward + + algo.stop() + + ray.shutdown() diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/inference/policy_inference_after_training_with_lstm.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/inference/policy_inference_after_training_with_lstm.py new file mode 100644 index 0000000000000000000000000000000000000000..39c6ac6aa58874f83c49a6a539feae1fda5189c0 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/inference/policy_inference_after_training_with_lstm.py @@ -0,0 +1,185 @@ +# @OldAPIStack +""" +Example showing how you can use your trained policy for inference +(computing actions) in an environment. + +Includes options for LSTM-based models (--use-lstm), attention-net models +(--use-attention), and plain (non-recurrent) models. 
+""" +import argparse +import gymnasium as gym +import numpy as np +import os + +import ray +from ray import air, tune +from ray.air.constants import TRAINING_ITERATION +from ray.rllib.algorithms.algorithm import Algorithm +from ray.rllib.utils.metrics import ( + ENV_RUNNER_RESULTS, + EPISODE_RETURN_MEAN, + NUM_ENV_STEPS_SAMPLED_LIFETIME, +) +from ray.tune.registry import get_trainable_cls + +parser = argparse.ArgumentParser() +parser.add_argument( + "--run", type=str, default="PPO", help="The RLlib-registered algorithm to use." +) +parser.add_argument("--num-cpus", type=int, default=0) +parser.add_argument( + "--framework", + choices=["tf", "tf2", "torch"], + default="torch", + help="The DL framework specifier.", +) +parser.add_argument( + "--prev-action", + action="store_true", + help="Feed most recent action to the LSTM as part of its input.", +) +parser.add_argument( + "--prev-reward", + action="store_true", + help="Feed most recent reward to the LSTM as part of its input.", +) +parser.add_argument( + "--stop-iters", + type=int, + default=2, + help="Number of iterations to train before we do inference.", +) +parser.add_argument( + "--stop-timesteps", + type=int, + default=100000, + help="Number of timesteps to train before we do inference.", +) +parser.add_argument( + "--stop-reward", + type=float, + default=0.8, + help="Reward at which we stop training before we do inference.", +) +parser.add_argument( + "--explore-during-inference", + action="store_true", + help="Whether the trained policy should use exploration during action " + "inference.", +) +parser.add_argument( + "--num-episodes-during-inference", + type=int, + default=10, + help="Number of episodes to do inference over after training.", +) + +if __name__ == "__main__": + args = parser.parse_args() + + ray.init(num_cpus=args.num_cpus or None) + + config = ( + get_trainable_cls(args.run) + .get_default_config() + .api_stack( + enable_env_runner_and_connector_v2=False, + 
enable_rl_module_and_learner=False, + ) + .environment("FrozenLake-v1") + # Run with tracing enabled for tf2? + .framework(args.framework) + .training( + model={ + "use_lstm": True, + "lstm_cell_size": 256, + "lstm_use_prev_action": args.prev_action, + "lstm_use_prev_reward": args.prev_reward, + }, + ) + # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. + .resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0"))) + ) + + stop = { + TRAINING_ITERATION: args.stop_iters, + NUM_ENV_STEPS_SAMPLED_LIFETIME: args.stop_timesteps, + f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": args.stop_reward, + } + + print("Training policy until desired reward/timesteps/iterations. ...") + tuner = tune.Tuner( + args.run, + param_space=config, + run_config=air.RunConfig( + stop=stop, + verbose=2, + checkpoint_config=air.CheckpointConfig( + checkpoint_frequency=1, + checkpoint_at_end=True, + ), + ), + ) + results = tuner.fit() + + print("Training completed. Restoring new Algorithm for action inference.") + # Get the last checkpoint from the above training run. + checkpoint = results.get_best_result().checkpoint + # Create new Algorithm from the last checkpoint. + algo = Algorithm.from_checkpoint(checkpoint) + + # Create the env to do inference in. + env = gym.make("FrozenLake-v1") + obs, info = env.reset() + + # In case the model needs previous-reward/action inputs, keep track of + # these via these variables here (we'll have to pass them into the + # compute_actions methods below). + init_prev_a = prev_a = None + init_prev_r = prev_r = None + + # Set LSTM's initial internal state. + lstm_cell_size = config["model"]["lstm_cell_size"] + # range(2) b/c h- and c-states of the LSTM. + if algo.config.enable_rl_module_and_learner: + init_state = state = algo.get_policy().model.get_initial_state() + else: + init_state = state = [np.zeros([lstm_cell_size], np.float32) for _ in range(2)] + # Do we need prev-action/reward as part of the input? 
+ if args.prev_action: + init_prev_a = prev_a = 0 + if args.prev_reward: + init_prev_r = prev_r = 0.0 + + num_episodes = 0 + + while num_episodes < args.num_episodes_during_inference: + # Compute an action (`a`). + a, state_out, _ = algo.compute_single_action( + observation=obs, + state=state, + prev_action=prev_a, + prev_reward=prev_r, + explore=args.explore_during_inference, + policy_id="default_policy", # <- default value + ) + # Send the computed action `a` to the env. + obs, reward, done, truncated, info = env.step(a) + # Is the episode `done`? -> Reset. + if done: + obs, info = env.reset() + num_episodes += 1 + state = init_state + prev_a = init_prev_a + prev_r = init_prev_r + # Episode is still ongoing -> Continue. + else: + state = state_out + if init_prev_a is not None: + prev_a = a + if init_prev_r is not None: + prev_r = reward + + algo.stop() + + ray.shutdown() diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/quadx_waypoints.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/quadx_waypoints.py new file mode 100644 index 0000000000000000000000000000000000000000..bbd7082c92e0f289273c29abcb6f046880b94d65 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/quadx_waypoints.py @@ -0,0 +1,131 @@ +"""An example showing how to use PyFlyt gymnasium environment to train a UAV to +reach waypoints. + +For more infos about the PyFlyt gymnasium environment see the GitHub Repository: +https://github.com/jjshoots/PyFlyt/tree/master/PyFlyt + +This example + - Runs a single-agent `PyFlyt/QuadX-Waypoints-v1` experiment. + - Uses a gymnasium reward wrapper for reward scaling. + - Stops the experiment, if either `--stop-iters` (default is 200) or + `--stop-reward` (default is 90.0) is reached. + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack` + +Control the number of environments per `EnvRunner` via `--num-envs-per-env-runner`. +This will increase sampling speed. 
+ +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` which should allow you to set breakpoints +anywhere in the RLlib code and have the execution stop there for inspection +and debugging. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` +""" +import gymnasium as gym +import sys + +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.rllib.utils.metrics import ( + ENV_RUNNER_RESULTS, + EPISODE_RETURN_MEAN, + TRAINING_ITERATION_TIMER, +) +from ray.tune.registry import get_trainable_cls, register_env + +sys.setrecursionlimit(3000) + +parser = add_rllib_example_script_args( + default_iters=200, + default_timesteps=100000, + default_reward=90.0, +) +parser.add_argument( + "--run", type=str, default="PPO", help="The RLlib-registered algorithm to use." +) +parser.add_argument("--env-name", type=str, default="quadx_waypoints") +parser.add_argument("--num-envs-per-env-runner", type=int, default=4) + + +class RewardWrapper(gym.RewardWrapper): + def __init__(self, env): + super().__init__(env) + + def reward(self, reward): + # Scale rewards: + if reward >= 99.0 or reward <= -99.0: + return reward / 10 + return reward + + +def create_quadx_waypoints_env(env_config): + import PyFlyt.gym_envs # noqa + from PyFlyt.gym_envs import FlattenWaypointEnv + + env = gym.make("PyFlyt/QuadX-Waypoints-v1") + # Wrap Environment to use max 10 and -10 for rewards + env = RewardWrapper(env) + + return FlattenWaypointEnv(env, context_length=1) + + +if __name__ == "__main__": + args = parser.parse_args() + + # Register the environment with tune. + register_env(args.env_name, env_creator=create_quadx_waypoints_env) + + # Get the algorithm class to use for training. 
+ algo_cls = get_trainable_cls(args.run) + config = ( + algo_cls.get_default_config() + .environment(env=args.env_name) + .env_runners( + num_envs_per_env_runner=args.num_envs_per_env_runner, + ) + .reporting(min_time_s_per_iteration=0.1) + ) + + # If PPO set additional configurations. + if args.run == "PPO": + config.rl_module( + model_config={ + "fcnet_hiddens": [32], + "fcnet_activation": "linear", + "vf_share_layers": True, + } + ) + config.training( + minibatch_size=128, + train_batch_size_per_learner=10000, + ) + # If IMPALA set additional arguments. + elif args.run == "IMPALA": + config.env_runners(num_env_runners=2) + config.learners(num_gpus_per_learner=0) + config.training(vf_loss_coeff=0.01) + + # Set the stopping arguments. + EPISODE_RETURN_MEAN_KEY = f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}" + stop = { + TRAINING_ITERATION_TIMER: args.stop_iters, + EPISODE_RETURN_MEAN_KEY: args.stop_reward, + } + + # Run the experiment. + run_rllib_example_script_experiment( + config, + args, + stop=stop, + success_metric={ + f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": args.stop_reward, + }, + ) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/replay_buffer_api.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/replay_buffer_api.py new file mode 100644 index 0000000000000000000000000000000000000000..5d87a5ef5cd3bd0e7cf238ca23559c21780f28a8 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/replay_buffer_api.py @@ -0,0 +1,82 @@ +# @OldAPIStack + +# __sphinx_doc_replay_buffer_api_example_script_begin__ +"""Simple example of how to modify replay buffer behaviour. + +We modify DQN to utilize prioritized replay but supplying it with the +PrioritizedMultiAgentReplayBuffer instead of the standard MultiAgentReplayBuffer. +This is possible because DQN uses the DQN training iteration function, +which includes and a priority update, given that a fitting buffer is provided. 
+""" + +import argparse + +import ray +from ray import air, tune +from ray.air.constants import TRAINING_ITERATION +from ray.rllib.algorithms.dqn import DQNConfig +from ray.rllib.utils.framework import try_import_tf +from ray.rllib.utils.metrics import NUM_ENV_STEPS_SAMPLED_LIFETIME +from ray.rllib.utils.replay_buffers.replay_buffer import StorageUnit + +tf1, tf, tfv = try_import_tf() + +parser = argparse.ArgumentParser() + +parser.add_argument("--num-cpus", type=int, default=0) +parser.add_argument( + "--framework", + choices=["tf", "tf2", "torch"], + default="torch", + help="The DL framework specifier.", +) +parser.add_argument( + "--stop-iters", type=int, default=50, help="Number of iterations to train." +) +parser.add_argument( + "--stop-timesteps", type=int, default=100000, help="Number of timesteps to train." +) + +if __name__ == "__main__": + args = parser.parse_args() + + ray.init(num_cpus=args.num_cpus or None) + + # This is where we add prioritized experiences replay + # The training iteration function that is used by DQN already includes a priority + # update step. + replay_buffer_config = { + "type": "MultiAgentPrioritizedReplayBuffer", + # Although not necessary, we can modify the default constructor args of + # the replay buffer here + "prioritized_replay_alpha": 0.5, + "storage_unit": StorageUnit.SEQUENCES, + "replay_burn_in": 20, + "zero_init_states": True, + } + + config = ( + DQNConfig() + .environment("CartPole-v1") + .framework(framework=args.framework) + .env_runners(num_env_runners=4) + .training( + model=dict(use_lstm=True, lstm_cell_size=64, max_seq_len=20), + replay_buffer_config=replay_buffer_config, + ) + ) + + stop_config = { + NUM_ENV_STEPS_SAMPLED_LIFETIME: args.stop_timesteps, + TRAINING_ITERATION: args.stop_iters, + } + + results = tune.Tuner( + config.algo_class, + param_space=config, + run_config=air.RunConfig(stop=stop_config), + ).fit() + + ray.shutdown() + +# __sphinx_doc_replay_buffer_api_example_script_end__