diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4b438bd320d0a8f070bf7d15872f769629b15261 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/__pycache__/attention_net_supervised.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/__pycache__/attention_net_supervised.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..48260b4c6158dc7b249898c973dc099f52f09af4 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/__pycache__/attention_net_supervised.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/__pycache__/parametric_actions_cartpole.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/__pycache__/parametric_actions_cartpole.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..aefc2eec8fba96a6097ef3fe195d62db4b8f2f99 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/__pycache__/parametric_actions_cartpole.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/__pycache__/parametric_actions_cartpole_embeddings_learnt_by_model.cpython-311.pyc 
b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/__pycache__/parametric_actions_cartpole_embeddings_learnt_by_model.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..360409a994ecd5bf5043b606a655f3c0ea9edf63 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/__pycache__/parametric_actions_cartpole_embeddings_learnt_by_model.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/attention_net_supervised.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/attention_net_supervised.py new file mode 100644 index 0000000000000000000000000000000000000000..2c0f13f506aa5f5a04d31a323df6b6213c3a83e1 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/attention_net_supervised.py @@ -0,0 +1,77 @@ +# @OldAPIStack +from gymnasium.spaces import Box, Discrete +import numpy as np + +from rllib.models.tf.attention_net import TrXLNet +from ray.rllib.utils.framework import try_import_tf + +tf1, tf, tfv = try_import_tf() + + +def bit_shift_generator(seq_length, shift, batch_size): + while True: + values = np.array([0.0, 1.0], dtype=np.float32) + seq = np.random.choice(values, (batch_size, seq_length, 1)) + targets = np.squeeze(np.roll(seq, shift, axis=1).astype(np.int32)) + targets[:, :shift] = 0 + yield seq, targets + + +def train_loss(targets, outputs): + loss = tf.nn.sparse_softmax_cross_entropy_with_logits( + labels=targets, logits=outputs + ) + return tf.reduce_mean(loss) + + +def train_bit_shift(seq_length, num_iterations, print_every_n): + + optimizer = tf.keras.optimizers.Adam(1e-3) + + model = TrXLNet( + observation_space=Box(low=0, high=1, shape=(1,), dtype=np.int32), + action_space=Discrete(2), + num_outputs=2, + model_config={"max_seq_len": seq_length}, + name="trxl", + num_transformer_units=1, + attention_dim=10, + num_heads=5, + head_dim=20, + 
position_wise_mlp_dim=20, + ) + + shift = 10 + train_batch = 10 + test_batch = 100 + data_gen = bit_shift_generator(seq_length, shift=shift, batch_size=train_batch) + test_gen = bit_shift_generator(seq_length, shift=shift, batch_size=test_batch) + + @tf.function + def update_step(inputs, targets): + model_out = model( + {"obs": inputs}, + state=[tf.reshape(inputs, [-1, seq_length, 1])], + seq_lens=np.full(shape=(train_batch,), fill_value=seq_length), + ) + optimizer.minimize( + lambda: train_loss(targets, model_out), lambda: model.trainable_variables + ) + + for i, (inputs, targets) in zip(range(num_iterations), data_gen): + inputs_in = np.reshape(inputs, [-1, 1]) + targets_in = np.reshape(targets, [-1]) + update_step(tf.convert_to_tensor(inputs_in), tf.convert_to_tensor(targets_in)) + + if i % print_every_n == 0: + test_inputs, test_targets = next(test_gen) + print(i, train_loss(test_targets, model(test_inputs))) + + +if __name__ == "__main__": + tf.enable_eager_execution() + train_bit_shift( + seq_length=20, + num_iterations=2000, + print_every_n=200, + ) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..59950126ca06e56a2a63d3ce95b8dd5e8b6abc03 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/__pycache__/__init__.cpython-311.pyc differ diff --git 
a/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/__pycache__/action_mask_model.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/__pycache__/action_mask_model.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fa029edb18b6df56671a8b30785523c3a1684026 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/__pycache__/action_mask_model.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/__pycache__/autoregressive_action_dist.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/__pycache__/autoregressive_action_dist.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ccec20ec142064b3dc4c79e15769e705ffa5f835 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/__pycache__/autoregressive_action_dist.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/__pycache__/autoregressive_action_model.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/__pycache__/autoregressive_action_model.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fbeddd4ac780e3a5504e1dff315125e4e94eda10 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/__pycache__/autoregressive_action_model.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/__pycache__/centralized_critic_models.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/__pycache__/centralized_critic_models.cpython-311.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..cdd082e4d30dfd3f20fef75a3814e40f2f899b53 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/__pycache__/centralized_critic_models.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/__pycache__/custom_loss_model.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/__pycache__/custom_loss_model.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d0fdc2d3fd872ee292b4400bdf27cd5fb237e853 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/__pycache__/custom_loss_model.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/__pycache__/fast_model.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/__pycache__/fast_model.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2e0fbcc16b181879e9b52f308cc1a00db6975ed1 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/__pycache__/fast_model.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/__pycache__/mobilenet_v2_encoder.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/__pycache__/mobilenet_v2_encoder.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5d154e78d6b69a77212bc5fadb8c061dbf2a5550 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/__pycache__/mobilenet_v2_encoder.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/__pycache__/mobilenet_v2_with_lstm_models.cpython-311.pyc 
b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/__pycache__/mobilenet_v2_with_lstm_models.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..97a4ac655037036643ffda7249c792a775c0034e Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/__pycache__/mobilenet_v2_with_lstm_models.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/__pycache__/parametric_actions_model.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/__pycache__/parametric_actions_model.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..66e7a1d9783dec67869e833f80b29b23f5c16ae9 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/__pycache__/parametric_actions_model.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/__pycache__/shared_weights_model.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/__pycache__/shared_weights_model.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bae081f58355bf3a6ad4eec059ef463b49e87e7c Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/__pycache__/shared_weights_model.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/__pycache__/simple_rpg_model.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/__pycache__/simple_rpg_model.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9f4d783014f4a0982aeabe00f44dc5e0139301e5 Binary files /dev/null and 
b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/__pycache__/simple_rpg_model.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/action_mask_model.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/action_mask_model.py new file mode 100644 index 0000000000000000000000000000000000000000..92fe99e53847ef5f801cc94dc7e30a2f28838ffc --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/action_mask_model.py @@ -0,0 +1,126 @@ +# @OldAPIStack +from gymnasium.spaces import Dict + +from ray.rllib.models.tf.fcnet import FullyConnectedNetwork +from ray.rllib.models.tf.tf_modelv2 import TFModelV2 +from ray.rllib.models.torch.torch_modelv2 import TorchModelV2 +from ray.rllib.models.torch.fcnet import FullyConnectedNetwork as TorchFC +from ray.rllib.utils.framework import try_import_tf, try_import_torch +from ray.rllib.utils.torch_utils import FLOAT_MIN + +tf1, tf, tfv = try_import_tf() +torch, nn = try_import_torch() + + +class ActionMaskModel(TFModelV2): + """Model that handles simple discrete action masking. + + This assumes the outputs are logits for a single Categorical action dist. + Getting this to work with a more complex output (e.g., if the action space + is a tuple of several distributions) is also possible but left as an + exercise to the reader. 
+ """ + + def __init__( + self, obs_space, action_space, num_outputs, model_config, name, **kwargs + ): + + orig_space = getattr(obs_space, "original_space", obs_space) + assert ( + isinstance(orig_space, Dict) + and "action_mask" in orig_space.spaces + and "observations" in orig_space.spaces + ) + + super().__init__(obs_space, action_space, num_outputs, model_config, name) + + self.internal_model = FullyConnectedNetwork( + orig_space["observations"], + action_space, + num_outputs, + model_config, + name + "_internal", + ) + + # disable action masking --> will likely lead to invalid actions + self.no_masking = model_config["custom_model_config"].get("no_masking", False) + + def forward(self, input_dict, state, seq_lens): + # Extract the available actions tensor from the observation. + action_mask = input_dict["obs"]["action_mask"] + + # Compute the unmasked logits. + logits, _ = self.internal_model({"obs": input_dict["obs"]["observations"]}) + + # If action masking is disabled, directly return unmasked logits + if self.no_masking: + return logits, state + + # Convert action_mask into a [0.0 || -inf]-type mask. + inf_mask = tf.maximum(tf.math.log(action_mask), tf.float32.min) + masked_logits = logits + inf_mask + + # Return masked logits. 
+ return masked_logits, state + + def value_function(self): + return self.internal_model.value_function() + + +class TorchActionMaskModel(TorchModelV2, nn.Module): + """PyTorch version of above ActionMaskingModel.""" + + def __init__( + self, + obs_space, + action_space, + num_outputs, + model_config, + name, + **kwargs, + ): + orig_space = getattr(obs_space, "original_space", obs_space) + assert ( + isinstance(orig_space, Dict) + and "action_mask" in orig_space.spaces + and "observations" in orig_space.spaces + ) + + TorchModelV2.__init__( + self, obs_space, action_space, num_outputs, model_config, name, **kwargs + ) + nn.Module.__init__(self) + + self.internal_model = TorchFC( + orig_space["observations"], + action_space, + num_outputs, + model_config, + name + "_internal", + ) + + # disable action masking --> will likely lead to invalid actions + self.no_masking = False + if "no_masking" in model_config["custom_model_config"]: + self.no_masking = model_config["custom_model_config"]["no_masking"] + + def forward(self, input_dict, state, seq_lens): + # Extract the available actions tensor from the observation. + action_mask = input_dict["obs"]["action_mask"] + + # Compute the unmasked logits. + logits, _ = self.internal_model({"obs": input_dict["obs"]["observations"]}) + + # If action masking is disabled, directly return unmasked logits + if self.no_masking: + return logits, state + + # Convert action_mask into a [0.0 || -inf]-type mask. + inf_mask = torch.clamp(torch.log(action_mask), min=FLOAT_MIN) + masked_logits = logits + inf_mask + + # Return masked logits. 
+ return masked_logits, state + + def value_function(self): + return self.internal_model.value_function() diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/autoregressive_action_dist.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/autoregressive_action_dist.py new file mode 100644 index 0000000000000000000000000000000000000000..fd8f2d53f7789d5a1f670361c10b041123664ab7 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/autoregressive_action_dist.py @@ -0,0 +1,149 @@ +# @OldAPIStack +from ray.rllib.models.tf.tf_action_dist import Categorical, ActionDistribution +from ray.rllib.models.torch.torch_action_dist import ( + TorchCategorical, + TorchDistributionWrapper, +) +from ray.rllib.utils.framework import try_import_tf, try_import_torch + +tf1, tf, tfv = try_import_tf() +torch, nn = try_import_torch() + + +class BinaryAutoregressiveDistribution(ActionDistribution): + """Action distribution P(a1, a2) = P(a1) * P(a2 | a1)""" + + def deterministic_sample(self): + # First, sample a1. + a1_dist = self._a1_distribution() + a1 = a1_dist.deterministic_sample() + + # Sample a2 conditioned on a1. + a2_dist = self._a2_distribution(a1) + a2 = a2_dist.deterministic_sample() + self._action_logp = a1_dist.logp(a1) + a2_dist.logp(a2) + + # Return the action tuple. + return (a1, a2) + + def sample(self): + # First, sample a1. + a1_dist = self._a1_distribution() + a1 = a1_dist.sample() + + # Sample a2 conditioned on a1. + a2_dist = self._a2_distribution(a1) + a2 = a2_dist.sample() + self._action_logp = a1_dist.logp(a1) + a2_dist.logp(a2) + + # Return the action tuple. 
+ return (a1, a2) + + def logp(self, actions): + a1, a2 = actions[:, 0], actions[:, 1] + a1_vec = tf.expand_dims(tf.cast(a1, tf.float32), 1) + a1_logits, a2_logits = self.model.action_model([self.inputs, a1_vec]) + return Categorical(a1_logits).logp(a1) + Categorical(a2_logits).logp(a2) + + def sampled_action_logp(self): + return self._action_logp + + def entropy(self): + a1_dist = self._a1_distribution() + a2_dist = self._a2_distribution(a1_dist.sample()) + return a1_dist.entropy() + a2_dist.entropy() + + def kl(self, other): + a1_dist = self._a1_distribution() + a1_terms = a1_dist.kl(other._a1_distribution()) + + a1 = a1_dist.sample() + a2_terms = self._a2_distribution(a1).kl(other._a2_distribution(a1)) + return a1_terms + a2_terms + + def _a1_distribution(self): + BATCH = tf.shape(self.inputs)[0] + a1_logits, _ = self.model.action_model([self.inputs, tf.zeros((BATCH, 1))]) + a1_dist = Categorical(a1_logits) + return a1_dist + + def _a2_distribution(self, a1): + a1_vec = tf.expand_dims(tf.cast(a1, tf.float32), 1) + _, a2_logits = self.model.action_model([self.inputs, a1_vec]) + a2_dist = Categorical(a2_logits) + return a2_dist + + @staticmethod + def required_model_output_shape(action_space, model_config): + return 16 # controls model output feature vector size + + +class TorchBinaryAutoregressiveDistribution(TorchDistributionWrapper): + """Action distribution P(a1, a2) = P(a1) * P(a2 | a1)""" + + def deterministic_sample(self): + # First, sample a1. + a1_dist = self._a1_distribution() + a1 = a1_dist.deterministic_sample() + + # Sample a2 conditioned on a1. + a2_dist = self._a2_distribution(a1) + a2 = a2_dist.deterministic_sample() + self._action_logp = a1_dist.logp(a1) + a2_dist.logp(a2) + + # Return the action tuple. + return (a1, a2) + + def sample(self): + # First, sample a1. + a1_dist = self._a1_distribution() + a1 = a1_dist.sample() + + # Sample a2 conditioned on a1. 
+ a2_dist = self._a2_distribution(a1) + a2 = a2_dist.sample() + self._action_logp = a1_dist.logp(a1) + a2_dist.logp(a2) + + # Return the action tuple. + return (a1, a2) + + def logp(self, actions): + a1, a2 = actions[:, 0], actions[:, 1] + a1_vec = torch.unsqueeze(a1.float(), 1) + a1_logits, a2_logits = self.model.action_module(self.inputs, a1_vec) + return TorchCategorical(a1_logits).logp(a1) + TorchCategorical(a2_logits).logp( + a2 + ) + + def sampled_action_logp(self): + return self._action_logp + + def entropy(self): + a1_dist = self._a1_distribution() + a2_dist = self._a2_distribution(a1_dist.sample()) + return a1_dist.entropy() + a2_dist.entropy() + + def kl(self, other): + a1_dist = self._a1_distribution() + a1_terms = a1_dist.kl(other._a1_distribution()) + + a1 = a1_dist.sample() + a2_terms = self._a2_distribution(a1).kl(other._a2_distribution(a1)) + return a1_terms + a2_terms + + def _a1_distribution(self): + BATCH = self.inputs.shape[0] + zeros = torch.zeros((BATCH, 1)).to(self.inputs.device) + a1_logits, _ = self.model.action_module(self.inputs, zeros) + a1_dist = TorchCategorical(a1_logits) + return a1_dist + + def _a2_distribution(self, a1): + a1_vec = torch.unsqueeze(a1.float(), 1) + _, a2_logits = self.model.action_module(self.inputs, a1_vec) + a2_dist = TorchCategorical(a2_logits) + return a2_dist + + @staticmethod + def required_model_output_shape(action_space, model_config): + return 16 # controls model output feature vector size diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/autoregressive_action_model.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/autoregressive_action_model.py new file mode 100644 index 0000000000000000000000000000000000000000..8b71e5ab9dc28bbb9bd5f39767350ab51639ea27 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/autoregressive_action_model.py @@ -0,0 +1,162 @@ +# @OldAPIStack +from gymnasium.spaces 
import Discrete, Tuple + +from ray.rllib.models.tf.misc import normc_initializer +from ray.rllib.models.tf.tf_modelv2 import TFModelV2 +from ray.rllib.models.torch.misc import normc_initializer as normc_init_torch +from ray.rllib.models.torch.misc import SlimFC +from ray.rllib.models.torch.torch_modelv2 import TorchModelV2 +from ray.rllib.utils.framework import try_import_tf, try_import_torch + +tf1, tf, tfv = try_import_tf() +torch, nn = try_import_torch() + + +class AutoregressiveActionModel(TFModelV2): + """Implements the `.action_model` branch required above.""" + + def __init__(self, obs_space, action_space, num_outputs, model_config, name): + super(AutoregressiveActionModel, self).__init__( + obs_space, action_space, num_outputs, model_config, name + ) + if action_space != Tuple([Discrete(2), Discrete(2)]): + raise ValueError("This model only supports the [2, 2] action space") + + # Inputs + obs_input = tf.keras.layers.Input(shape=obs_space.shape, name="obs_input") + a1_input = tf.keras.layers.Input(shape=(1,), name="a1_input") + ctx_input = tf.keras.layers.Input(shape=(num_outputs,), name="ctx_input") + + # Output of the model (normally 'logits', but for an autoregressive + # dist this is more like a context/feature layer encoding the obs) + context = tf.keras.layers.Dense( + num_outputs, + name="hidden", + activation=tf.nn.tanh, + kernel_initializer=normc_initializer(1.0), + )(obs_input) + + # V(s) + value_out = tf.keras.layers.Dense( + 1, + name="value_out", + activation=None, + kernel_initializer=normc_initializer(0.01), + )(context) + + # P(a1 | obs) + a1_logits = tf.keras.layers.Dense( + 2, + name="a1_logits", + activation=None, + kernel_initializer=normc_initializer(0.01), + )(ctx_input) + + # P(a2 | a1) + # --note: typically you'd want to implement P(a2 | a1, obs) as follows: + # a2_context = tf.keras.layers.Concatenate(axis=1)( + # [ctx_input, a1_input]) + a2_context = a1_input + a2_hidden = tf.keras.layers.Dense( + 16, + name="a2_hidden", + 
activation=tf.nn.tanh, + kernel_initializer=normc_initializer(1.0), + )(a2_context) + a2_logits = tf.keras.layers.Dense( + 2, + name="a2_logits", + activation=None, + kernel_initializer=normc_initializer(0.01), + )(a2_hidden) + + # Base layers + self.base_model = tf.keras.Model(obs_input, [context, value_out]) + self.base_model.summary() + + # Autoregressive action sampler + self.action_model = tf.keras.Model( + [ctx_input, a1_input], [a1_logits, a2_logits] + ) + self.action_model.summary() + + def forward(self, input_dict, state, seq_lens): + context, self._value_out = self.base_model(input_dict["obs"]) + return context, state + + def value_function(self): + return tf.reshape(self._value_out, [-1]) + + +class TorchAutoregressiveActionModel(TorchModelV2, nn.Module): + """PyTorch version of the AutoregressiveActionModel above.""" + + def __init__(self, obs_space, action_space, num_outputs, model_config, name): + TorchModelV2.__init__( + self, obs_space, action_space, num_outputs, model_config, name + ) + nn.Module.__init__(self) + + if action_space != Tuple([Discrete(2), Discrete(2)]): + raise ValueError("This model only supports the [2, 2] action space") + + # Output of the model (normally 'logits', but for an autoregressive + # dist this is more like a context/feature layer encoding the obs) + self.context_layer = SlimFC( + in_size=obs_space.shape[0], + out_size=num_outputs, + initializer=normc_init_torch(1.0), + activation_fn=nn.Tanh, + ) + + # V(s) + self.value_branch = SlimFC( + in_size=num_outputs, + out_size=1, + initializer=normc_init_torch(0.01), + activation_fn=None, + ) + + # P(a1 | obs) + self.a1_logits = SlimFC( + in_size=num_outputs, + out_size=2, + activation_fn=None, + initializer=normc_init_torch(0.01), + ) + + class _ActionModel(nn.Module): + def __init__(self): + nn.Module.__init__(self) + self.a2_hidden = SlimFC( + in_size=1, + out_size=16, + activation_fn=nn.Tanh, + initializer=normc_init_torch(1.0), + ) + self.a2_logits = SlimFC( + in_size=16, 
+ out_size=2, + activation_fn=None, + initializer=normc_init_torch(0.01), + ) + + def forward(self_, ctx_input, a1_input): + a1_logits = self.a1_logits(ctx_input) + a2_logits = self_.a2_logits(self_.a2_hidden(a1_input)) + return a1_logits, a2_logits + + # P(a2 | a1) + # --note: typically you'd want to implement P(a2 | a1, obs) as follows: + # a2_context = tf.keras.layers.Concatenate(axis=1)( + # [ctx_input, a1_input]) + self.action_module = _ActionModel() + + self._context = None + + def forward(self, input_dict, state, seq_lens): + self._context = self.context_layer(input_dict["obs"]) + return self._context, state + + def value_function(self): + return torch.reshape(self.value_branch(self._context), [-1]) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/centralized_critic_models.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/centralized_critic_models.py new file mode 100644 index 0000000000000000000000000000000000000000..5ccc4448e5428f466ea79d858a9e5cc835dc7959 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/centralized_critic_models.py @@ -0,0 +1,182 @@ +# @OldAPIStack +from gymnasium.spaces import Box + +from ray.rllib.models.modelv2 import ModelV2 +from ray.rllib.models.tf.tf_modelv2 import TFModelV2 +from ray.rllib.models.tf.fcnet import FullyConnectedNetwork +from ray.rllib.models.torch.misc import SlimFC +from ray.rllib.models.torch.torch_modelv2 import TorchModelV2 +from ray.rllib.models.torch.fcnet import FullyConnectedNetwork as TorchFC +from ray.rllib.utils.annotations import override +from ray.rllib.utils.framework import try_import_tf, try_import_torch + +tf1, tf, tfv = try_import_tf() +torch, nn = try_import_torch() + + +class CentralizedCriticModel(TFModelV2): + """Multi-agent model that implements a centralized value function.""" + + def __init__(self, obs_space, action_space, num_outputs, model_config, name): + 
super(CentralizedCriticModel, self).__init__( + obs_space, action_space, num_outputs, model_config, name + ) + # Base of the model + self.model = FullyConnectedNetwork( + obs_space, action_space, num_outputs, model_config, name + ) + + # Central VF maps (obs, opp_obs, opp_act) -> vf_pred + obs = tf.keras.layers.Input(shape=(6,), name="obs") + opp_obs = tf.keras.layers.Input(shape=(6,), name="opp_obs") + opp_act = tf.keras.layers.Input(shape=(2,), name="opp_act") + concat_obs = tf.keras.layers.Concatenate(axis=1)([obs, opp_obs, opp_act]) + central_vf_dense = tf.keras.layers.Dense( + 16, activation=tf.nn.tanh, name="c_vf_dense" + )(concat_obs) + central_vf_out = tf.keras.layers.Dense(1, activation=None, name="c_vf_out")( + central_vf_dense + ) + self.central_vf = tf.keras.Model( + inputs=[obs, opp_obs, opp_act], outputs=central_vf_out + ) + + @override(ModelV2) + def forward(self, input_dict, state, seq_lens): + return self.model.forward(input_dict, state, seq_lens) + + def central_value_function(self, obs, opponent_obs, opponent_actions): + return tf.reshape( + self.central_vf( + [obs, opponent_obs, tf.one_hot(tf.cast(opponent_actions, tf.int32), 2)] + ), + [-1], + ) + + @override(ModelV2) + def value_function(self): + return self.model.value_function() # not used + + +class YetAnotherCentralizedCriticModel(TFModelV2): + """Multi-agent model that implements a centralized value function. + + It assumes the observation is a dict with 'own_obs' and 'opponent_obs', the + former of which can be used for computing actions (i.e., decentralized + execution), and the latter for optimization (i.e., centralized learning). + + This model has two parts: + - An action model that looks at just 'own_obs' to compute actions + - A value model that also looks at the 'opponent_obs' / 'opponent_action' + to compute the value (it does this by using the 'obs_flat' tensor). 
+ """ + + def __init__(self, obs_space, action_space, num_outputs, model_config, name): + super(YetAnotherCentralizedCriticModel, self).__init__( + obs_space, action_space, num_outputs, model_config, name + ) + + self.action_model = FullyConnectedNetwork( + Box(low=0, high=1, shape=(6,)), # one-hot encoded Discrete(6) + action_space, + num_outputs, + model_config, + name + "_action", + ) + + self.value_model = FullyConnectedNetwork( + obs_space, action_space, 1, model_config, name + "_vf" + ) + + def forward(self, input_dict, state, seq_lens): + self._value_out, _ = self.value_model( + {"obs": input_dict["obs_flat"]}, state, seq_lens + ) + return self.action_model({"obs": input_dict["obs"]["own_obs"]}, state, seq_lens) + + def value_function(self): + return tf.reshape(self._value_out, [-1]) + + +class TorchCentralizedCriticModel(TorchModelV2, nn.Module): + """Multi-agent model that implements a centralized VF.""" + + def __init__(self, obs_space, action_space, num_outputs, model_config, name): + TorchModelV2.__init__( + self, obs_space, action_space, num_outputs, model_config, name + ) + nn.Module.__init__(self) + + # Base of the model + self.model = TorchFC(obs_space, action_space, num_outputs, model_config, name) + + # Central VF maps (obs, opp_obs, opp_act) -> vf_pred + input_size = 6 + 6 + 2 # obs + opp_obs + opp_act + self.central_vf = nn.Sequential( + SlimFC(input_size, 16, activation_fn=nn.Tanh), + SlimFC(16, 1), + ) + + @override(ModelV2) + def forward(self, input_dict, state, seq_lens): + model_out, _ = self.model(input_dict, state, seq_lens) + return model_out, [] + + def central_value_function(self, obs, opponent_obs, opponent_actions): + input_ = torch.cat( + [ + obs, + opponent_obs, + torch.nn.functional.one_hot(opponent_actions.long(), 2).float(), + ], + 1, + ) + return torch.reshape(self.central_vf(input_), [-1]) + + @override(ModelV2) + def value_function(self): + return self.model.value_function() # not used + + +class 
class YetAnotherTorchCentralizedCriticModel(TorchModelV2, nn.Module):
    """Multi-agent model that implements a centralized value function.

    It assumes the observation is a dict with 'own_obs' and 'opponent_obs', the
    former of which can be used for computing actions (i.e., decentralized
    execution), and the latter for optimization (i.e., centralized learning).

    This model has two parts:
    - An action model that looks at just 'own_obs' to compute actions
    - A value model that also looks at the 'opponent_obs' / 'opponent_action'
      to compute the value (it does this by using the 'obs_flat' tensor).
    """

    def __init__(self, obs_space, action_space, num_outputs, model_config, name):
        TorchModelV2.__init__(
            self, obs_space, action_space, num_outputs, model_config, name
        )
        nn.Module.__init__(self)

        # Policy head: sees only the agent's own (one-hot) observation.
        self.action_model = TorchFC(
            Box(low=0, high=1, shape=(6,)),  # one-hot encoded Discrete(6)
            action_space,
            num_outputs,
            model_config,
            name + "_action",
        )

        # Value head: sees the full flattened obs (own + opponent info).
        self.value_model = TorchFC(
            obs_space, action_space, 1, model_config, name + "_vf"
        )
        self._model_in = None

    def forward(self, input_dict, state, seq_lens):
        # Remember the inputs so a later `value_function()` call can replay
        # them through the value head.
        self._model_in = (input_dict["obs_flat"], state, seq_lens)
        return self.action_model({"obs": input_dict["obs"]["own_obs"]}, state, seq_lens)

    def value_function(self):
        obs_flat, state, seq_lens = self._model_in
        value_out, _ = self.value_model({"obs": obs_flat}, state, seq_lens)
        return torch.reshape(value_out, [-1])
class CustomLossModel(TFModelV2):
    """Custom model that adds an imitation loss on top of the policy loss."""

    def __init__(self, obs_space, action_space, num_outputs, model_config, name):
        super().__init__(obs_space, action_space, num_outputs, model_config, name)

        self.fcnet = FullyConnectedNetwork(
            self.obs_space, self.action_space, num_outputs, model_config, name="fcnet"
        )

    @override(ModelV2)
    def forward(self, input_dict, state, seq_lens):
        # Plain delegation to the wrapped FCNet.
        return self.fcnet(input_dict, state, seq_lens)

    @override(ModelV2)
    def value_function(self):
        # Plain delegation to the wrapped FCNet.
        return self.fcnet.value_function()

    @override(ModelV2)
    def custom_loss(self, policy_loss, loss_inputs):
        # Create a new input reader per worker.
        demo_reader = JsonReader(self.model_config["custom_model_config"]["input_files"])
        demo_ops = demo_reader.tf_input_ops()

        # Build a graph copy with weight sharing to score the demo batch.
        demo_obs = restore_original_dimensions(
            tf.cast(demo_ops["obs"], tf.float32), self.obs_space
        )
        demo_logits, _ = self.forward({"obs": demo_obs}, [], None)

        # Imitation loss = mean negative log-likelihood of the demo actions.
        expert_dist = Categorical(demo_logits, self.model_config)
        self.policy_loss = policy_loss
        self.imitation_loss = tf.reduce_mean(-expert_dist.logp(demo_ops["actions"]))
        # Mix the imitation term (weighted 10x) into the policy loss.
        return policy_loss + 10 * self.imitation_loss

    def metrics(self):
        return {
            "policy_loss": self.policy_loss,
            "imitation_loss": self.imitation_loss,
        }
class TorchCustomLossModel(TorchModelV2, nn.Module):
    """PyTorch version of the CustomLossModel above."""

    def __init__(
        self, obs_space, action_space, num_outputs, model_config, name, input_files
    ):
        super().__init__(obs_space, action_space, num_outputs, model_config, name)
        nn.Module.__init__(self)

        self.input_files = input_files
        # Create a new input reader per worker.
        self.reader = JsonReader(self.input_files)
        self.fcnet = TorchFC(
            self.obs_space, self.action_space, num_outputs, model_config, name="fcnet"
        )

    @override(ModelV2)
    def forward(self, input_dict, state, seq_lens):
        # Plain delegation to the wrapped FCNet.
        return self.fcnet(input_dict, state, seq_lens)

    @override(ModelV2)
    def value_function(self):
        # Plain delegation to the wrapped FCNet.
        return self.fcnet.value_function()

    @override(ModelV2)
    def custom_loss(self, policy_loss, loss_inputs):
        """Calculates a custom loss on top of the given policy_loss(es).

        Args:
            policy_loss (List[TensorType]): The list of already calculated
                policy losses (as many as there are optimizers).
            loss_inputs: Struct of np.ndarrays holding the
                entire train batch.

        Returns:
            List[TensorType]: The altered list of policy losses. In case the
                custom loss should have its own optimizer, make sure the
                returned list is one larger than the incoming policy_loss list.
                In case you simply want to mix in the custom loss into the
                already calculated policy losses, return a list of altered
                policy losses (as done in this example below).
        """
        # Pull the next demonstration batch from our input files.
        demo_batch = self.reader.next()
        device = policy_loss[0].device

        # Score the demo observations with a weight-sharing forward pass.
        demo_obs = restore_original_dimensions(
            torch.from_numpy(demo_batch["obs"]).float().to(device),
            self.obs_space,
            tensorlib="torch",
        )
        demo_logits, _ = self.forward({"obs": demo_obs}, [], None)

        # Imitation loss = mean negative log-likelihood of the demo actions.
        expert_dist = TorchCategorical(demo_logits, self.model_config)
        demo_actions = torch.from_numpy(demo_batch["actions"]).to(device)
        imitation_loss = torch.mean(-expert_dist.logp(demo_actions))

        # Stash plain-float metrics for `metrics()`.
        self.imitation_loss_metric = imitation_loss.item()
        self.policy_loss_metric = np.mean([loss.item() for loss in policy_loss])

        # Mix the (10x weighted) imitation term into each policy loss.
        # Alternatively (if custom loss has its own optimizer):
        # return policy_loss + [10 * self.imitation_loss]
        return [single_loss + 10 * imitation_loss for single_loss in policy_loss]

    def metrics(self):
        return {
            "policy_loss": self.policy_loss_metric,
            "imitation_loss": self.imitation_loss_metric,
        }
class TorchFastModel(TorchModelV2, nn.Module):
    """Torch version of FastModel (tf): learns only a single bias weight."""

    def __init__(self, obs_space, action_space, num_outputs, model_config, name):
        TorchModelV2.__init__(
            self, obs_space, action_space, num_outputs, model_config, name
        )
        nn.Module.__init__(self)

        # The single trainable scalar of this model.
        self.bias = nn.Parameter(
            torch.tensor([0.0], dtype=torch.float32, requires_grad=True)
        )

        # Only needed to give some params to the optimizer (even though,
        # they are never used anywhere).
        self.dummy_layer = SlimFC(1, 1)
        self._output = None

    @override(ModelV2)
    def forward(self, input_dict, state, seq_lens):
        # Broadcast the bias over a [batch, num_outputs]-shaped zero tensor.
        batch_size = input_dict["obs"].shape[0]
        zeros = torch.zeros(size=(batch_size, self.num_outputs)).to(self.bias.device)
        self._output = self.bias + zeros
        return self._output, []

    @override(ModelV2)
    def value_function(self):
        assert self._output is not None, "must call forward first!"
        # Fake value: mean over the output dimension.
        return torch.reshape(torch.mean(self._output, -1), [-1])
class MobileNetV2Encoder(TorchModel, Encoder):
    """A MobileNet v2 encoder for RLlib."""

    def __init__(self, config):
        super().__init__(config)
        # Pre-trained MobileNet v2 fetched via torch.hub.
        self.net = torch.hub.load(
            "pytorch/vision:v0.6.0", "mobilenet_v2", pretrained=True
        )
        if config.freeze:
            # We don't want to train this encoder, so freeze its parameters!
            for param in self.net.parameters():
                param.requires_grad = False

    def _forward(self, input_dict, **kwargs):
        # Encode the image batch into MobileNet's flat 1000-dim output.
        return {ENCODER_OUT: self.net(input_dict["obs"])}
class MobileV2PlusRNNModel(RecurrentNetwork):
    """A conv. + recurrent keras net example using a pre-trained MobileNet."""

    def __init__(
        self, obs_space, action_space, num_outputs, model_config, name, cnn_shape
    ):

        super(MobileV2PlusRNNModel, self).__init__(
            obs_space, action_space, num_outputs, model_config, name
        )

        # LSTM cell size (h and c state vectors each have this length).
        self.cell_size = 16
        # Flattened size of one image frame: H * W * C (order per cnn_shape).
        visual_size = cnn_shape[0] * cnn_shape[1] * cnn_shape[2]

        # Symbolic inputs for the recurrent state and sequence lengths.
        state_in_h = tf.keras.layers.Input(shape=(self.cell_size,), name="h")
        state_in_c = tf.keras.layers.Input(shape=(self.cell_size,), name="c")
        seq_in = tf.keras.layers.Input(shape=(), name="seq_in", dtype=tf.int32)

        # Time-major-less input: [batch, time, flattened-image].
        inputs = tf.keras.layers.Input(shape=(None, visual_size), name="visual_inputs")

        # Fold batch and time dims together so the CNN sees single frames.
        input_visual = inputs
        input_visual = tf.reshape(
            input_visual, [-1, cnn_shape[0], cnn_shape[1], cnn_shape[2]]
        )
        cnn_input = tf.keras.layers.Input(shape=cnn_shape, name="cnn_input")

        # Untrained MobileNetV2 (weights=None) used as the vision stem.
        cnn_model = tf.keras.applications.mobilenet_v2.MobileNetV2(
            alpha=1.0,
            include_top=True,
            weights=None,
            input_tensor=cnn_input,
            pooling=None,
        )
        vision_out = cnn_model(input_visual)
        # Unfold back to [batch, time, cnn-features] for the LSTM.
        vision_out = tf.reshape(
            vision_out, [-1, tf.shape(inputs)[1], vision_out.shape.as_list()[-1]]
        )

        # Recurrent core; mask out padded timesteps via seq_in.
        lstm_out, state_h, state_c = tf.keras.layers.LSTM(
            self.cell_size, return_sequences=True, return_state=True, name="lstm"
        )(
            inputs=vision_out,
            mask=tf.sequence_mask(seq_in),
            initial_state=[state_in_h, state_in_c],
        )

        # Postprocess LSTM output with another hidden layer and compute values.
        logits = tf.keras.layers.Dense(
            self.num_outputs, activation=tf.keras.activations.linear, name="logits"
        )(lstm_out)
        values = tf.keras.layers.Dense(1, activation=None, name="values")(lstm_out)

        # Create the RNN model
        self.rnn_model = tf.keras.Model(
            inputs=[inputs, seq_in, state_in_h, state_in_c],
            outputs=[logits, values, state_h, state_c],
        )
        self.rnn_model.summary()

    @override(RecurrentNetwork)
    def forward_rnn(self, inputs, state, seq_lens):
        # state is [h, c]; the keras model also emits the value branch output.
        model_out, self._value_out, h, c = self.rnn_model([inputs, seq_lens] + state)
        return model_out, [h, c]

    @override(ModelV2)
    def get_initial_state(self):
        # Zero-initialized h and c vectors.
        return [
            np.zeros(self.cell_size, np.float32),
            np.zeros(self.cell_size, np.float32),
        ]

    @override(ModelV2)
    def value_function(self):
        # Flatten the last computed value branch output to [batch * time].
        return tf.reshape(self._value_out, [-1])
+ vision_in = torch.reshape(inputs, [-1] + self.cnn_shape) + vision_out = self.cnn_model(vision_in) + # Flatten. + vision_out_time_ranked = torch.reshape( + vision_out, [inputs.shape[0], inputs.shape[1], vision_out.shape[-1]] + ) + if len(state[0].shape) == 2: + state[0] = state[0].unsqueeze(0) + state[1] = state[1].unsqueeze(0) + # Forward through LSTM. + self._features, [h, c] = self.lstm(vision_out_time_ranked, state) + # Forward LSTM out through logits layer and value layer. + logits = self.logits(self._features) + return logits, [h.squeeze(0), c.squeeze(0)] + + @override(ModelV2) + def get_initial_state(self): + # Place hidden states on same device as model. + h = [ + list(self.cnn_model.modules())[-1] + .weight.new(1, self.lstm_state_size) + .zero_() + .squeeze(0), + list(self.cnn_model.modules())[-1] + .weight.new(1, self.lstm_state_size) + .zero_() + .squeeze(0), + ] + return h + + @override(ModelV2) + def value_function(self): + assert self._features is not None, "must call forward() first" + return torch.reshape(self.value_branch(self._features), [-1]) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/neural_computer.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/neural_computer.py new file mode 100644 index 0000000000000000000000000000000000000000..d863f71e62d7426c360cffc73042db3c393a7f78 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/neural_computer.py @@ -0,0 +1,247 @@ +# @OldAPIStack +from collections import OrderedDict +import gymnasium as gym +from typing import Union, Dict, List, Tuple + +from ray.rllib.models.torch.torch_modelv2 import TorchModelV2 +from ray.rllib.models.torch.misc import SlimFC +from ray.rllib.utils.framework import try_import_torch +from ray.rllib.utils.typing import ModelConfigDict, TensorType + +try: + from dnc import DNC +except ModuleNotFoundError: + print("dnc module not found. 
class DNCMemory(TorchModelV2, nn.Module):
    """Differentiable Neural Computer wrapper around ixaxaar's DNC implementation,
    see https://github.com/ixaxaar/pytorch-dnc"""

    DEFAULT_CONFIG = {
        "dnc_model": DNC,
        # Number of controller hidden layers
        "num_hidden_layers": 1,
        # Number of weights per controller hidden layer
        "hidden_size": 64,
        # Number of LSTM units
        "num_layers": 1,
        # Number of read heads, i.e. how many addrs are read at once
        "read_heads": 4,
        # Number of memory cells in the controller
        "nr_cells": 32,
        # Size of each cell
        "cell_size": 16,
        # LSTM activation function
        "nonlinearity": "tanh",
        # Observation goes through this torch.nn.Module before
        # feeding to the DNC
        "preprocessor": torch.nn.Sequential(torch.nn.Linear(64, 64), torch.nn.Tanh()),
        # Input size to the preprocessor
        "preprocessor_input_size": 64,
        # The output size of the preprocessor
        # and the input size of the dnc
        "preprocessor_output_size": 64,
    }

    # Order of the memory tensors in the flattened rllib state list
    # (state[3:] in `unpack_state`); must match `pack_state`.
    MEMORY_KEYS = [
        "memory",
        "link_matrix",
        "precedence",
        "read_weights",
        "write_weights",
        "usage_vector",
    ]

    def __init__(
        self,
        obs_space: gym.spaces.Space,
        action_space: gym.spaces.Space,
        num_outputs: int,
        model_config: ModelConfigDict,
        name: str,
        **custom_model_kwargs,
    ):
        nn.Module.__init__(self)
        super(DNCMemory, self).__init__(
            obs_space, action_space, num_outputs, model_config, name
        )
        self.num_outputs = num_outputs
        self.obs_dim = gym.spaces.utils.flatdim(obs_space)
        self.act_dim = gym.spaces.utils.flatdim(action_space)

        # Merge user kwargs over the defaults.
        self.cfg = dict(self.DEFAULT_CONFIG, **custom_model_kwargs)
        assert (
            self.cfg["num_layers"] == 1
        ), "num_layers != 1 has not been implemented yet"
        # Last computed value-branch output (set in `forward`).
        self.cur_val = None

        # Project the flat obs to the preprocessor's expected input size.
        self.preprocessor = torch.nn.Sequential(
            torch.nn.Linear(self.obs_dim, self.cfg["preprocessor_input_size"]),
            self.cfg["preprocessor"],
        )

        self.logit_branch = SlimFC(
            in_size=self.cfg["hidden_size"],
            out_size=self.num_outputs,
            activation_fn=None,
            initializer=torch.nn.init.xavier_uniform_,
        )

        self.value_branch = SlimFC(
            in_size=self.cfg["hidden_size"],
            out_size=1,
            activation_fn=None,
            initializer=torch.nn.init.xavier_uniform_,
        )

        # Built lazily in `forward` once the input device is known.
        self.dnc: Union[None, DNC] = None

    def get_initial_state(self) -> List[TensorType]:
        # Two controller LSTM states (h, c), one flat read-vector tensor,
        # and six memory tensors -> 9 state tensors total.
        ctrl_hidden = [
            torch.zeros(self.cfg["num_hidden_layers"], self.cfg["hidden_size"]),
            torch.zeros(self.cfg["num_hidden_layers"], self.cfg["hidden_size"]),
        ]
        m = self.cfg["nr_cells"]
        r = self.cfg["read_heads"]
        w = self.cfg["cell_size"]
        memory = [
            torch.zeros(m, w),  # memory
            torch.zeros(1, m, m),  # link_matrix
            torch.zeros(1, m),  # precedence
            torch.zeros(r, m),  # read_weights
            torch.zeros(1, m),  # write_weights
            torch.zeros(m),  # usage_vector
        ]

        read_vecs = torch.zeros(w * r)

        state = [*ctrl_hidden, read_vecs, *memory]
        assert len(state) == 9
        return state

    def value_function(self) -> TensorType:
        assert self.cur_val is not None, "must call forward() first"
        return self.cur_val

    def unpack_state(
        self,
        state: List[TensorType],
    ) -> Tuple[List[Tuple[TensorType, TensorType]], Dict[str, TensorType], TensorType]:
        """Given a list of tensors, reformat for self.dnc input"""
        assert len(state) == 9, "Failed to verify unpacked state"
        # state[0:2] = controller (h, c); rllib batches on dim 0, the DNC
        # expects the layer dim first -> permute to (layers, B, hidden).
        ctrl_hidden: List[Tuple[TensorType, TensorType]] = [
            (
                state[0].permute(1, 0, 2).contiguous(),
                state[1].permute(1, 0, 2).contiguous(),
            )
        ]
        # state[2] = read vectors; state[3:] = memory tensors (MEMORY_KEYS order).
        read_vecs: TensorType = state[2]
        memory: List[TensorType] = state[3:]
        memory_dict: OrderedDict[str, TensorType] = OrderedDict(
            zip(self.MEMORY_KEYS, memory)
        )

        return ctrl_hidden, memory_dict, read_vecs

    def pack_state(
        self,
        ctrl_hidden: List[Tuple[TensorType, TensorType]],
        memory_dict: Dict[str, TensorType],
        read_vecs: TensorType,
    ) -> List[TensorType]:
        """Given the dnc output, pack it into a list of tensors
        for rllib state. Order is ctrl_hidden, read_vecs, memory_dict"""
        state = []
        # Inverse of `unpack_state`: batch dim back to the front.
        ctrl_hidden = [
            ctrl_hidden[0][0].permute(1, 0, 2),
            ctrl_hidden[0][1].permute(1, 0, 2),
        ]
        state += ctrl_hidden
        assert len(state) == 2, "Failed to verify packed state"
        state.append(read_vecs)
        assert len(state) == 3, "Failed to verify packed state"
        state += memory_dict.values()
        assert len(state) == 9, "Failed to verify packed state"
        return state

    def validate_unpack(self, dnc_output, unpacked_state):
        """Ensure the unpacked state shapes match the DNC output"""
        s_ctrl_hidden, s_memory_dict, s_read_vecs = unpacked_state
        ctrl_hidden, memory_dict, read_vecs = dnc_output

        for i in range(len(ctrl_hidden)):
            for j in range(len(ctrl_hidden[i])):
                assert s_ctrl_hidden[i][j].shape == ctrl_hidden[i][j].shape, (
                    "Controller state mismatch: got "
                    f"{s_ctrl_hidden[i][j].shape} should be "
                    f"{ctrl_hidden[i][j].shape}"
                )

        for k in memory_dict:
            assert s_memory_dict[k].shape == memory_dict[k].shape, (
                "Memory state mismatch at key "
                f"{k}: got {s_memory_dict[k].shape} should be "
                f"{memory_dict[k].shape}"
            )

        assert s_read_vecs.shape == read_vecs.shape, (
            "Read state mismatch: got "
            f"{s_read_vecs.shape} should be "
            f"{read_vecs.shape}"
        )

    def build_dnc(self, device_idx: Union[int, None]) -> None:
        # Instantiate the wrapped DNC on the given GPU (or CPU if None/-1).
        self.dnc = self.cfg["dnc_model"](
            input_size=self.cfg["preprocessor_output_size"],
            hidden_size=self.cfg["hidden_size"],
            num_layers=self.cfg["num_layers"],
            num_hidden_layers=self.cfg["num_hidden_layers"],
            read_heads=self.cfg["read_heads"],
            cell_size=self.cfg["cell_size"],
            nr_cells=self.cfg["nr_cells"],
            nonlinearity=self.cfg["nonlinearity"],
            gpu_id=device_idx,
        )

    def forward(
        self,
        input_dict: Dict[str, TensorType],
        state: List[TensorType],
        seq_lens: TensorType,
    ) -> Tuple[TensorType, List[TensorType]]:

        flat = input_dict["obs_flat"]
        # Batch and Time
        # Forward expects outputs as [B, T, logits]
        B = len(seq_lens)
        # NOTE(review): assumes flat's leading dim is exactly B * T — i.e.
        # all sequences padded to equal length; confirm against the caller.
        T = flat.shape[0] // B

        # Deconstruct batch into batch and time dimensions: [B, T, feats]
        flat = torch.reshape(flat, [-1, T] + list(flat.shape[1:]))

        # First run: lazily build the DNC on the input's device; the DNC
        # treats an all-None hidden tuple as "initialize fresh state".
        if self.dnc is None:
            gpu_id = flat.device.index if flat.device.index is not None else -1
            self.build_dnc(gpu_id)
            hidden = (None, None, None)

        else:
            hidden = self.unpack_state(state)  # type: ignore

        # Run thru preprocessor before DNC
        z = self.preprocessor(flat.reshape(B * T, self.obs_dim))
        z = z.reshape(B, T, self.cfg["preprocessor_output_size"])
        output, hidden = self.dnc(z, hidden)
        packed_state = self.pack_state(*hidden)

        # Compute action/value from output
        logits = self.logit_branch(output.view(B * T, -1))
        values = self.value_branch(output.view(B * T, -1))

        self.cur_val = values.squeeze(1)

        return logits, packed_state
class ParametricActionsModel(DistributionalQTFModel):
    """Parametric action model that handles the dot product and masking.

    This assumes the outputs are logits for a single Categorical action dist.
    Getting this to work with a more complex output (e.g., if the action space
    is a tuple of several distributions) is also possible but left as an
    exercise to the reader.
    """

    def __init__(
        self,
        obs_space,
        action_space,
        num_outputs,
        model_config,
        name,
        true_obs_shape=(4,),
        action_embed_size=2,
        **kw
    ):
        super(ParametricActionsModel, self).__init__(
            obs_space, action_space, num_outputs, model_config, name, **kw
        )
        # Embeds the true ("cart") observation into the action-embedding space.
        self.action_embed_model = FullyConnectedNetwork(
            Box(-1, 1, shape=true_obs_shape),
            action_space,
            action_embed_size,
            model_config,
            name + "_action_embed",
        )

    def forward(self, input_dict, state, seq_lens):
        # Pull the available-action embeddings and the validity mask out of
        # the dict observation.
        avail_actions = input_dict["obs"]["avail_actions"]
        action_mask = input_dict["obs"]["action_mask"]

        # Predicted action embedding from the true observation.
        pred_embed, _ = self.action_embed_model({"obs": input_dict["obs"]["cart"]})

        # [BATCH, EMBED] -> [BATCH, 1, EMBED] so it broadcasts against the
        # [BATCH, MAX_ACTIONS, EMBED] avail-actions tensor.
        intent = tf.expand_dims(pred_embed, 1)

        # Batch dot product => logits of shape [BATCH, MAX_ACTIONS].
        scores = tf.reduce_sum(avail_actions * intent, axis=2)

        # Mask out invalid actions (use tf.float32.min for stability).
        neg_inf_mask = tf.maximum(tf.math.log(action_mask), tf.float32.min)
        return scores + neg_inf_mask, state

    def value_function(self):
        return self.action_embed_model.value_function()
class TorchParametricActionsModel(DQNTorchModel):
    """PyTorch version of above ParametricActionsModel."""

    def __init__(
        self,
        obs_space,
        action_space,
        num_outputs,
        model_config,
        name,
        true_obs_shape=(4,),
        action_embed_size=2,
        **kw
    ):
        DQNTorchModel.__init__(
            self, obs_space, action_space, num_outputs, model_config, name, **kw
        )

        # Embeds the true ("cart") observation into the action-embedding space.
        self.action_embed_model = TorchFC(
            Box(-1, 1, shape=true_obs_shape),
            action_space,
            action_embed_size,
            model_config,
            name + "_action_embed",
        )

    def forward(self, input_dict, state, seq_lens):
        # Pull the available-action embeddings and the validity mask out of
        # the dict observation.
        avail_actions = input_dict["obs"]["avail_actions"]
        action_mask = input_dict["obs"]["action_mask"]

        # Predicted action embedding from the true observation.
        pred_embed, _ = self.action_embed_model({"obs": input_dict["obs"]["cart"]})

        # [BATCH, EMBED] -> [BATCH, 1, EMBED] so it broadcasts against the
        # [BATCH, MAX_ACTIONS, EMBED] avail-actions tensor.
        intent = torch.unsqueeze(pred_embed, 1)

        # Batch dot product => logits of shape [BATCH, MAX_ACTIONS].
        scores = torch.sum(avail_actions * intent, dim=2)

        # Mask out invalid actions (use -inf to tag invalid).
        # These are then recognized by the EpsilonGreedy exploration component
        # as invalid actions that are not to be chosen.
        neg_inf_mask = torch.clamp(torch.log(action_mask), FLOAT_MIN, FLOAT_MAX)

        return scores + neg_inf_mask, state

    def value_function(self):
        return self.action_embed_model.value_function()
class ParametricActionsModelThatLearnsEmbeddings(DistributionalQTFModel):
    """Same as the above ParametricActionsModel.

    However, this version also learns the action embeddings.
    """

    def __init__(
        self,
        obs_space,
        action_space,
        num_outputs,
        model_config,
        name,
        true_obs_shape=(4,),
        action_embed_size=2,
        **kw
    ):
        super(ParametricActionsModelThatLearnsEmbeddings, self).__init__(
            obs_space, action_space, num_outputs, model_config, name, **kw
        )

        # Action ids 1..num_outputs; id 0 is reserved for "invalid" below.
        action_ids_shifted = tf.constant(
            list(range(1, num_outputs + 1)), dtype=tf.float32
        )

        # Symbolic inputs for the keras sub-model built below.
        obs_cart = tf.keras.layers.Input(shape=true_obs_shape, name="obs_cart")
        valid_avail_actions_mask = tf.keras.layers.Input(
            shape=(num_outputs,), name="valid_avail_actions_mask"
        )

        self.pred_action_embed_model = FullyConnectedNetwork(
            Box(-1, 1, shape=true_obs_shape),
            action_space,
            action_embed_size,
            model_config,
            name + "_pred_action_embed",
        )

        # Compute the predicted action embedding
        pred_action_embed, _ = self.pred_action_embed_model({"obs": obs_cart})
        _value_out = self.pred_action_embed_model.value_function()

        # Expand the model output to [BATCH, 1, EMBED_SIZE]. Note that the
        # avail actions tensor is of shape [BATCH, MAX_ACTIONS, EMBED_SIZE].
        intent_vector = tf.expand_dims(pred_action_embed, 1)

        # Masked action ids: invalid slots become 0 (the dummy embedding id).
        valid_avail_actions = action_ids_shifted * valid_avail_actions_mask
        # Embedding for valid available actions which will be learned.
        # Embedding vector for 0 is an invalid embedding (a "dummy embedding").
        valid_avail_actions_embed = tf.keras.layers.Embedding(
            input_dim=num_outputs + 1,
            output_dim=action_embed_size,
            name="action_embed_matrix",
        )(valid_avail_actions)

        # Batch dot product => shape of logits is [BATCH, MAX_ACTIONS].
        action_logits = tf.reduce_sum(valid_avail_actions_embed * intent_vector, axis=2)

        # Mask out invalid actions (use tf.float32.min for stability)
        inf_mask = tf.maximum(tf.math.log(valid_avail_actions_mask), tf.float32.min)

        action_logits = action_logits + inf_mask

        # Wrap the whole computation in one callable keras model.
        self.param_actions_model = tf.keras.Model(
            inputs=[obs_cart, valid_avail_actions_mask],
            outputs=[action_logits, _value_out],
        )
        self.param_actions_model.summary()

    def forward(self, input_dict, state, seq_lens):
        # Extract the available actions mask tensor from the observation.
        valid_avail_actions_mask = input_dict["obs"]["valid_avail_actions_mask"]

        action_logits, self._value_out = self.param_actions_model(
            [input_dict["obs"]["cart"], valid_avail_actions_mask]
        )

        return action_logits, state

    def value_function(self):
        return self._value_out
class TF2SharedWeightsModel(TFModelV2):
    """Example of weight sharing between two different TFModelV2s.

    NOTE: This will only work for tf2.x. When running with config.framework=tf,
    use SharedWeightsModel1 and SharedWeightsModel2 below, instead!

    The shared (single) layer is simply defined outside of the two Models,
    then used by both Models in their forward pass.
    """

    def __init__(
        self, observation_space, action_space, num_outputs, model_config, name
    ):
        super().__init__(
            observation_space, action_space, num_outputs, model_config, name
        )

        global TF2_GLOBAL_SHARED_LAYER
        # Lazily create the one layer that all instances share.
        if TF2_GLOBAL_SHARED_LAYER is None:
            TF2_GLOBAL_SHARED_LAYER = tf.keras.layers.Dense(
                units=64, activation=tf.nn.relu, name="fc1"
            )

        inputs = tf.keras.layers.Input(observation_space.shape)
        shared_out = TF2_GLOBAL_SHARED_LAYER(inputs)
        logits = tf.keras.layers.Dense(
            units=num_outputs, activation=None, name="fc_out"
        )(shared_out)
        value = tf.keras.layers.Dense(units=1, activation=None, name="value_out")(
            shared_out
        )
        self.base_model = tf.keras.models.Model(inputs, [logits, value])

    @override(ModelV2)
    def forward(self, input_dict, state, seq_lens):
        out, self._value_out = self.base_model(input_dict["obs"])
        return out, []

    @override(ModelV2)
    def value_function(self):
        return tf.reshape(self._value_out, [-1])
+ """ + + def __init__( + self, observation_space, action_space, num_outputs, model_config, name + ): + super().__init__( + observation_space, action_space, num_outputs, model_config, name + ) + + inputs = tf.keras.layers.Input(observation_space.shape) + with tf1.variable_scope( + tf1.VariableScope(tf1.AUTO_REUSE, "shared"), + reuse=tf1.AUTO_REUSE, + auxiliary_name_scope=False, + ): + last_layer = tf.keras.layers.Dense( + units=64, activation=tf.nn.relu, name="fc1" + )(inputs) + output = tf.keras.layers.Dense( + units=num_outputs, activation=None, name="fc_out" + )(last_layer) + vf = tf.keras.layers.Dense(units=1, activation=None, name="value_out")( + last_layer + ) + self.base_model = tf.keras.models.Model(inputs, [output, vf]) + + @override(ModelV2) + def forward(self, input_dict, state, seq_lens): + out, self._value_out = self.base_model(input_dict["obs"]) + return out, [] + + @override(ModelV2) + def value_function(self): + return tf.reshape(self._value_out, [-1]) + + +class SharedWeightsModel2(TFModelV2): + """The "other" TFModelV2 using the same shared space as the one above.""" + + def __init__( + self, observation_space, action_space, num_outputs, model_config, name + ): + super().__init__( + observation_space, action_space, num_outputs, model_config, name + ) + + inputs = tf.keras.layers.Input(observation_space.shape) + + # Weights shared with SharedWeightsModel1. 
+ with tf1.variable_scope( + tf1.VariableScope(tf1.AUTO_REUSE, "shared"), + reuse=tf1.AUTO_REUSE, + auxiliary_name_scope=False, + ): + last_layer = tf.keras.layers.Dense( + units=64, activation=tf.nn.relu, name="fc1" + )(inputs) + output = tf.keras.layers.Dense( + units=num_outputs, activation=None, name="fc_out" + )(last_layer) + vf = tf.keras.layers.Dense(units=1, activation=None, name="value_out")( + last_layer + ) + self.base_model = tf.keras.models.Model(inputs, [output, vf]) + + @override(ModelV2) + def forward(self, input_dict, state, seq_lens): + out, self._value_out = self.base_model(input_dict["obs"]) + return out, [] + + @override(ModelV2) + def value_function(self): + return tf.reshape(self._value_out, [-1]) + + +TORCH_GLOBAL_SHARED_LAYER = None +if torch: + # The global, shared layer to be used by both models. + TORCH_GLOBAL_SHARED_LAYER = SlimFC( + 64, + 64, + activation_fn=nn.ReLU, + initializer=torch.nn.init.xavier_uniform_, + ) + + +class TorchSharedWeightsModel(TorchModelV2, nn.Module): + """Example of weight sharing between two different TorchModelV2s. + + The shared (single) layer is simply defined outside of the two Models, + then used by both Models in their forward pass. + """ + + def __init__( + self, observation_space, action_space, num_outputs, model_config, name + ): + TorchModelV2.__init__( + self, observation_space, action_space, num_outputs, model_config, name + ) + nn.Module.__init__(self) + + # Non-shared initial layer. + self.first_layer = SlimFC( + int(np.prod(observation_space.shape)), + 64, + activation_fn=nn.ReLU, + initializer=torch.nn.init.xavier_uniform_, + ) + + # Non-shared final layer. 
+ self.last_layer = SlimFC( + 64, + self.num_outputs, + activation_fn=None, + initializer=torch.nn.init.xavier_uniform_, + ) + self.vf = SlimFC( + 64, + 1, + activation_fn=None, + initializer=torch.nn.init.xavier_uniform_, + ) + self._global_shared_layer = TORCH_GLOBAL_SHARED_LAYER + self._output = None + + @override(ModelV2) + def forward(self, input_dict, state, seq_lens): + out = self.first_layer(input_dict["obs"]) + self._output = self._global_shared_layer(out) + model_out = self.last_layer(self._output) + return model_out, [] + + @override(ModelV2) + def value_function(self): + assert self._output is not None, "must call forward first!" + return torch.reshape(self.vf(self._output), [-1]) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/simple_rpg_model.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/simple_rpg_model.py new file mode 100644 index 0000000000000000000000000000000000000000..b37d915df8a18101435ea9acd59b14f71e39b74d --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/models/simple_rpg_model.py @@ -0,0 +1,65 @@ +# @OldAPIStack +from ray.rllib.models.tf.tf_modelv2 import TFModelV2 +from ray.rllib.models.tf.fcnet import FullyConnectedNetwork as TFFCNet +from ray.rllib.models.torch.torch_modelv2 import TorchModelV2 +from ray.rllib.models.torch.fcnet import FullyConnectedNetwork as TorchFCNet +from ray.rllib.utils.framework import try_import_tf, try_import_torch + +tf1, tf, tfv = try_import_tf() +torch, nn = try_import_torch() + + +class CustomTorchRPGModel(TorchModelV2, nn.Module): + """Example of interpreting repeated observations.""" + + def __init__(self, obs_space, action_space, num_outputs, model_config, name): + super().__init__(obs_space, action_space, num_outputs, model_config, name) + nn.Module.__init__(self) + self.model = TorchFCNet( + obs_space, action_space, num_outputs, model_config, name + ) + + def forward(self, input_dict, 
state, seq_lens): + # The unpacked input tensors, where M=MAX_PLAYERS, N=MAX_ITEMS: + # { + # 'items', , + # 'location', , + # 'status', , + # } + print("The unpacked input tensors:", input_dict["obs"]) + print() + print("Unbatched repeat dim", input_dict["obs"].unbatch_repeat_dim()) + print() + print("Fully unbatched", input_dict["obs"].unbatch_all()) + print() + return self.model.forward(input_dict, state, seq_lens) + + def value_function(self): + return self.model.value_function() + + +class CustomTFRPGModel(TFModelV2): + """Example of interpreting repeated observations.""" + + def __init__(self, obs_space, action_space, num_outputs, model_config, name): + super().__init__(obs_space, action_space, num_outputs, model_config, name) + self.model = TFFCNet(obs_space, action_space, num_outputs, model_config, name) + + def forward(self, input_dict, state, seq_lens): + # The unpacked input tensors, where M=MAX_PLAYERS, N=MAX_ITEMS: + # { + # 'items', , + # 'location', , + # 'status', , + # } + print("The unpacked input tensors:", input_dict["obs"]) + print() + print("Unbatched repeat dim", input_dict["obs"].unbatch_repeat_dim()) + print() + if tf.executing_eagerly(): + print("Fully unbatched", input_dict["obs"].unbatch_all()) + print() + return self.model.forward(input_dict, state, seq_lens) + + def value_function(self): + return self.model.value_function() diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/parametric_actions_cartpole.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/parametric_actions_cartpole.py new file mode 100644 index 0000000000000000000000000000000000000000..e5df2f821c49aa41f4ddd9988c1acbaaf7a63819 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/parametric_actions_cartpole.py @@ -0,0 +1,121 @@ +# @OldAPIStack +"""Example of handling variable length or parametric action spaces. 
+ +This toy example demonstrates the action-embedding based approach for handling large +discrete action spaces (potentially infinite in size), similar to this example: + + https://neuro.cs.ut.ee/the-use-of-embeddings-in-openai-five/ + +This example works with RLlib's policy gradient style algorithms +(e.g., PG, PPO, IMPALA, A2C) and DQN. + +Note that since the model outputs now include "-inf" tf.float32.min +values, not all algorithm options are supported. For example, +algorithms might crash if they don't properly ignore the -inf action scores. +Working configurations are given below. +""" + +import argparse +import os + +import ray +from ray import air, tune +from ray.air.constants import TRAINING_ITERATION +from ray.rllib.examples.envs.classes.parametric_actions_cartpole import ( + ParametricActionsCartPole, +) +from ray.rllib.examples._old_api_stack.models.parametric_actions_model import ( + ParametricActionsModel, + TorchParametricActionsModel, +) +from ray.rllib.models import ModelCatalog +from ray.rllib.utils.metrics import ( + ENV_RUNNER_RESULTS, + EPISODE_RETURN_MEAN, + NUM_ENV_STEPS_SAMPLED_LIFETIME, +) +from ray.rllib.utils.test_utils import check_learning_achieved +from ray.tune.registry import register_env + +parser = argparse.ArgumentParser() +parser.add_argument( + "--run", type=str, default="PPO", help="The RLlib-registered algorithm to use." +) +parser.add_argument( + "--framework", + choices=["tf", "tf2", "torch"], + default="torch", + help="The DL framework specifier.", +) +parser.add_argument( + "--as-test", + action="store_true", + help="Whether this script should be run as a test: --stop-reward must " + "be achieved within --stop-timesteps AND --stop-iters.", +) +parser.add_argument( + "--stop-iters", type=int, default=200, help="Number of iterations to train." +) +parser.add_argument( + "--stop-timesteps", type=int, default=100000, help="Number of timesteps to train." 
+) +parser.add_argument( + "--stop-reward", type=float, default=150.0, help="Reward at which we stop training." +) + +if __name__ == "__main__": + args = parser.parse_args() + ray.init() + + register_env("pa_cartpole", lambda _: ParametricActionsCartPole(10)) + ModelCatalog.register_custom_model( + "pa_model", + TorchParametricActionsModel + if args.framework == "torch" + else ParametricActionsModel, + ) + + if args.run == "DQN": + cfg = { + # TODO(ekl) we need to set these to prevent the masked values + # from being further processed in DistributionalQModel, which + # would mess up the masking. It is possible to support these if we + # defined a custom DistributionalQModel that is aware of masking. + "hiddens": [], + "dueling": False, + "enable_rl_module_and_learner": False, + "enable_env_runner_and_connector_v2": False, + } + else: + cfg = {} + + config = dict( + { + "env": "pa_cartpole", + "model": { + "custom_model": "pa_model", + }, + # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. 
+ "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")), + "num_env_runners": 0, + "framework": args.framework, + }, + **cfg, + ) + + stop = { + TRAINING_ITERATION: args.stop_iters, + f"{NUM_ENV_STEPS_SAMPLED_LIFETIME}": args.stop_timesteps, + f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": args.stop_reward, + } + + results = tune.Tuner( + args.run, + run_config=air.RunConfig(stop=stop, verbose=1), + param_space=config, + ).fit() + + if args.as_test: + check_learning_achieved(results, args.stop_reward) + + ray.shutdown() diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/parametric_actions_cartpole_embeddings_learnt_by_model.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/parametric_actions_cartpole_embeddings_learnt_by_model.py new file mode 100644 index 0000000000000000000000000000000000000000..476e8b81eece613b6e1797b5623581cd3b2343ec --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/_old_api_stack/parametric_actions_cartpole_embeddings_learnt_by_model.py @@ -0,0 +1,107 @@ +# @OldAPIStack +"""Example of handling variable length or parametric action spaces. + +This is a toy example of the action-embedding based approach for handling large +discrete action spaces (potentially infinite in size), similar to this: + + https://neuro.cs.ut.ee/the-use-of-embeddings-in-openai-five/ + +This currently works with RLlib's policy gradient style algorithms +(e.g., PG, PPO, IMPALA, A2C) and also DQN. + +Note that since the model outputs now include "-inf" tf.float32.min +values, not all algorithm options are supported at the moment. For example, +algorithms might crash if they don't properly ignore the -inf action scores. +Working configurations are given below. 
+""" + +import argparse +import os + +import ray +from ray import air, tune +from ray.air.constants import TRAINING_ITERATION +from ray.rllib.examples.envs.classes.parametric_actions_cartpole import ( + ParametricActionsCartPoleNoEmbeddings, +) +from ray.rllib.examples._old_api_stack.models.parametric_actions_model import ( + ParametricActionsModelThatLearnsEmbeddings, +) +from ray.rllib.models import ModelCatalog +from ray.rllib.utils.metrics import ( + ENV_RUNNER_RESULTS, + EPISODE_RETURN_MEAN, + NUM_ENV_STEPS_SAMPLED_LIFETIME, +) +from ray.rllib.utils.test_utils import check_learning_achieved +from ray.tune.registry import register_env + +parser = argparse.ArgumentParser() +parser.add_argument("--run", type=str, default="PPO") +parser.add_argument( + "--framework", + choices=["tf", "tf2"], + default="tf", + help="The DL framework specifier (Torch not supported " + "due to the lack of a model).", +) +parser.add_argument("--as-test", action="store_true") +parser.add_argument("--stop-iters", type=int, default=200) +parser.add_argument("--stop-reward", type=float, default=150.0) +parser.add_argument("--stop-timesteps", type=int, default=100000) + +if __name__ == "__main__": + args = parser.parse_args() + ray.init() + + register_env("pa_cartpole", lambda _: ParametricActionsCartPoleNoEmbeddings(10)) + + ModelCatalog.register_custom_model( + "pa_model", ParametricActionsModelThatLearnsEmbeddings + ) + + if args.run == "DQN": + cfg = { + # TODO(ekl) we need to set these to prevent the masked values + # from being further processed in DistributionalQModel, which + # would mess up the masking. It is possible to support these if we + # defined a custom DistributionalQModel that is aware of masking. 
+ "hiddens": [], + "dueling": False, + "enable_rl_module_and_learner": False, + "enable_env_runner_and_connector_v2": False, + } + else: + cfg = {} + + config = dict( + { + "env": "pa_cartpole", + "model": { + "custom_model": "pa_model", + }, + # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. + "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")), + "num_env_runners": 0, + "framework": args.framework, + "action_mask_key": "valid_avail_actions_mask", + }, + **cfg, + ) + + stop = { + TRAINING_ITERATION: args.stop_iters, + NUM_ENV_STEPS_SAMPLED_LIFETIME: args.stop_timesteps, + f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": args.stop_reward, + } + + results = tune.Tuner( + args.run, + run_config=air.RunConfig(stop=stop, verbose=2), + param_space=config, + ).fit() + + if args.as_test: + check_learning_achieved(results, args.stop_reward) + + ray.shutdown() diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..291a03371a1328cada93cbdb75710bba822b596a Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/__pycache__/cartpole_dqn_export.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/__pycache__/cartpole_dqn_export.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a89b1c9d39abccea52b84ba9c1a3d211fa9c92da Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/__pycache__/cartpole_dqn_export.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/__pycache__/change_config_during_training.cpython-311.pyc 
b/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/__pycache__/change_config_during_training.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cd94db1e38065b507a63d5b4c2e5d28eebd14d76 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/__pycache__/change_config_during_training.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/__pycache__/checkpoint_by_custom_criteria.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/__pycache__/checkpoint_by_custom_criteria.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f3d4f2d5a14f5766a8e9805ddb7837a8ad1d9982 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/__pycache__/checkpoint_by_custom_criteria.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/__pycache__/restore_1_of_n_agents_from_checkpoint.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/__pycache__/restore_1_of_n_agents_from_checkpoint.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9aa6b7406eaffea6a07f0698f6d1a06092e19089 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/checkpoints/__pycache__/restore_1_of_n_agents_from_checkpoint.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__pycache__/__init__.cpython-311.pyc new file 
mode 100644 index 0000000000000000000000000000000000000000..090906559f1e8e5591e324a88a3ff922a42960fd Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__pycache__/custom_heuristic_policy.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__pycache__/custom_heuristic_policy.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0f07faee83f8e441480abe0d3116e21a61ee837d Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__pycache__/custom_heuristic_policy.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__pycache__/different_spaces_for_agents.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__pycache__/different_spaces_for_agents.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f82828ad7cb44d3e5eb9997fbb5ffda8083b7ab1 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__pycache__/different_spaces_for_agents.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__pycache__/multi_agent_cartpole.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__pycache__/multi_agent_cartpole.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a32ca68beceb55fa4b15159412cafa250a763917 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__pycache__/multi_agent_cartpole.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__pycache__/multi_agent_pendulum.cpython-311.pyc 
b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__pycache__/multi_agent_pendulum.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..129c658eb740059968d59aa1781894f2dc552eda Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__pycache__/multi_agent_pendulum.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__pycache__/pettingzoo_independent_learning.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__pycache__/pettingzoo_independent_learning.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6ef87cc56c3140521f1e278dd42d9e8dddfba175 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__pycache__/pettingzoo_independent_learning.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__pycache__/pettingzoo_parameter_sharing.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__pycache__/pettingzoo_parameter_sharing.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1bdfd9087353ce980e0cbbbd5956285df1642601 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__pycache__/pettingzoo_parameter_sharing.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__pycache__/pettingzoo_shared_value_function.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__pycache__/pettingzoo_shared_value_function.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e1a8b98be8f176c26c0451ff69eabd928442d62c Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__pycache__/pettingzoo_shared_value_function.cpython-311.pyc differ diff --git 
a/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__pycache__/rock_paper_scissors_heuristic_vs_learned.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__pycache__/rock_paper_scissors_heuristic_vs_learned.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d70f60301c2fdef9f26acbe4ddf36627bda15768 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__pycache__/rock_paper_scissors_heuristic_vs_learned.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__pycache__/rock_paper_scissors_learned_vs_learned.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__pycache__/rock_paper_scissors_learned_vs_learned.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c583a58d543c15c6c5c29a7ccd3121378299a039 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__pycache__/rock_paper_scissors_learned_vs_learned.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__pycache__/self_play_league_based_with_open_spiel.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__pycache__/self_play_league_based_with_open_spiel.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b404fc48db1893d388051b41b31c538706b00eb8 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__pycache__/self_play_league_based_with_open_spiel.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__pycache__/self_play_with_open_spiel.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__pycache__/self_play_with_open_spiel.cpython-311.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..89f532e25de900110c41f16b1ff76923c7ac9cae Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__pycache__/self_play_with_open_spiel.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__pycache__/two_step_game_with_grouped_agents.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__pycache__/two_step_game_with_grouped_agents.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..15e6ccee70d35a11ac78f532b04e9c1fb6988588 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/__pycache__/two_step_game_with_grouped_agents.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/custom_heuristic_policy.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/custom_heuristic_policy.py new file mode 100644 index 0000000000000000000000000000000000000000..3f5d4c6a067d241c47d79c3b6c4114f142445cfb --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/custom_heuristic_policy.py @@ -0,0 +1,101 @@ +"""Example of running a custom heuristic (hand-coded) policy alongside trainable ones. + +This example has two RLModules (as action computing policies): + (1) one trained by a PPOLearner + (2) one hand-coded policy that acts at random in the env (doesn't learn). + +The environment is MultiAgentCartPole, in which there are n agents both policies + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack --num-agents=2` + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. 
+ +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- +In the console output, you can see the PPO policy ("learnable_policy") does much +better than "random": + ++-------------------+------------+----------+------+----------------+ +| Trial name | status | loc | iter | total time (s) | +| | | | | | +|-------------------+------------+----------+------+----------------+ +| PPO_multi_agen... | TERMINATED | 127. ... | 20 | 58.646 | ++-------------------+------------+----------+------+----------------+ + ++--------+-------------------+-----------------+--------------------+ +| ts | combined reward | reward random | reward | +| | | | learnable_policy | ++--------+-------------------+-----------------+--------------------| +| 80000 | 481.26 | 78.41 | 464.41 | ++--------+-------------------+-----------------+--------------------+ +""" + +from ray.rllib.algorithms.ppo import PPOConfig +from ray.rllib.core.rl_module.rl_module import RLModuleSpec +from ray.rllib.core.rl_module.multi_rl_module import MultiRLModuleSpec +from ray.rllib.examples.envs.classes.multi_agent import MultiAgentCartPole +from ray.rllib.examples.rl_modules.classes.random_rlm import RandomRLModule +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.tune.registry import register_env + + +parser = add_rllib_example_script_args( + default_iters=40, default_reward=500.0, default_timesteps=200000 +) +parser.set_defaults( + enable_new_api_stack=True, + num_agents=2, +) + + +if __name__ == "__main__": + args = parser.parse_args() + + assert args.num_agents == 2, "Must set --num-agents=2 when running this script!" + + # Simple environment with n independent cartpole entities. 
+ register_env( + "multi_agent_cartpole", + lambda _: MultiAgentCartPole({"num_agents": args.num_agents}), + ) + + base_config = ( + PPOConfig() + .environment("multi_agent_cartpole") + .multi_agent( + policies={"learnable_policy", "random"}, + # Map to either random behavior or PPO learning behavior based on + # the agent's ID. + policy_mapping_fn=lambda agent_id, *args, **kwargs: [ + "learnable_policy", + "random", + ][agent_id % 2], + # We need to specify this here, b/c the `forward_train` method of + # `RandomRLModule` (ModuleID="random") throws a not-implemented error. + policies_to_train=["learnable_policy"], + ) + .rl_module( + rl_module_spec=MultiRLModuleSpec( + rl_module_specs={ + "learnable_policy": RLModuleSpec(), + "random": RLModuleSpec(module_class=RandomRLModule), + } + ), + ) + ) + + run_rllib_example_script_experiment(base_config, args) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/different_spaces_for_agents.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/different_spaces_for_agents.py new file mode 100644 index 0000000000000000000000000000000000000000..7331a3e3aadc39b53fe045aac4fc1352e1b5f7b5 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/different_spaces_for_agents.py @@ -0,0 +1,112 @@ +""" +Example showing how to create a multi-agent env, in which the different agents +have different observation and action spaces. + +These spaces do NOT necessarily have to be specified manually by the user. Instead, +RLlib tries to automatically infer them from the env provided spaces dicts +(agentID -> obs/act space) and the policy mapping fn (mapping agent IDs to policy IDs). 
+ +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack --num-agents=2` + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` +""" + +import gymnasium as gym + +from ray.rllib.env.multi_agent_env import MultiAgentEnv +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.tune.registry import get_trainable_cls + + +class BasicMultiAgentMultiSpaces(MultiAgentEnv): + """A simple multi-agent example environment where agents have different spaces. + + agent0: obs=Box(10,), act=Discrete(2) + agent1: obs=Box(20,), act=Discrete(3) + + The logic of the env doesn't really matter for this example. The point of this env + is to show how to use multi-agent envs, in which the different agents utilize + different obs- and action spaces. + """ + + def __init__(self, config=None): + self.agents = ["agent0", "agent1"] + + self.terminateds = set() + self.truncateds = set() + + # Provide full (preferred format) observation- and action-spaces as Dicts + # mapping agent IDs to the individual agents' spaces. 
+ self.observation_spaces = { + "agent0": gym.spaces.Box(low=-1.0, high=1.0, shape=(10,)), + "agent1": gym.spaces.Box(low=-1.0, high=1.0, shape=(20,)), + } + self.action_spaces = { + "agent0": gym.spaces.Discrete(2), + "agent1": gym.spaces.Discrete(3), + } + + super().__init__() + + def reset(self, *, seed=None, options=None): + self.terminateds = set() + self.truncateds = set() + return {i: self.get_observation_space(i).sample() for i in self.agents}, {} + + def step(self, action_dict): + obs, rew, terminated, truncated, info = {}, {}, {}, {}, {} + for i, action in action_dict.items(): + obs[i] = self.get_observation_space(i).sample() + rew[i] = 0.0 + terminated[i] = False + truncated[i] = False + info[i] = {} + terminated["__all__"] = len(self.terminateds) == len(self.agents) + truncated["__all__"] = len(self.truncateds) == len(self.agents) + return obs, rew, terminated, truncated, info + + +parser = add_rllib_example_script_args( + default_iters=10, default_reward=80.0, default_timesteps=10000 +) + + +if __name__ == "__main__": + args = parser.parse_args() + + assert ( + args.enable_new_api_stack + ), "Must set --enable-new-api-stack when running this script!" + + base_config = ( + get_trainable_cls(args.algo) + .get_default_config() + .environment(env=BasicMultiAgentMultiSpaces) + .training(train_batch_size=1024) + .multi_agent( + # Use a simple set of policy IDs. Spaces for the individual policies + # are inferred automatically using reverse lookup via the + # `policy_mapping_fn` and the env provided spaces for the different + # agents. Alternatively, you could use: + # policies: {main0: PolicySpec(...), main1: PolicySpec} + policies={"main0", "main1"}, + # Simple mapping fn, mapping agent0 to main0 and agent1 to main1. + policy_mapping_fn=(lambda aid, episode, **kw: f"main{aid[-1]}"), + # Only train main0. 
+ policies_to_train=["main0"], + ) + ) + + run_rllib_example_script_experiment(base_config, args) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/multi_agent_cartpole.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/multi_agent_cartpole.py new file mode 100644 index 0000000000000000000000000000000000000000..4bdf019f10b17cc26bd31bd5c516d4de0fe443f5 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/multi_agent_cartpole.py @@ -0,0 +1,67 @@ +"""Simple example of setting up an agent-to-module mapping function. + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack --num-agents=2` + +Control the number of agents and policies (RLModules) via --num-agents and +--num-policies. + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` +""" + +from ray.rllib.examples.envs.classes.multi_agent import MultiAgentCartPole +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.tune.registry import get_trainable_cls, register_env + +parser = add_rllib_example_script_args( + default_iters=200, + default_timesteps=100000, + default_reward=600.0, +) +# TODO (sven): This arg is currently ignored (hard-set to 2). +parser.add_argument("--num-policies", type=int, default=2) + + +if __name__ == "__main__": + args = parser.parse_args() + + # Register our environment with tune. 
+ if args.num_agents > 0: + register_env( + "env", + lambda _: MultiAgentCartPole(config={"num_agents": args.num_agents}), + ) + + base_config = ( + get_trainable_cls(args.algo) + .get_default_config() + .environment("env" if args.num_agents > 0 else "CartPole-v1") + .env_runners( + # TODO (sven): MAEnvRunner does not support vectorized envs yet + # due to gym's env checkers and non-compatability with RLlib's + # MultiAgentEnv API. + num_envs_per_env_runner=1 + if args.num_agents > 0 + else 20, + ) + ) + + # Add a simple multi-agent setup. + if args.num_agents > 0: + base_config.multi_agent( + policies={f"p{i}" for i in range(args.num_agents)}, + policy_mapping_fn=lambda aid, *a, **kw: f"p{aid}", + ) + + run_rllib_example_script_experiment(base_config, args) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/multi_agent_pendulum.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/multi_agent_pendulum.py new file mode 100644 index 0000000000000000000000000000000000000000..985e55aada326bc68b1fa74f9595411611b8c12e --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/multi_agent_pendulum.py @@ -0,0 +1,73 @@ +"""Simple example of setting up an agent-to-module mapping function. + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack --num-agents=2` + +Control the number of agents and policies (RLModules) via --num-agents and +--num-policies. + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. 
+ +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` +""" + +from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig +from ray.rllib.examples.envs.classes.multi_agent import MultiAgentPendulum +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.tune.registry import get_trainable_cls, register_env + +parser = add_rllib_example_script_args( + default_iters=200, + default_timesteps=100000, + default_reward=-400.0, +) +# TODO (sven): This arg is currently ignored (hard-set to 2). +parser.add_argument("--num-policies", type=int, default=2) + + +if __name__ == "__main__": + args = parser.parse_args() + + # Register our environment with tune. + if args.num_agents > 0: + register_env( + "env", + lambda _: MultiAgentPendulum(config={"num_agents": args.num_agents}), + ) + + base_config = ( + get_trainable_cls(args.algo) + .get_default_config() + .environment("env" if args.num_agents > 0 else "Pendulum-v1") + .training( + train_batch_size_per_learner=512, + minibatch_size=64, + lambda_=0.1, + gamma=0.95, + lr=0.0003, + model={"fcnet_activation": "relu"}, + vf_clip_param=10.0, + ) + .rl_module( + model_config=DefaultModelConfig(fcnet_activation="relu"), + ) + ) + + # Add a simple multi-agent setup. 
+ if args.num_agents > 0: + base_config.multi_agent( + policies={f"p{i}" for i in range(args.num_agents)}, + policy_mapping_fn=lambda aid, *a, **kw: f"p{aid}", + ) + + # Augment + run_rllib_example_script_experiment(base_config, args) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/pettingzoo_independent_learning.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/pettingzoo_independent_learning.py new file mode 100644 index 0000000000000000000000000000000000000000..7b25115cb7a41aad15ef46f85380b6dcc83299a2 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/pettingzoo_independent_learning.py @@ -0,0 +1,108 @@ +"""Runs the PettingZoo Waterworld env in RLlib using independent multi-agent learning. + +See: https://pettingzoo.farama.org/environments/sisl/waterworld/ +for more details on the environment. + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack --num-agents=2` + +Control the number of agents and policies (RLModules) via --num-agents and +--num-policies. + +This works with hundreds of agents and policies, but note that initializing +many policies might take some time. + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- +The above options can reach a combined reward of 0.0 or more after about 500k env +timesteps. Keep in mind, though, that due to the separate value functions (and +learned policies in general), one agent's gain (in per-agent reward) might cause the +other agent's reward to decrease at the same time. 
However, over time, both agents +should simply improve. + ++---------------------+------------+-----------------+--------+------------------+ +| Trial name | status | loc | iter | total time (s) | +|---------------------+------------+-----------------+--------+------------------+ +| PPO_env_a82fc_00000 | TERMINATED | 127.0.0.1:28346 | 124 | 363.599 | ++---------------------+------------+-----------------+--------+------------------+ + ++--------+-------------------+--------------------+--------------------+ +| ts | combined reward | reward pursuer_1 | reward pursuer_0 | ++--------+-------------------+--------------------+--------------------| +| 496000 | 2.24542 | -34.6869 | 36.9324 | ++--------+-------------------+--------------------+--------------------+ + +Note that the two agents (`pursuer_0` and `pursuer_1`) are optimized on the exact same +objective and thus differences in the rewards can be attributed to weight initialization +(and sampling randomness) only. +""" + +from pettingzoo.sisl import waterworld_v4 + +from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig +from ray.rllib.core.rl_module.multi_rl_module import MultiRLModuleSpec +from ray.rllib.core.rl_module.rl_module import RLModuleSpec +from ray.rllib.env.wrappers.pettingzoo_env import PettingZooEnv +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.tune.registry import get_trainable_cls, register_env + + +parser = add_rllib_example_script_args( + default_iters=200, + default_timesteps=1000000, + default_reward=0.0, +) + + +if __name__ == "__main__": + args = parser.parse_args() + + assert args.num_agents > 0, "Must set --num-agents > 0 when running this script!" + assert ( + args.enable_new_api_stack + ), "Must set --enable-new-api-stack when running this script!" + + # Here, we use the "Agent Environment Cycle" (AEC) PettingZoo environment type. 
+ # For a "Parallel" environment example, see the rock paper scissors examples + # in this same repository folder. + register_env("env", lambda _: PettingZooEnv(waterworld_v4.env())) + + # Policies are called just like the agents (exact 1:1 mapping). + policies = {f"pursuer_{i}" for i in range(args.num_agents)} + + base_config = ( + get_trainable_cls(args.algo) + .get_default_config() + .environment("env") + .multi_agent( + policies=policies, + # Exact 1:1 mapping from AgentID to ModuleID. + policy_mapping_fn=(lambda aid, *args, **kwargs: aid), + ) + .training( + vf_loss_coeff=0.005, + ) + .rl_module( + rl_module_spec=MultiRLModuleSpec( + rl_module_specs={p: RLModuleSpec() for p in policies}, + ), + model_config=DefaultModelConfig(vf_share_layers=True), + ) + ) + + run_rllib_example_script_experiment(base_config, args) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/pettingzoo_parameter_sharing.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/pettingzoo_parameter_sharing.py new file mode 100644 index 0000000000000000000000000000000000000000..d6eb4bda732e7ba9f41ba3cc39aae7e91acf7c37 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/pettingzoo_parameter_sharing.py @@ -0,0 +1,105 @@ +"""Runs the PettingZoo Waterworld multi-agent env in RLlib using single policy learning. + +Other than the `pettingzoo_independent_learning.py` example (in this same folder), +this example simply trains a single policy (shared by all agents). + +See: https://pettingzoo.farama.org/environments/sisl/waterworld/ +for more details on the environment. + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack --num-agents=2` + +Control the number of agents and policies (RLModules) via --num-agents and +--num-policies. + +This works with hundreds of agents and policies, but note that initializing +many policies might take some time. 
+ +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- +The above options can reach a combined reward of roughly ~0.0 after about 500k-1M env +timesteps. Keep in mind, though, that in this setup, the agents do not have the +opportunity to benefit from or even out other agents' mistakes (and behavior in general) +as everyone is using the same policy. Hence, this example learns a more generic policy, +which might be less specialized to certain "niche exploitation opportunities" inside +the env: + ++---------------------+----------+-----------------+--------+-----------------+ +| Trial name | status | loc | iter | total time (s) | +|---------------------+----------+-----------------+--------+-----------------+ +| PPO_env_91f49_00000 | RUNNING | 127.0.0.1:63676 | 200 | 605.176 | ++---------------------+----------+-----------------+--------+-----------------+ + ++--------+-------------------+-------------+ +| ts | combined reward | reward p0 | ++--------+-------------------+-------------| +| 800000 | 0.323752 | 0.161876 | ++--------+-------------------+-------------+ +""" +from pettingzoo.sisl import waterworld_v4 + +from ray.rllib.core.rl_module.multi_rl_module import MultiRLModuleSpec +from ray.rllib.core.rl_module.rl_module import RLModuleSpec +from ray.rllib.env.wrappers.pettingzoo_env import PettingZooEnv +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.tune.registry import get_trainable_cls, register_env + + +parser = add_rllib_example_script_args( + default_iters=200, + 
default_timesteps=1000000, + default_reward=0.0, +) + + +if __name__ == "__main__": + args = parser.parse_args() + + assert args.num_agents > 0, "Must set --num-agents > 0 when running this script!" + assert ( + args.enable_new_api_stack + ), "Must set --enable-new-api-stack when running this script!" + + # Here, we use the "Agent Environment Cycle" (AEC) PettingZoo environment type. + # For a "Parallel" environment example, see the rock paper scissors examples + # in this same repository folder. + register_env("env", lambda _: PettingZooEnv(waterworld_v4.env())) + + base_config = ( + get_trainable_cls(args.algo) + .get_default_config() + .environment("env") + .multi_agent( + policies={"p0"}, + # All agents map to the exact same policy. + policy_mapping_fn=(lambda aid, *args, **kwargs: "p0"), + ) + .training( + model={ + "vf_share_layers": True, + }, + vf_loss_coeff=0.005, + ) + .rl_module( + rl_module_spec=MultiRLModuleSpec( + rl_module_specs={"p0": RLModuleSpec()}, + ), + ) + ) + + run_rllib_example_script_experiment(base_config, args) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/pettingzoo_shared_value_function.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/pettingzoo_shared_value_function.py new file mode 100644 index 0000000000000000000000000000000000000000..e2c8bb9a4ffb933f99f32c65eaa167e0d75e196d --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/pettingzoo_shared_value_function.py @@ -0,0 +1,7 @@ +msg = """ +This script is NOT yet ready, but will be available soon at this location. It will +feature a MultiRLModule with one shared value function and n policy heads for +cooperative multi-agent learning. 
+""" + +raise NotImplementedError(msg) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/rock_paper_scissors_heuristic_vs_learned.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/rock_paper_scissors_heuristic_vs_learned.py new file mode 100644 index 0000000000000000000000000000000000000000..6f474e8e3c69ad699b9e67890cecddb96446c068 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/rock_paper_scissors_heuristic_vs_learned.py @@ -0,0 +1,154 @@ +"""A simple multi-agent env with two agents play rock paper scissors. + +This demonstrates running the following policies in competition: + Agent 1: heuristic policy of repeating the same move + OR: heuristic policy of beating the last opponent move + Agent 2: Simple, feedforward PPO policy + OR: PPO Policy with an LSTM network + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack --num-agents=2 [--use-lstm]?` + +Without `--use-lstm`, Agent 2 should quickly reach a reward of ~7.0, always +beating the `always_same` policy, but only 50% of the time beating the `beat_last` +policy. + +With `--use-lstm`, Agent 2 should eventually(!) reach a reward of >9.0 (always +beating both the `always_same` policy and the `beat_last` policy). + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. 
+ +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` +""" + +import random + +import gymnasium as gym +from pettingzoo.classic import rps_v2 + +from ray.air.constants import TRAINING_ITERATION +from ray.rllib.connectors.env_to_module import FlattenObservations +from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig +from ray.rllib.core.rl_module.multi_rl_module import MultiRLModuleSpec +from ray.rllib.core.rl_module.rl_module import RLModuleSpec +from ray.rllib.env.wrappers.pettingzoo_env import ParallelPettingZooEnv +from ray.rllib.utils.metrics import ( + ENV_RUNNER_RESULTS, + NUM_ENV_STEPS_SAMPLED_LIFETIME, +) +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.rllib.examples.rl_modules.classes import ( + AlwaysSameHeuristicRLM, + BeatLastHeuristicRLM, +) +from ray.tune.registry import get_trainable_cls, register_env + + +parser = add_rllib_example_script_args( + default_iters=50, + default_timesteps=200000, + default_reward=6.0, +) +parser.set_defaults( + enable_new_api_stack=True, + num_agents=2, +) +parser.add_argument( + "--use-lstm", + action="store_true", + help="Whether to use an LSTM wrapped module instead of a simple MLP one. With LSTM " + "the reward diff can reach 7.0, without only 5.0.", +) + + +register_env( + "pettingzoo_rps", + lambda _: ParallelPettingZooEnv(rps_v2.parallel_env()), +) + + +if __name__ == "__main__": + args = parser.parse_args() + + assert args.num_agents == 2, "Must set --num-agents=2 when running this script!" + + base_config = ( + get_trainable_cls(args.algo) + .get_default_config() + .environment("pettingzoo_rps") + .env_runners( + env_to_module_connector=lambda env: ( + # `agent_ids=...`: Only flatten obs for the learning RLModule. 
+ FlattenObservations(multi_agent=True, agent_ids={"player_0"}), + ), + ) + .multi_agent( + policies={"always_same", "beat_last", "learned"}, + # Let learning Policy always play against either heuristic one: + # `always_same` or `beat_last`. + policy_mapping_fn=lambda aid, episode: ( + "learned" + if aid == "player_0" + else random.choice(["always_same", "beat_last"]) + ), + # Must define this as both heuristic RLMs will throw an error, if their + # `forward_train` is called. + policies_to_train=["learned"], + ) + .training( + vf_loss_coeff=0.005, + ) + .rl_module( + rl_module_spec=MultiRLModuleSpec( + rl_module_specs={ + "always_same": RLModuleSpec( + module_class=AlwaysSameHeuristicRLM, + observation_space=gym.spaces.Discrete(4), + action_space=gym.spaces.Discrete(3), + ), + "beat_last": RLModuleSpec( + module_class=BeatLastHeuristicRLM, + observation_space=gym.spaces.Discrete(4), + action_space=gym.spaces.Discrete(3), + ), + "learned": RLModuleSpec( + model_config=DefaultModelConfig( + use_lstm=args.use_lstm, + # Use a simpler FCNet when we also have an LSTM. + fcnet_hiddens=[32] if args.use_lstm else [256, 256], + lstm_cell_size=256, + max_seq_len=15, + vf_share_layers=True, + ), + ), + } + ), + ) + ) + + # Make `args.stop_reward` "point" to the reward of the learned policy. 
+ stop = { + TRAINING_ITERATION: args.stop_iters, + f"{ENV_RUNNER_RESULTS}/module_episode_returns_mean/learned": args.stop_reward, + NUM_ENV_STEPS_SAMPLED_LIFETIME: args.stop_timesteps, + } + + run_rllib_example_script_experiment( + base_config, + args, + stop=stop, + success_metric={ + f"{ENV_RUNNER_RESULTS}/module_episode_returns_mean/learned": ( + args.stop_reward + ), + }, + ) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/rock_paper_scissors_learned_vs_learned.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/rock_paper_scissors_learned_vs_learned.py new file mode 100644 index 0000000000000000000000000000000000000000..adf88dba985b4862767467764ff8e7f012fafb37 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/rock_paper_scissors_learned_vs_learned.py @@ -0,0 +1,91 @@ +"""A simple multi-agent env with two agents play rock paper scissors. + +This demonstrates running two learning policies in competition, both using the same +RLlib algorithm (PPO by default). + +The combined reward as well as individual rewards should roughly remain at 0.0 as no +policy should - in the long run - be able to learn a better strategy than chosing +actions at random. However, it could be possible that - for some time - one or the other +policy can exploit a "stochastic weakness" of the opponent policy. For example a policy +`A` learns that its opponent `B` has learnt to choose "paper" more often, which in +return makes `A` choose "scissors" more often as a countermeasure. 
+""" + +import re + +from pettingzoo.classic import rps_v2 + +from ray.rllib.connectors.env_to_module import FlattenObservations +from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig +from ray.rllib.core.rl_module.multi_rl_module import MultiRLModuleSpec +from ray.rllib.core.rl_module.rl_module import RLModuleSpec +from ray.rllib.env.wrappers.pettingzoo_env import ParallelPettingZooEnv +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.tune.registry import get_trainable_cls, register_env + + +parser = add_rllib_example_script_args( + default_iters=50, + default_timesteps=200000, + default_reward=6.0, +) +parser.set_defaults( + enable_new_api_stack=True, + num_agents=2, +) +parser.add_argument( + "--use-lstm", + action="store_true", + help="Whether to use an LSTM wrapped module instead of a simple MLP one. With LSTM " + "the reward diff can reach 7.0, without only 5.0.", +) + + +register_env( + "pettingzoo_rps", + lambda _: ParallelPettingZooEnv(rps_v2.parallel_env()), +) + + +if __name__ == "__main__": + args = parser.parse_args() + + assert args.num_agents == 2, "Must set --num-agents=2 when running this script!" + + base_config = ( + get_trainable_cls(args.algo) + .get_default_config() + .environment("pettingzoo_rps") + .env_runners( + env_to_module_connector=lambda env: FlattenObservations(multi_agent=True), + ) + .multi_agent( + policies={"p0", "p1"}, + # `player_0` uses `p0`, `player_1` uses `p1`. + policy_mapping_fn=lambda aid, episode: re.sub("^player_", "p", aid), + ) + .training( + vf_loss_coeff=0.005, + ) + .rl_module( + model_config=DefaultModelConfig( + use_lstm=args.use_lstm, + # Use a simpler FCNet when we also have an LSTM. 
+ fcnet_hiddens=[32] if args.use_lstm else [256, 256], + lstm_cell_size=256, + max_seq_len=15, + vf_share_layers=True, + ), + rl_module_spec=MultiRLModuleSpec( + rl_module_specs={ + "p0": RLModuleSpec(), + "p1": RLModuleSpec(), + } + ), + ) + ) + + run_rllib_example_script_experiment(base_config, args) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/self_play_league_based_with_open_spiel.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/self_play_league_based_with_open_spiel.py new file mode 100644 index 0000000000000000000000000000000000000000..859d4d9c01ddacef54f0d78be555529714005dbb --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/self_play_league_based_with_open_spiel.py @@ -0,0 +1,285 @@ +"""Example showing how to implement a league-based training workflow. + +Uses the open spiel adapter of RLlib with the "markov_soccer" game and +a simplified multi-agent, league-based setup: +https://deepmind.com/blog/article/AlphaStar-Grandmaster-level-in- \ +StarCraft-II-using-multi-agent-reinforcement-learning + +Our league consists of three groups of policies: +- main policies: The current main policy plus prior versions of it. +- main exploiters: Trained by playing only against different "main policies". +- league exploiters: Trained by playing against any policy in the league. + +We start with 1 policy from each group, setting all 3 of these to an initial +PPO policy and allowing all 3 policies to be trained. +After each train update - via our custom callback - we decide for each +trainable policy, whether to make a copy and freeze it. Frozen policies +will not be altered anymore. However, they remain in the league for +future matches against trainable policies. +Matchmaking happens via a policy_mapping_fn, which needs to be altered +after every change (addition) to the league. 
The mapping function +randomly maps agents in a way, such that: +- Frozen main exploiters play against the one (currently trainable) main + policy. +- Trainable main exploiters play against any main policy (including already + frozen main policies). +- Frozen league exploiters play against any trainable policy in the league. +- Trainable league exploiters play against any policy in the league. + +After training for n iterations, a configurable number of episodes can +be played by the user against the "main" agent on the command line. +""" +import functools + +import numpy as np +import torch + +import ray +from ray.air.constants import TRAINING_ITERATION +from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig +from ray.rllib.core.rl_module.multi_rl_module import MultiRLModuleSpec +from ray.rllib.core.rl_module.rl_module import RLModuleSpec +from ray.rllib.env.utils import try_import_pyspiel, try_import_open_spiel +from ray.rllib.env.wrappers.open_spiel import OpenSpielEnv +from ray.rllib.examples.multi_agent.utils import ( + ask_user_for_action, + SelfPlayLeagueBasedCallback, + SelfPlayLeagueBasedCallbackOldAPIStack, +) +from ray.rllib.examples._old_api_stack.policy.random_policy import RandomPolicy +from ray.rllib.examples.rl_modules.classes.random_rlm import RandomRLModule +from ray.rllib.policy.policy import PolicySpec +from ray.rllib.utils.metrics import NUM_ENV_STEPS_SAMPLED_LIFETIME +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.tune.registry import get_trainable_cls, register_env + +open_spiel = try_import_open_spiel(error=True) +pyspiel = try_import_pyspiel(error=True) + +# Import after try_import_open_spiel, so we can error out with hints +from open_spiel.python.rl_environment import Environment # noqa: E402 + + +parser = add_rllib_example_script_args(default_timesteps=2000000) +parser.set_defaults( + env="markov_soccer", + num_env_runners=2, + 
checkpoint_freq=1, + checkpoint_at_end=True, +) +parser.add_argument( + "--win-rate-threshold", + type=float, + default=0.85, + help="Win-rate at which we setup another opponent by freezing the " + "current main policy and playing against a uniform distribution " + "of previously frozen 'main's from here on.", +) +parser.add_argument( + "--min-league-size", + type=float, + default=8, + help="Minimum number of policies/RLModules to consider the test passed. " + "The initial league size is 2: `main` and `random`. " + "`--min-league-size=3` thus means that one new policy/RLModule has been " + "added so far (b/c the `main` one has reached the `--win-rate-threshold " + "against the `random` Policy/RLModule).", +) +parser.add_argument( + "--num-episodes-human-play", + type=int, + default=0, + help="How many episodes to play against the user on the command " + "line after training has finished.", +) +parser.add_argument( + "--from-checkpoint", + type=str, + default=None, + help="Full path to a checkpoint file for restoring a previously saved " + "Algorithm state.", +) + + +if __name__ == "__main__": + args = parser.parse_args() + + register_env( + "open_spiel_env", + lambda _: OpenSpielEnv(pyspiel.load_game(args.env)), + ) + + def policy_mapping_fn(agent_id, episode, worker=None, **kwargs): + # At first, only have main play against the random main exploiter. + return "main" if episode.episode_id % 2 == agent_id else "main_exploiter_0" + + def agent_to_module_mapping_fn(agent_id, episode, **kwargs): + # At first, only have main play against the random main exploiter. + return "main" if hash(episode.id_) % 2 == agent_id else "main_exploiter_0" + + def _get_multi_agent(): + names = { + # Our main policy, we'd like to optimize. + "main", + # First frozen version of main (after we reach n% win-rate). + "main_0", + # Initial main exploiters (one random, one trainable). + "main_exploiter_0", + "main_exploiter_1", + # Initial league exploiters (one random, one trainable). 
+ "league_exploiter_0", + "league_exploiter_1", + } + if args.enable_new_api_stack: + policies = names + spec = { + mid: RLModuleSpec( + module_class=( + RandomRLModule + if mid in ["main_exploiter_0", "league_exploiter_0"] + else None + ), + model_config=DefaultModelConfig( + fcnet_hiddens=[1024, 1024], + # fcnet_activation="tanh", + ), + ) + for mid in names + } + else: + policies = { + mid: PolicySpec( + policy_class=( + RandomPolicy + if mid in ["main_exploiter_0", "league_exploiter_0"] + else None + ) + ) + for mid in names + } + spec = None + return {"policies": policies, "spec": spec} + + config = ( + get_trainable_cls(args.algo) + .get_default_config() + .environment("open_spiel_env") + # Set up the main piece in this experiment: The league-bases self-play + # callback, which controls adding new policies/Modules to the league and + # properly matching the different policies in the league with each other. + .callbacks( + functools.partial( + SelfPlayLeagueBasedCallback + if args.enable_new_api_stack + else SelfPlayLeagueBasedCallbackOldAPIStack, + win_rate_threshold=args.win_rate_threshold, + ) + ) + .env_runners( + num_envs_per_env_runner=1 if args.enable_new_api_stack else 5, + ) + .training( + num_epochs=20, + ) + .multi_agent( + # Initial policy map: All PPO. This will be expanded + # to more policy snapshots. This is done in the + # custom callback defined above (`LeagueBasedSelfPlayCallback`). + policies=_get_multi_agent()["policies"], + policy_mapping_fn=( + agent_to_module_mapping_fn + if args.enable_new_api_stack + else policy_mapping_fn + ), + # At first, only train main_0 (until good enough to win against + # random). + policies_to_train=["main"], + ) + .rl_module( + rl_module_spec=MultiRLModuleSpec( + rl_module_specs=_get_multi_agent()["spec"] + ), + ) + ) + + # Run everything as configured. + # Train the "main" policy to play really well using self-play. 
+ results = None + if not args.from_checkpoint: + stop = { + NUM_ENV_STEPS_SAMPLED_LIFETIME: args.stop_timesteps, + TRAINING_ITERATION: args.stop_iters, + "league_size": args.min_league_size, + } + results = run_rllib_example_script_experiment( + config, args, stop=stop, keep_ray_up=True + ) + + # Restore trained Algorithm (set to non-explore behavior) and play against + # human on command line. + if args.num_episodes_human_play > 0: + num_episodes = 0 + # Switch off exploration for better inference performance. + config.explore = False + algo = config.build() + if args.from_checkpoint: + algo.restore(args.from_checkpoint) + else: + checkpoint = results.get_best_result().checkpoint + if not checkpoint: + raise ValueError("No last checkpoint found in results!") + algo.restore(checkpoint) + + if args.enable_new_api_stack: + rl_module = algo.get_module("main") + + # Play from the command line against the trained agent + # in an actual (non-RLlib-wrapped) open-spiel env. + human_player = 1 + env = Environment(args.env) + + while num_episodes < args.num_episodes_human_play: + print("You play as {}".format("o" if human_player else "x")) + time_step = env.reset() + while not time_step.last(): + player_id = time_step.observations["current_player"] + if player_id == human_player: + action = ask_user_for_action(time_step) + else: + obs = np.array(time_step.observations["info_state"][player_id]) + if args.enable_new_api_stack: + action = np.argmax( + rl_module.forward_inference( + {"obs": torch.from_numpy(obs).unsqueeze(0).float()} + )["action_dist_inputs"][0].numpy() + ) + else: + action = algo.compute_single_action(obs, policy_id="main") + # In case computer chooses an invalid action, pick a + # random one. 
+ legal = time_step.observations["legal_actions"][player_id] + if action not in legal: + action = np.random.choice(legal) + time_step = env.step([action]) + print(f"\n{env.get_state}") + + print(f"\n{env.get_state}") + + print("End of game!") + if time_step.rewards[human_player] > 0: + print("You win") + elif time_step.rewards[human_player] < 0: + print("You lose") + else: + print("Draw") + # Switch order of players + human_player = 1 - human_player + + num_episodes += 1 + + algo.stop() + + ray.shutdown() diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/self_play_with_open_spiel.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/self_play_with_open_spiel.py new file mode 100644 index 0000000000000000000000000000000000000000..37be03d53622c7ffece749d5f5a11b17f3c19753 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/self_play_with_open_spiel.py @@ -0,0 +1,245 @@ +"""Example showing how one can implement a simple self-play training workflow. + +Uses the open spiel adapter of RLlib with the "connect_four" game and +a multi-agent setup with a "main" policy and n "main_v[x]" policies +(x=version number), which are all at-some-point-frozen copies of +"main". At the very beginning, "main" plays against RandomPolicy. + +Checks for the training progress after each training update via a custom +callback. We simply measure the win rate of "main" vs the opponent +("main_v[x]" or RandomPolicy at the beginning) by looking through the +achieved rewards in the episodes in the train batch. If this win rate +reaches some configurable threshold, we add a new policy to +the policy map (a frozen copy of the current "main" one) and change the +policy_mapping_fn to make new matches of "main" vs any of the previous +versions of "main" (including the just added one). 
+ +After training for n iterations, a configurable number of episodes can +be played by the user against the "main" agent on the command line. +""" + +import functools + +import numpy as np +import torch + +from ray.air.constants import TRAINING_ITERATION +from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig +from ray.rllib.core.rl_module.multi_rl_module import MultiRLModuleSpec +from ray.rllib.core.rl_module.rl_module import RLModuleSpec +from ray.rllib.env.utils import try_import_pyspiel, try_import_open_spiel +from ray.rllib.env.wrappers.open_spiel import OpenSpielEnv +from ray.rllib.examples.rl_modules.classes.random_rlm import RandomRLModule +from ray.rllib.examples.multi_agent.utils import ( + ask_user_for_action, + SelfPlayCallback, + SelfPlayCallbackOldAPIStack, +) +from ray.rllib.examples._old_api_stack.policy.random_policy import RandomPolicy +from ray.rllib.policy.policy import PolicySpec +from ray.rllib.utils.metrics import NUM_ENV_STEPS_SAMPLED_LIFETIME +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.tune.registry import get_trainable_cls, register_env + +open_spiel = try_import_open_spiel(error=True) +pyspiel = try_import_pyspiel(error=True) + +# Import after try_import_open_spiel, so we can error out with hints. +from open_spiel.python.rl_environment import Environment # noqa: E402 + + +parser = add_rllib_example_script_args(default_timesteps=2000000) +parser.set_defaults( + env="connect_four", + checkpoint_freq=1, + checkpoint_at_end=True, +) +parser.add_argument( + "--win-rate-threshold", + type=float, + default=0.95, + help="Win-rate at which we setup another opponent by freezing the " + "current main policy and playing against a uniform distribution " + "of previously frozen 'main's from here on.", +) +parser.add_argument( + "--min-league-size", + type=float, + default=3, + help="Minimum number of policies/RLModules to consider the test passed. 
" + "The initial league size is 2: `main` and `random`. " + "`--min-league-size=3` thus means that one new policy/RLModule has been " + "added so far (b/c the `main` one has reached the `--win-rate-threshold " + "against the `random` Policy/RLModule).", +) +parser.add_argument( + "--num-episodes-human-play", + type=int, + default=10, + help="How many episodes to play against the user on the command " + "line after training has finished.", +) +parser.add_argument( + "--from-checkpoint", + type=str, + default=None, + help="Full path to a checkpoint file for restoring a previously saved " + "Algorithm state.", +) + + +if __name__ == "__main__": + args = parser.parse_args() + + register_env("open_spiel_env", lambda _: OpenSpielEnv(pyspiel.load_game(args.env))) + + def agent_to_module_mapping_fn(agent_id, episode, **kwargs): + # agent_id = [0|1] -> module depends on episode ID + # This way, we make sure that both modules sometimes play agent0 + # (start player) and sometimes agent1 (player to move 2nd). + return "main" if hash(episode.id_) % 2 == agent_id else "random" + + def policy_mapping_fn(agent_id, episode, worker, **kwargs): + return "main" if episode.episode_id % 2 == agent_id else "random" + + config = ( + get_trainable_cls(args.algo) + .get_default_config() + .environment("open_spiel_env") + # Set up the main piece in this experiment: The league-bases self-play + # callback, which controls adding new policies/Modules to the league and + # properly matching the different policies in the league with each other. + .callbacks( + functools.partial( + ( + SelfPlayCallback + if args.enable_new_api_stack + else SelfPlayCallbackOldAPIStack + ), + win_rate_threshold=args.win_rate_threshold, + ) + ) + .env_runners( + num_env_runners=(args.num_env_runners or 2), + num_envs_per_env_runner=1 if args.enable_new_api_stack else 5, + ) + .multi_agent( + # Initial policy map: Random and default algo one. 
This will be expanded + # to more policy snapshots taken from "main" against which "main" + # will then play (instead of "random"). This is done in the + # custom callback defined above (`SelfPlayCallback`). + policies=( + { + # Our main policy, we'd like to optimize. + "main": PolicySpec(), + # An initial random opponent to play against. + "random": PolicySpec(policy_class=RandomPolicy), + } + if not args.enable_new_api_stack + else {"main", "random"} + ), + # Assign agent 0 and 1 randomly to the "main" policy or + # to the opponent ("random" at first). Make sure (via episode_id) + # that "main" always plays against "random" (and not against + # another "main"). + policy_mapping_fn=( + agent_to_module_mapping_fn + if args.enable_new_api_stack + else policy_mapping_fn + ), + # Always just train the "main" policy. + policies_to_train=["main"], + ) + .rl_module( + model_config=DefaultModelConfig(fcnet_hiddens=[512, 512]), + rl_module_spec=MultiRLModuleSpec( + rl_module_specs={ + "main": RLModuleSpec(), + "random": RLModuleSpec(module_class=RandomRLModule), + } + ), + ) + ) + + # Only for PPO, change the `num_epochs` setting. + if args.algo == "PPO": + config.training(num_epochs=20) + + stop = { + NUM_ENV_STEPS_SAMPLED_LIFETIME: args.stop_timesteps, + TRAINING_ITERATION: args.stop_iters, + "league_size": args.min_league_size, + } + + # Train the "main" policy to play really well using self-play. + results = None + if not args.from_checkpoint: + results = run_rllib_example_script_experiment( + config, args, stop=stop, keep_ray_up=True + ) + + # Restore trained Algorithm (set to non-explore behavior) and play against + # human on command line. 
+ if args.num_episodes_human_play > 0: + num_episodes = 0 + config.explore = False + algo = config.build() + if args.from_checkpoint: + algo.restore(args.from_checkpoint) + else: + checkpoint = results.get_best_result().checkpoint + if not checkpoint: + raise ValueError("No last checkpoint found in results!") + algo.restore(checkpoint) + + if args.enable_new_api_stack: + rl_module = algo.get_module("main") + + # Play from the command line against the trained agent + # in an actual (non-RLlib-wrapped) open-spiel env. + human_player = 1 + env = Environment(args.env) + + while num_episodes < args.num_episodes_human_play: + print("You play as {}".format("o" if human_player else "x")) + time_step = env.reset() + while not time_step.last(): + player_id = time_step.observations["current_player"] + if player_id == human_player: + action = ask_user_for_action(time_step) + else: + obs = np.array(time_step.observations["info_state"][player_id]) + if args.enable_new_api_stack: + action = np.argmax( + rl_module.forward_inference( + {"obs": torch.from_numpy(obs).unsqueeze(0).float()} + )["action_dist_inputs"][0].numpy() + ) + else: + action = algo.compute_single_action(obs, policy_id="main") + # In case computer chooses an invalid action, pick a + # random one. + legal = time_step.observations["legal_actions"][player_id] + if action not in legal: + action = np.random.choice(legal) + time_step = env.step([action]) + print(f"\n{env.get_state}") + + print(f"\n{env.get_state}") + + print("End of game!") + if time_step.rewards[human_player] > 0: + print("You win") + elif time_step.rewards[human_player] < 0: + print("You lose") + else: + print("Draw") + # Switch order of players. 
+ human_player = 1 - human_player + + num_episodes += 1 + + algo.stop() diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/two_step_game_with_grouped_agents.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/two_step_game_with_grouped_agents.py new file mode 100644 index 0000000000000000000000000000000000000000..0981eb2575f10137e9e93edbc4b19dd3f6e57898 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/two_step_game_with_grouped_agents.py @@ -0,0 +1,92 @@ +"""The two-step game from the QMIX paper: +https://arxiv.org/pdf/1803.11485.pdf + +See also: rllib/examples/centralized_critic.py for centralized critic PPO on this game. + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack --num-agents=2` + +Note that in this script, we use an multi-agent environment in which both +agents that normally play this game have been merged into one agent with ID +"agents" and observation- and action-spaces being 2-tupled (1 item for each +agent). The "agents" agent is mapped to the policy with ID "p0". + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +Which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. 
+ +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- +You should expect a reward of 8.0 (the max to reach in thie game) eventually +being achieved by a simple PPO policy (no tuning, just using RLlib's default settings): + ++---------------------------------+------------+-----------------+--------+ +| Trial name | status | loc | iter | +|---------------------------------+------------+-----------------+--------+ +| PPO_grouped_twostep_4354b_00000 | TERMINATED | 127.0.0.1:42602 | 20 | ++---------------------------------+------------+-----------------+--------+ + ++------------------+-------+-------------------+-------------+ +| total time (s) | ts | combined reward | reward p0 | ++------------------+-------+-------------------+-------------| +| 87.5756 | 80000 | 8 | 8 | ++------------------+-------+-------------------+-------------+ +""" + +from ray.rllib.connectors.env_to_module import FlattenObservations +from ray.rllib.core.rl_module.multi_rl_module import MultiRLModuleSpec +from ray.rllib.core.rl_module.rl_module import RLModuleSpec +from ray.rllib.examples.envs.classes.multi_agent.two_step_game import ( + TwoStepGameWithGroupedAgents, +) +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.tune.registry import register_env, get_trainable_cls + + +parser = add_rllib_example_script_args(default_reward=7.0) + + +if __name__ == "__main__": + args = parser.parse_args() + + assert args.num_agents == 2, "Must set --num-agents=2 when running this script!" + assert ( + args.enable_new_api_stack + ), "Must set --enable-new-api-stack when running this script!" 
+ + register_env( + "grouped_twostep", + lambda config: TwoStepGameWithGroupedAgents(config), + ) + + base_config = ( + get_trainable_cls(args.algo) + .get_default_config() + .environment("grouped_twostep") + .env_runners( + env_to_module_connector=lambda env: FlattenObservations(multi_agent=True), + ) + .multi_agent( + policies={"p0"}, + policy_mapping_fn=lambda aid, *a, **kw: "p0", + ) + .rl_module( + rl_module_spec=MultiRLModuleSpec( + rl_module_specs={ + "p0": RLModuleSpec(), + }, + ) + ) + ) + + run_rllib_example_script_experiment(base_config, args) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/utils/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d2f61ce378bce38dcc3a1a07b33f0df450829168 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/utils/__init__.py @@ -0,0 +1,43 @@ +import sys + +from ray.rllib.examples.multi_agent.utils.self_play_callback import SelfPlayCallback +from ray.rllib.examples.multi_agent.utils.self_play_league_based_callback import ( + SelfPlayLeagueBasedCallback, +) +from ray.rllib.examples.multi_agent.utils.self_play_callback_old_api_stack import ( + SelfPlayCallbackOldAPIStack, +) +from ray.rllib.examples.multi_agent.utils.self_play_league_based_callback_old_api_stack import ( # noqa + SelfPlayLeagueBasedCallbackOldAPIStack, +) + + +def ask_user_for_action(time_step): + """Asks the user for a valid action on the command line and returns it. + + Re-queries the user until she picks a valid one. + + Args: + time_step: The open spiel Environment time-step object. 
+ """ + pid = time_step.observations["current_player"] + legal_moves = time_step.observations["legal_actions"][pid] + choice = -1 + while choice not in legal_moves: + print("Choose an action from {}:".format(legal_moves)) + sys.stdout.flush() + choice_str = input() + try: + choice = int(choice_str) + except ValueError: + continue + return choice + + +__all__ = [ + "ask_user_for_action", + "SelfPlayCallback", + "SelfPlayLeagueBasedCallback", + "SelfPlayCallbackOldAPIStack", + "SelfPlayLeagueBasedCallbackOldAPIStack", +] diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/utils/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/utils/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3b06fa95f029bade71419733b6664cd6295c98ef Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/utils/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/utils/__pycache__/self_play_callback.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/utils/__pycache__/self_play_callback.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f53b11698ed8954795fea1f603eb5ede88264002 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/utils/__pycache__/self_play_callback.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/utils/__pycache__/self_play_callback_old_api_stack.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/utils/__pycache__/self_play_callback_old_api_stack.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2870188b6274b3146cddf4e95f275e21a42a8994 Binary files /dev/null and 
b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/utils/__pycache__/self_play_callback_old_api_stack.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/utils/__pycache__/self_play_league_based_callback.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/utils/__pycache__/self_play_league_based_callback.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e8232c1be80a137bde641a02f55fa0b9b89a64c7 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/utils/__pycache__/self_play_league_based_callback.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/utils/__pycache__/self_play_league_based_callback_old_api_stack.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/utils/__pycache__/self_play_league_based_callback_old_api_stack.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f8398c4ccc930f38594ccbd42ed4185c982848e1 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/utils/__pycache__/self_play_league_based_callback_old_api_stack.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/utils/self_play_callback.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/utils/self_play_callback.py new file mode 100644 index 0000000000000000000000000000000000000000..436c3c2d198266b1e6faa1d283ffc67e622154e6 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/utils/self_play_callback.py @@ -0,0 +1,96 @@ +from collections import defaultdict + +import numpy as np + +from ray.rllib.callbacks.callbacks import RLlibCallback +from ray.rllib.core.rl_module.rl_module import RLModuleSpec +from ray.rllib.utils.metrics import ENV_RUNNER_RESULTS + + +class 
SelfPlayCallback(RLlibCallback): + def __init__(self, win_rate_threshold): + super().__init__() + # 0=RandomPolicy, 1=1st main policy snapshot, + # 2=2nd main policy snapshot, etc.. + self.current_opponent = 0 + + self.win_rate_threshold = win_rate_threshold + + # Report the matchup counters (who played against whom?). + self._matching_stats = defaultdict(int) + + def on_episode_end( + self, + *, + episode, + env_runner, + metrics_logger, + env, + env_index, + rl_module, + **kwargs, + ) -> None: + # Compute the win rate for this episode and log it with a window of 100. + main_agent = 0 if episode.module_for(0) == "main" else 1 + rewards = episode.get_rewards() + if main_agent in rewards: + main_won = rewards[main_agent][-1] == 1.0 + metrics_logger.log_value( + "win_rate", + main_won, + window=100, + ) + + def on_train_result(self, *, algorithm, metrics_logger=None, result, **kwargs): + win_rate = result[ENV_RUNNER_RESULTS]["win_rate"] + print(f"Iter={algorithm.iteration} win-rate={win_rate} -> ", end="") + # If win rate is good -> Snapshot current policy and play against + # it next, keeping the snapshot fixed and only improving the "main" + # policy. + if win_rate > self.win_rate_threshold: + self.current_opponent += 1 + new_module_id = f"main_v{self.current_opponent}" + print(f"adding new opponent to the mix ({new_module_id}).") + + # Re-define the mapping function, such that "main" is forced + # to play against any of the previously played modules + # (excluding "random"). + def agent_to_module_mapping_fn(agent_id, episode, **kwargs): + # agent_id = [0|1] -> policy depends on episode ID + # This way, we make sure that both modules sometimes play + # (start player) and sometimes agent1 (player to move 2nd). 
+ opponent = "main_v{}".format( + np.random.choice(list(range(1, self.current_opponent + 1))) + ) + if hash(episode.id_) % 2 == agent_id: + self._matching_stats[("main", opponent)] += 1 + return "main" + else: + return opponent + + main_module = algorithm.get_module("main") + algorithm.add_module( + module_id=new_module_id, + module_spec=RLModuleSpec.from_module(main_module), + new_agent_to_module_mapping_fn=agent_to_module_mapping_fn, + ) + # TODO (sven): Maybe we should move this convenience step back into + # `Algorithm.add_module()`? Would be less explicit, but also easier. + algorithm.set_state( + { + "learner_group": { + "learner": { + "rl_module": { + new_module_id: main_module.get_state(), + } + } + } + } + ) + else: + print("not good enough; will keep learning ...") + + # +2 = main + random + result["league_size"] = self.current_opponent + 2 + + print(f"Matchups:\n{self._matching_stats}") diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/utils/self_play_callback_old_api_stack.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/utils/self_play_callback_old_api_stack.py new file mode 100644 index 0000000000000000000000000000000000000000..42b05b94501708de34f3653150e384b3c95f4818 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/utils/self_play_callback_old_api_stack.py @@ -0,0 +1,78 @@ +import numpy as np + +from ray.rllib.callbacks.callbacks import RLlibCallback +from ray.rllib.utils.deprecation import Deprecated +from ray.rllib.utils.metrics import ENV_RUNNER_RESULTS + + +@Deprecated(help="Use the example for the new RLlib API stack.", error=False) +class SelfPlayCallbackOldAPIStack(RLlibCallback): + def __init__(self, win_rate_threshold): + super().__init__() + # 0=RandomPolicy, 1=1st main policy snapshot, + # 2=2nd main policy snapshot, etc.. 
+ self.current_opponent = 0 + + self.win_rate_threshold = win_rate_threshold + + def on_train_result(self, *, algorithm, result, **kwargs): + # Get the win rate for the train batch. + # Note that normally, you should set up a proper evaluation config, + # such that evaluation always happens on the already updated policy, + # instead of on the already used train_batch. + main_rew = result[ENV_RUNNER_RESULTS]["hist_stats"].pop("policy_main_reward") + opponent_rew = list(result[ENV_RUNNER_RESULTS]["hist_stats"].values())[0] + assert len(main_rew) == len(opponent_rew) + won = 0 + for r_main, r_opponent in zip(main_rew, opponent_rew): + if r_main > r_opponent: + won += 1 + win_rate = won / len(main_rew) + result["win_rate"] = win_rate + print(f"Iter={algorithm.iteration} win-rate={win_rate} -> ", end="") + # If win rate is good -> Snapshot current policy and play against + # it next, keeping the snapshot fixed and only improving the "main" + # policy. + if win_rate > self.win_rate_threshold: + self.current_opponent += 1 + new_pol_id = f"main_v{self.current_opponent}" + print(f"adding new opponent to the mix ({new_pol_id}).") + + # Re-define the mapping function, such that "main" is forced + # to play against any of the previously played policies + # (excluding "random"). + def policy_mapping_fn(agent_id, episode, worker, **kwargs): + # agent_id = [0|1] -> policy depends on episode ID + # This way, we make sure that both policies sometimes play + # (start player) and sometimes agent1 (player to move 2nd). 
+ return ( + "main" + if episode.episode_id % 2 == agent_id + else "main_v{}".format( + np.random.choice(list(range(1, self.current_opponent + 1))) + ) + ) + + main_policy = algorithm.get_policy("main") + new_policy = algorithm.add_policy( + policy_id=new_pol_id, + policy_cls=type(main_policy), + policy_mapping_fn=policy_mapping_fn, + config=main_policy.config, + observation_space=main_policy.observation_space, + action_space=main_policy.action_space, + ) + + # Set the weights of the new policy to the main policy. + # We'll keep training the main policy, whereas `new_pol_id` will + # remain fixed. + main_state = main_policy.get_state() + new_policy.set_state(main_state) + # We need to sync the just copied local weights (from main policy) + # to all the remote workers as well. + algorithm.env_runner_group.sync_weights() + else: + print("not good enough; will keep learning ...") + + # +2 = main + random + result["league_size"] = self.current_opponent + 2 diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/utils/self_play_league_based_callback.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/utils/self_play_league_based_callback.py new file mode 100644 index 0000000000000000000000000000000000000000..0b46dc29c5aee5ccc93a036d54cf5b65c304046f --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/utils/self_play_league_based_callback.py @@ -0,0 +1,275 @@ +from collections import defaultdict +from pprint import pprint +import re + +import numpy as np + +from ray.rllib.callbacks.callbacks import RLlibCallback +from ray.rllib.core.rl_module.rl_module import RLModuleSpec +from ray.rllib.utils.metrics import ENV_RUNNER_RESULTS + + +class SelfPlayLeagueBasedCallback(RLlibCallback): + def __init__(self, win_rate_threshold): + super().__init__() + # All policies in the league. 
+ self.main_policies = {"main", "main_0"} + self.main_exploiters = {"main_exploiter_0", "main_exploiter_1"} + self.league_exploiters = {"league_exploiter_0", "league_exploiter_1"} + # Set of currently trainable policies in the league. + self.trainable_policies = {"main"} + # Set of currently non-trainable (frozen) policies in the league. + self.non_trainable_policies = { + "main_0", + "league_exploiter_0", + "main_exploiter_0", + } + # The win-rate value reaching of which leads to a new module being added + # to the leage (frozen copy of main). + self.win_rate_threshold = win_rate_threshold + # Store the win rates for league overview printouts. + self.win_rates = {} + + # Report the matchup counters (who played against whom?). + self._matching_stats = defaultdict(int) + + def on_episode_end( + self, + *, + episode, + env_runner, + metrics_logger, + env, + env_index, + rl_module, + **kwargs, + ) -> None: + num_learning_policies = ( + episode.module_for(0) in env_runner.config.policies_to_train + ) + (episode.module_for(1) in env_runner.config.policies_to_train) + # Make sure the mapping function doesn't match two non-trainables together. + # This would be a waste of EnvRunner resources. + # assert num_learning_policies > 0 + # Ignore matches between two learning policies and don't count win-rates for + # these. + assert num_learning_policies > 0, ( + f"agent=0 -> mod={episode.module_for(0)}; " + f"agent=1 -> mod={episode.module_for(1)}; " + f"EnvRunner.config.policies_to_train={env_runner.config.policies_to_train}" + ) + if num_learning_policies == 1: + # Compute the win rate for this episode (only looking at non-trained + # opponents, such as random or frozen policies) and log it with some window. 
+ rewards_dict = episode.get_rewards() + for aid, rewards in rewards_dict.items(): + mid = episode.module_for(aid) + won = rewards[-1] == 1.0 + metrics_logger.log_value( + f"win_rate_{mid}", + won, + window=100, + ) + + def on_train_result(self, *, algorithm, metrics_logger=None, result, **kwargs): + local_worker = algorithm.env_runner + + # Avoid `self` being pickled into the remote function below. + _trainable_policies = self.trainable_policies + + # Get the win rate for the train batch. + # Note that normally, one should set up a proper evaluation config, + # such that evaluation always happens on the already updated policy, + # instead of on the already used train_batch. + league_changed = False + keys = [ + k for k in result[ENV_RUNNER_RESULTS].keys() if k.startswith("win_rate_") + ] + for key in keys: + module_id = key[9:] + self.win_rates[module_id] = result[ENV_RUNNER_RESULTS][key] + + # Policy is frozen; ignore. + if module_id in self.non_trainable_policies: + continue + + print( + f"Iter={algorithm.iteration} {module_id}'s " + f"win-rate={self.win_rates[module_id]} -> ", + end="", + ) + + # If win rate is good -> Snapshot current policy and decide, + # whether to freeze the copy or not. + if self.win_rates[module_id] > self.win_rate_threshold: + is_main = re.match("^main(_\\d+)?$", module_id) + initializing_exploiters = False + + # First time, main manages a decent win-rate against random: + # Add league_exploiter_1 and main_exploiter_1 as trainables to the mix. + if is_main and len(self.trainable_policies) == 1: + initializing_exploiters = True + self.trainable_policies.add("league_exploiter_1") + self.trainable_policies.add("main_exploiter_1") + # If main manages to win (above threshold) against the entire league + # -> increase the league by another frozen copy of main, + # main-exploiters or league-exploiters. 
# NOTE(review): reconstructed from a whitespace-collapsed diff dump. This chunk
# is the tail of `SelfPlayLeagueBasedCallback.on_train_result` — the method's
# `def` line and the `if win_rate > self.win_rate_threshold:` /
# `if is_main and ...:` branch this `else:` pairs with lie above this chunk.
                else:
                    # Freeze the snapshot with p=0.7, keep it trainable with
                    # p=0.3; snapshots of `main` itself are always frozen.
                    keep_training = (
                        False
                        if is_main
                        else np.random.choice([True, False], p=[0.3, 0.7])
                    )
                    if module_id in self.main_policies:
                        # "(main)(_\d+)?$" also matches the bare id "main", so
                        # this produces main -> main_0, main_0 -> main_1, ...
                        # (minus 1 because the un-suffixed "main" is itself
                        # counted in `self.main_policies`).
                        new_mod_id = re.sub(
                            "(main)(_\\d+)?$",
                            f"\\1_{len(self.main_policies) - 1}",
                            module_id,
                        )
                        self.main_policies.add(new_mod_id)
                    elif module_id in self.main_exploiters:
                        new_mod_id = re.sub(
                            "_\\d+$", f"_{len(self.main_exploiters)}", module_id
                        )
                        self.main_exploiters.add(new_mod_id)
                    else:
                        new_mod_id = re.sub(
                            "_\\d+$", f"_{len(self.league_exploiters)}", module_id
                        )
                        self.league_exploiters.add(new_mod_id)

                    if keep_training:
                        self.trainable_policies.add(new_mod_id)
                    else:
                        self.non_trainable_policies.add(new_mod_id)

                    print(f"adding new opponents to the mix ({new_mod_id}).")

                # Update our mapping function accordingly.
                def agent_to_module_mapping_fn(agent_id, episode, **kwargs):
                    # Pick, whether this is ...
                    type_ = np.random.choice([1, 2])

                    # 1) League exploiter vs any other.
                    if type_ == 1:
                        league_exploiter = "league_exploiter_" + str(
                            np.random.choice(list(range(len(self.league_exploiters))))
                        )
                        # This league exploiter is frozen: Play against a
                        # trainable policy.
                        if league_exploiter not in self.trainable_policies:
                            opponent = np.random.choice(list(self.trainable_policies))
                        # League exploiter is trainable: Play against any other
                        # non-trainable policy.
                        else:
                            opponent = np.random.choice(
                                list(self.non_trainable_policies)
                            )

                        # Only record match stats once per match.
                        # NOTE(review): str hashes are salted per process
                        # (PYTHONHASHSEED), so this parity split is not stable
                        # across processes/restarts — confirm that is intended.
                        if hash(episode.id_) % 2 == agent_id:
                            self._matching_stats[(league_exploiter, opponent)] += 1
                            return league_exploiter
                        else:
                            return opponent

                    # 2) Main exploiter vs main.
                    else:
                        main_exploiter = "main_exploiter_" + str(
                            np.random.choice(list(range(len(self.main_exploiters))))
                        )
                        # Main exploiter is frozen: Play against the main
                        # policy.
                        if main_exploiter not in self.trainable_policies:
                            main = "main"
                        # Main exploiter is trainable: Play against any
                        # frozen main.
                        else:
                            # Assumes at least one frozen "main_N" snapshot
                            # exists (true once initialization added "main_0").
                            main = np.random.choice(list(self.main_policies - {"main"}))

                        # Only record match stats once per match.
                        if hash(episode.id_) % 2 == agent_id:
                            self._matching_stats[(main_exploiter, main)] += 1
                            return main_exploiter
                        else:
                            return main

                multi_rl_module = local_worker.module
                main_module = multi_rl_module["main"]

                # Set the weights of the new polic(y/ies).
                if initializing_exploiters:
                    main_state = main_module.get_state()
                    multi_rl_module["main_0"].set_state(main_state)
                    multi_rl_module["league_exploiter_1"].set_state(main_state)
                    multi_rl_module["main_exploiter_1"].set_state(main_state)
                    # We need to sync the just copied local weights to all the
                    # remote workers and remote Learner workers as well.
                    algorithm.env_runner_group.sync_weights(
                        policies=["main_0", "league_exploiter_1", "main_exploiter_1"]
                    )
                    algorithm.learner_group.set_weights(multi_rl_module.get_state())
                else:
                    # Clone the architecture of "main" under the new module id;
                    # the weights are copied over via `set_state` right below.
                    algorithm.add_module(
                        module_id=new_mod_id,
                        module_spec=RLModuleSpec.from_module(main_module),
                    )
                    # TODO (sven): Maybe we should move this convenience step back into
                    # `Algorithm.add_module()`? Would be less explicit, but also
                    # easier.
                    algorithm.set_state(
                        {
                            "learner_group": {
                                "learner": {
                                    "rl_module": {
                                        new_mod_id: multi_rl_module[
                                            module_id
                                        ].get_state(),
                                    }
                                }
                            }
                        }
                    )

                algorithm.env_runner_group.foreach_env_runner(
                    lambda env_runner: env_runner.config.multi_agent(
                        policy_mapping_fn=agent_to_module_mapping_fn,
                        # This setting doesn't really matter for EnvRunners (no
                        # training going on there, but we'll update this as well
                        # here for good measure).
                        policies_to_train=_trainable_policies,
                    ),
                    local_env_runner=True,
                )
                # Set all Learner workers' should_module_be_updated to the new
                # value.
                algorithm.learner_group.foreach_learner(
                    func=lambda learner: learner.config.multi_agent(
                        policies_to_train=_trainable_policies,
                    ),
                    timeout_seconds=0.0,  # fire-and-forget
                )
                league_changed = True
            else:
                print("not good enough; will keep learning ...")

        # Add current league size to results dict.
        result["league_size"] = len(self.non_trainable_policies) + len(
            self.trainable_policies
        )

        if league_changed:
            self._print_league()

    def _print_league(self):
        """Pretty-prints the current league: matchup counts and win-rates."""
        print("--- League ---")
        print("Matchups:")
        pprint(self._matching_stats)
        print("Trainable policies (win-rates):")
        for p in sorted(self.trainable_policies):
            wr = self.win_rates[p] if p in self.win_rates else 0.0
            print(f"\t{p}: {wr}")
        print("Frozen policies:")
        for p in sorted(self.non_trainable_policies):
            wr = self.win_rates[p] if p in self.win_rates else 0.0
            print(f"\t{p}: {wr}")
        print()
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/utils/self_play_league_based_callback_old_api_stack.py b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/utils/self_play_league_based_callback_old_api_stack.py
new file mode 100644
index 0000000000000000000000000000000000000000..dc39fa8fac9a7ba658aefdd2edeb4199097aaa2b
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/ray/rllib/examples/multi_agent/utils/self_play_league_based_callback_old_api_stack.py
@@ -0,0 +1,201 @@
import re

import numpy as np

from ray.rllib.callbacks.callbacks import RLlibCallback
from ray.rllib.utils.deprecation import Deprecated
from ray.rllib.utils.metrics import ENV_RUNNER_RESULTS


@Deprecated(help="Use the example for the new RLlib API stack", error=False)
class SelfPlayLeagueBasedCallbackOldAPIStack(RLlibCallback):
    def __init__(self, win_rate_threshold):
        super().__init__()
        # All policies in the league.
# NOTE(review): reconstructed from a whitespace-collapsed diff dump. The class
# header and `def __init__` line of SelfPlayLeagueBasedCallbackOldAPIStack lie
# just above this chunk.
        self.main_policies = {"main", "main_0"}
        self.main_exploiters = {"main_exploiter_0", "main_exploiter_1"}
        self.league_exploiters = {"league_exploiter_0", "league_exploiter_1"}
        # Set of currently trainable policies in the league.
        self.trainable_policies = {"main"}
        # Set of currently non-trainable (frozen) policies in the league.
        self.non_trainable_policies = {
            "main_0",
            "league_exploiter_0",
            "main_exploiter_0",
        }
        # Win-rate threshold; once a trainable policy's win-rate exceeds it, a
        # new (usually frozen) snapshot of that policy is added to the league.
        self.win_rate_threshold = win_rate_threshold
        # Store the win rates for league overview printouts.
        self.win_rates = {}

    def on_train_result(self, *, algorithm, result, **kwargs):
        """Checks all trainable policies' win-rates and grows the league.

        Any trainable policy whose win-rate on the last train batch exceeds
        `self.win_rate_threshold` gets snapshotted into the league (frozen
        with p=0.7 for exploiters; always frozen for main copies), and the
        policy-mapping function on all EnvRunners is replaced accordingly.
        """
        # Avoid `self` being pickled into the remote function below.
        _trainable_policies = self.trainable_policies

        # Get the win rate for the train batch.
        # Note that normally, you should set up a proper evaluation config,
        # such that evaluation always happens on the already updated policy,
        # instead of on the already used train_batch.
        for policy_id, rew in result[ENV_RUNNER_RESULTS]["hist_stats"].items():
            # hist_stats keys look like "policy_<id>_reward"; skip all others.
            mo = re.match("^policy_(.+)_reward$", policy_id)
            if mo is None:
                continue
            policy_id = mo.group(1)

            # Calculate this policy's win rate.
            won = 0
            for r in rew:
                if r > 0.0:  # win = 1.0; loss = -1.0
                    won += 1
            # NOTE(review): raises ZeroDivisionError when `rew` is empty —
            # presumably hist_stats always holds >= 1 entry; confirm.
            win_rate = won / len(rew)
            self.win_rates[policy_id] = win_rate

            # Policy is frozen; ignore.
            if policy_id in self.non_trainable_policies:
                continue

            print(
                f"Iter={algorithm.iteration} {policy_id}'s " f"win-rate={win_rate} -> ",
                end="",
            )

            # If win rate is good -> Snapshot current policy and decide,
            # whether to freeze the copy or not.
            if win_rate > self.win_rate_threshold:
                is_main = re.match("^main(_\\d+)?$", policy_id)
                initializing_exploiters = False

                # First time, main manages a decent win-rate against random:
                # Add league_exploiter_0 and main_exploiter_0 to the mix.
                if is_main and len(self.trainable_policies) == 1:
                    initializing_exploiters = True
                    self.trainable_policies.add("league_exploiter_0")
                    self.trainable_policies.add("main_exploiter_0")
                else:
                    # Freeze the snapshot with p=0.7, keep it trainable with
                    # p=0.3; snapshots of `main` itself are always frozen.
                    keep_training = (
                        False
                        if is_main
                        else np.random.choice([True, False], p=[0.3, 0.7])
                    )
                    if policy_id in self.main_policies:
                        # NOTE(review): "_\d+$" does not match the bare id
                        # "main", so for policy_id == "main" this re.sub
                        # returns "main" unchanged (no fresh snapshot id; the
                        # adds below then mutate "main"'s own league status).
                        # The new-API-stack sibling uses "(main)(_\d+)?$"
                        # instead — confirm whether that fix should be ported.
                        new_pol_id = re.sub(
                            "_\\d+$", f"_{len(self.main_policies) - 1}", policy_id
                        )
                        self.main_policies.add(new_pol_id)
                    elif policy_id in self.main_exploiters:
                        new_pol_id = re.sub(
                            "_\\d+$", f"_{len(self.main_exploiters)}", policy_id
                        )
                        self.main_exploiters.add(new_pol_id)
                    else:
                        new_pol_id = re.sub(
                            "_\\d+$", f"_{len(self.league_exploiters)}", policy_id
                        )
                        self.league_exploiters.add(new_pol_id)

                    if keep_training:
                        self.trainable_policies.add(new_pol_id)
                    else:
                        self.non_trainable_policies.add(new_pol_id)

                    print(f"adding new opponents to the mix ({new_pol_id}).")

                # Update our mapping function accordingly.
                def policy_mapping_fn(agent_id, episode, worker=None, **kwargs):
                    # Pick, whether this is ...
                    type_ = np.random.choice([1, 2])

                    # 1) League exploiter vs any other.
                    if type_ == 1:
                        league_exploiter = "league_exploiter_" + str(
                            np.random.choice(list(range(len(self.league_exploiters))))
                        )
                        # This league exploiter is frozen: Play against a
                        # trainable policy.
                        if league_exploiter not in self.trainable_policies:
                            opponent = np.random.choice(list(self.trainable_policies))
                        # League exploiter is trainable: Play against any other
                        # non-trainable policy.
                        else:
                            opponent = np.random.choice(
                                list(self.non_trainable_policies)
                            )
                        print(f"{league_exploiter} vs {opponent}")
                        # Episode-id parity decides which agent becomes the
                        # exploiter (assumes two agents with int ids 0/1 —
                        # TODO confirm).
                        return (
                            league_exploiter
                            if episode.episode_id % 2 == agent_id
                            else opponent
                        )

                    # 2) Main exploiter vs main.
                    else:
                        main_exploiter = "main_exploiter_" + str(
                            np.random.choice(list(range(len(self.main_exploiters))))
                        )
                        # Main exploiter is frozen: Play against the main
                        # policy.
                        if main_exploiter not in self.trainable_policies:
                            main = "main"
                        # Main exploiter is trainable: Play against any
                        # frozen main.
                        else:
                            # Assumes at least one frozen "main_N" snapshot
                            # exists (true once "main_0" was added).
                            main = np.random.choice(list(self.main_policies - {"main"}))
                        # print(f"{main_exploiter} vs {main}")
                        return (
                            main_exploiter
                            if episode.episode_id % 2 == agent_id
                            else main
                        )

                # Set the weights of the new polic(y/ies).
                if initializing_exploiters:
                    main_state = algorithm.get_policy("main").get_state()
                    pol_map = algorithm.env_runner.policy_map
                    pol_map["main_0"].set_state(main_state)
                    pol_map["league_exploiter_1"].set_state(main_state)
                    pol_map["main_exploiter_1"].set_state(main_state)
                    # We need to sync the just copied local weights to all the
                    # remote workers as well.
                    algorithm.env_runner_group.sync_weights(
                        policies=["main_0", "league_exploiter_1", "main_exploiter_1"]
                    )

                    # Push the new mapping fn and trainable set to a worker.
                    def _set(worker):
                        worker.set_policy_mapping_fn(policy_mapping_fn)
                        worker.set_is_policy_to_train(_trainable_policies)

                    algorithm.env_runner_group.foreach_env_runner(_set)
                else:
                    # Clone the over-threshold policy under the new id and
                    # copy its weights via `set_state` right below.
                    base_pol = algorithm.get_policy(policy_id)
                    new_policy = algorithm.add_policy(
                        policy_id=new_pol_id,
                        policy_cls=type(base_pol),
                        policy_mapping_fn=policy_mapping_fn,
                        policies_to_train=self.trainable_policies,
                        config=base_pol.config,
                        observation_space=base_pol.observation_space,
                        action_space=base_pol.action_space,
                    )
                    main_state = base_pol.get_state()
                    new_policy.set_state(main_state)
                    # We need to sync the just copied local weights to all the
                    # remote workers as well.
                    algorithm.env_runner_group.sync_weights(policies=[new_pol_id])

                self._print_league()

            else:
                print("not good enough; will keep learning ...")

    def _print_league(self):
        """Pretty-prints the current league and all known win-rates."""
        print("--- League ---")
        print("Trainable policies (win-rates):")
        for p in sorted(self.trainable_policies):
            wr = self.win_rates[p] if p in self.win_rates else 0.0
            print(f"\t{p}: {wr}")
        print("Frozen policies:")
        for p in sorted(self.non_trainable_policies):
            wr = self.win_rates[p] if p in self.win_rates else 0.0
            print(f"\t{p}: {wr}")
        print()
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/classes/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/classes/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..084ee7f6c3197a12922b22b23d82fa304eac2f65
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/examples/offline_rl/classes/__pycache__/__init__.cpython-311.pyc differ
diff --git a/llm_tutorial/llm_recipes/models/hf-model-eval/llm-jp-v3-3.7b_ja-en2en-ja_3M-pairs_3.5e-5/iter_0000698/model-00004-of-00004.safetensors b/llm_tutorial/llm_recipes/models/hf-model-eval/llm-jp-v3-3.7b_ja-en2en-ja_3M-pairs_3.5e-5/iter_0000698/model-00004-of-00004.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..430b574cafc8878c463b93236d0d80cf97902bd6
--- /dev/null
+++ b/llm_tutorial/llm_recipes/models/hf-model-eval/llm-jp-v3-3.7b_ja-en2en-ja_3M-pairs_3.5e-5/iter_0000698/model-00004-of-00004.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7115b612a01a71d23727b49a7377bafa5f567802b8024bdd5f5a227af192a3ab
+size 1223688320